From bb97d802086b4009a55fcf5dcae8af490f95ff67 Mon Sep 17 00:00:00 2001 From: Montana Low Date: Tue, 29 Aug 2023 17:45:46 -0700 Subject: [PATCH] organize python related modules --- pgml-extension/requirements.txt | 1 + pgml-extension/src/api.rs | 70 +++----------- .../src/bindings/{ => langchain}/langchain.py | 0 .../{langchain.rs => langchain/mod.rs} | 4 +- pgml-extension/src/bindings/mod.rs | 4 +- pgml-extension/src/bindings/python/mod.rs | 91 +++++++++++++++++++ .../bindings/{venv.py => python/python.py} | 0 .../bindings/{sklearn.rs => sklearn/mod.rs} | 17 ++-- .../src/bindings/{ => sklearn}/sklearn.py | 0 .../src/bindings/transformers/mod.rs | 12 +-- .../src/bindings/transformers/transformers.py | 2 + pgml-extension/src/bindings/venv.rs | 40 -------- pgml-extension/src/orm/model.rs | 2 +- pgml-extension/src/orm/task.rs | 4 +- 14 files changed, 127 insertions(+), 120 deletions(-) rename pgml-extension/src/bindings/{ => langchain}/langchain.py (100%) rename pgml-extension/src/bindings/{langchain.rs => langchain/mod.rs} (87%) create mode 100644 pgml-extension/src/bindings/python/mod.rs rename pgml-extension/src/bindings/{venv.py => python/python.py} (100%) rename pgml-extension/src/bindings/{sklearn.rs => sklearn/mod.rs} (97%) rename pgml-extension/src/bindings/{ => sklearn}/sklearn.py (100%) delete mode 100644 pgml-extension/src/bindings/venv.rs diff --git a/pgml-extension/requirements.txt b/pgml-extension/requirements.txt index 3fdfeb4b7..db0c5d242 100644 --- a/pgml-extension/requirements.txt +++ b/pgml-extension/requirements.txt @@ -17,6 +17,7 @@ sacremoses==0.0.53 scikit-learn==1.3.0 sentencepiece==0.1.99 sentence-transformers==2.2.2 +tokenizers==0.13.3 torch==2.0.1 torchaudio==2.0.2 torchvision==0.15.2 diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs index b560ee9d4..e1339d948 100644 --- a/pgml-extension/src/api.rs +++ b/pgml-extension/src/api.rs @@ -6,11 +6,9 @@ use pgrx::iter::{SetOfIterator, TableIterator}; use pgrx::*; #[cfg(feature = "python")] -use pyo3::prelude::*; use serde_json::json; #[cfg(feature = "python")] -use crate::bindings::sklearn::package_version; use crate::orm::*; macro_rules! unwrap_or_error { @@ -25,38 +23,13 @@ macro_rules! unwrap_or_error { #[cfg(feature = "python")] #[pg_extern] pub fn activate_venv(venv: &str) -> bool { - unwrap_or_error!(crate::bindings::venv::activate_venv(venv)) + unwrap_or_error!(crate::bindings::python::activate_venv(venv)) } #[cfg(feature = "python")] #[pg_extern(immutable, parallel_safe)] pub fn validate_python_dependencies() -> bool { - unwrap_or_error!(crate::bindings::venv::activate()); - - Python::with_gil(|py| { - let sys = PyModule::import(py, "sys").unwrap(); - let version: String = sys.getattr("version").unwrap().extract().unwrap(); - info!("Python version: {version}"); - for module in ["xgboost", "lightgbm", "numpy", "sklearn"] { - match py.import(module) { - Ok(_) => (), - Err(e) => { - panic!( - "The {module} package is missing. Install it with `sudo pip3 install {module}`\n{e}" - ); - } - } - } - }); - - let sklearn = unwrap_or_error!(package_version("sklearn")); - let xgboost = unwrap_or_error!(package_version("xgboost")); - let lightgbm = unwrap_or_error!(package_version("lightgbm")); - let numpy = unwrap_or_error!(package_version("numpy")); - - info!("Scikit-learn {sklearn}, XGBoost {xgboost}, LightGBM {lightgbm}, NumPy {numpy}",); - - true + unwrap_or_error!(crate::bindings::python::validate_dependencies()) } #[cfg(not(feature = "python"))] @@ -66,8 +39,7 @@ pub fn validate_python_dependencies() {} #[cfg(feature = "python")] #[pg_extern] pub fn python_package_version(name: &str) -> String { - unwrap_or_error!(crate::bindings::venv::activate()); - unwrap_or_error!(package_version(name)) + unwrap_or_error!(crate::bindings::python::package_version(name)) } #[cfg(not(feature = "python"))] @@ -79,13 +51,19 @@ pub fn python_package_version(name: &str) { #[cfg(feature = "python")] #[pg_extern] pub fn python_pip_freeze() -> TableIterator<'static, (name!(package, String),)> { - unwrap_or_error!(crate::bindings::venv::activate()); + unwrap_or_error!(crate::bindings::python::pip_freeze()) +} - let packages = unwrap_or_error!(crate::bindings::venv::freeze()) - .into_iter() - .map(|package| (package,)); +#[cfg(feature = "python")] +#[pg_extern] +fn python_version() -> String { + unwrap_or_error!(crate::bindings::python::version()) +} - TableIterator::new(packages) +#[cfg(not(feature = "python"))] +#[pg_extern] +pub fn python_version() -> String { + String::from("Python is not installed, recompile with `--features python`") } #[pg_extern] @@ -104,26 +82,6 @@ pub fn validate_shared_library() { } } -#[cfg(feature = "python")] -#[pg_extern] -fn python_version() -> String { - unwrap_or_error!(crate::bindings::venv::activate()); - let mut version = String::new(); - - Python::with_gil(|py| { - let sys = PyModule::import(py, "sys").unwrap(); - version = sys.getattr("version").unwrap().extract().unwrap(); - }); - - version -} - -#[cfg(not(feature = "python"))] -#[pg_extern] -pub fn python_version() -> String { - String::from("Python is not installed, recompile with `--features python`") -} - #[pg_extern(immutable, parallel_safe)] fn version() -> String { crate::VERSION.to_string() diff --git a/pgml-extension/src/bindings/langchain.py b/pgml-extension/src/bindings/langchain/langchain.py similarity index 100% rename from pgml-extension/src/bindings/langchain.py rename to pgml-extension/src/bindings/langchain/langchain.py diff --git a/pgml-extension/src/bindings/langchain.rs b/pgml-extension/src/bindings/langchain/mod.rs similarity index 87% rename from pgml-extension/src/bindings/langchain.rs rename to pgml-extension/src/bindings/langchain/mod.rs index 7ccaab954..00ee593fd 100644 --- a/pgml-extension/src/bindings/langchain.rs +++ b/pgml-extension/src/bindings/langchain/mod.rs @@ -6,10 +6,10 @@ use pyo3::types::PyTuple; use crate::{bindings::TracebackError, create_pymodule}; -create_pymodule!("/src/bindings/langchain.py"); +create_pymodule!("/src/bindings/langchain/langchain.py"); pub fn chunk(splitter: &str, text: &str, kwargs: &serde_json::Value) -> Result> { - crate::bindings::venv::activate()?; + crate::bindings::python::activate()?; let kwargs = serde_json::to_string(kwargs).unwrap(); diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs index 94a7668be..2da42b10f 100644 --- a/pgml-extension/src/bindings/mod.rs +++ b/pgml-extension/src/bindings/mod.rs @@ -38,11 +38,11 @@ pub mod langchain; pub mod lightgbm; pub mod linfa; #[cfg(feature = "python")] +pub mod python; +#[cfg(feature = "python")] pub mod sklearn; #[cfg(feature = "python")] pub mod transformers; -#[cfg(feature = "python")] -pub mod venv; pub mod xgboost; pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Result>; diff --git a/pgml-extension/src/bindings/python/mod.rs b/pgml-extension/src/bindings/python/mod.rs new file mode 100644 index 000000000..7f527b0fc --- /dev/null +++ b/pgml-extension/src/bindings/python/mod.rs @@ -0,0 +1,91 @@ +//! Use virtualenv. + +use anyhow::Result; +use once_cell::sync::Lazy; +use pgrx::iter::TableIterator; +use pgrx::*; +use pyo3::prelude::*; +use pyo3::types::PyTuple; + +use crate::config::get_config; +use crate::{bindings::TracebackError, create_pymodule}; + +static CONFIG_NAME: &str = "pgml.venv"; + +create_pymodule!("/src/bindings/python/python.py"); + +pub fn activate_venv(venv: &str) -> Result { + Python::with_gil(|py| { + let activate_venv: Py = get_module!(PY_MODULE).getattr(py, "activate_venv")?; + let result: Py = + activate_venv.call1(py, PyTuple::new(py, &[venv.to_string().into_py(py)]))?; + + Ok(result.extract(py)?) + }) +} + +pub fn activate() -> Result { + match get_config(CONFIG_NAME) { + Some(venv) => activate_venv(&venv), + None => Ok(false), + } +} + +pub fn pip_freeze() -> Result> { + activate()?; + let packages = Python::with_gil(|py| -> Result> { + let freeze = get_module!(PY_MODULE).getattr(py, "freeze")?; + let result = freeze.call0(py)?; + + Ok(result.extract(py)?) + })?; + + Ok(TableIterator::new( + packages.into_iter().map(|package| (package,)), + )) +} + +pub fn validate_dependencies() -> Result { + activate()?; + Python::with_gil(|py| { + let sys = PyModule::import(py, "sys").unwrap(); + let version: String = sys.getattr("version").unwrap().extract().unwrap(); + info!("Python version: {version}"); + for module in ["xgboost", "lightgbm", "numpy", "sklearn"] { + match py.import(module) { + Ok(_) => (), + Err(e) => { + panic!( + "The {module} package is missing. Install it with `sudo pip3 install {module}`\n{e}" + ); + } + } + } + }); + + let sklearn = package_version("sklearn")?; + let xgboost = package_version("xgboost")?; + let lightgbm = package_version("lightgbm")?; + let numpy = package_version("numpy")?; + + info!("Scikit-learn {sklearn}, XGBoost {xgboost}, LightGBM {lightgbm}, NumPy {numpy}",); + + Ok(true) +} + +pub fn version() -> Result { + activate()?; + Python::with_gil(|py| { + let sys = PyModule::import(py, "sys").unwrap(); + let version: String = sys.getattr("version").unwrap().extract().unwrap(); + Ok(version) + }) +} + +pub fn package_version(name: &str) -> Result { + activate()?; + Python::with_gil(|py| { + let package = py.import(name)?; + Ok(package.getattr("__version__")?.extract()?) + }) +} diff --git a/pgml-extension/src/bindings/venv.py b/pgml-extension/src/bindings/python/python.py similarity index 100% rename from pgml-extension/src/bindings/venv.py rename to pgml-extension/src/bindings/python/python.py diff --git a/pgml-extension/src/bindings/sklearn.rs b/pgml-extension/src/bindings/sklearn/mod.rs similarity index 97% rename from pgml-extension/src/bindings/sklearn.rs rename to pgml-extension/src/bindings/sklearn/mod.rs index 99e9cfe78..05e85d97c 100644 --- a/pgml-extension/src/bindings/sklearn.rs +++ b/pgml-extension/src/bindings/sklearn/mod.rs @@ -15,11 +15,13 @@ use once_cell::sync::Lazy; use pyo3::prelude::*; use pyo3::types::PyTuple; -use crate::bindings::Bindings; +use crate::{ + bindings::{Bindings, TracebackError}, + create_pymodule, + orm::*, +}; -use crate::{bindings::TracebackError, create_pymodule, orm::*}; - -create_pymodule!("/src/bindings/sklearn.py"); +create_pymodule!("/src/bindings/sklearn/sklearn.py"); macro_rules! wrap_fit { ($fn_name:tt, $task:literal) => { @@ -355,10 +357,3 @@ pub fn cluster_metrics( Ok(scores) }) } - -pub fn package_version(name: &str) -> Result { - Python::with_gil(|py| { - let package = py.import(name)?; - Ok(package.getattr("__version__")?.extract()?) - }) -} diff --git a/pgml-extension/src/bindings/sklearn.py b/pgml-extension/src/bindings/sklearn/sklearn.py similarity index 100% rename from pgml-extension/src/bindings/sklearn.py rename to pgml-extension/src/bindings/sklearn/sklearn.py diff --git a/pgml-extension/src/bindings/transformers/mod.rs b/pgml-extension/src/bindings/transformers/mod.rs index 7621a2b3e..91158f860 100644 --- a/pgml-extension/src/bindings/transformers/mod.rs +++ b/pgml-extension/src/bindings/transformers/mod.rs @@ -24,7 +24,7 @@ pub fn transform( args: &serde_json::Value, inputs: Vec<&str>, ) -> Result { - crate::bindings::venv::activate()?; + crate::bindings::python::activate()?; whitelist::verify_task(task)?; @@ -70,7 +70,7 @@ pub fn embed( inputs: Vec<&str>, kwargs: &serde_json::Value, ) -> Result>> { - crate::bindings::venv::activate()?; + crate::bindings::python::activate()?; let kwargs = serde_json::to_string(kwargs)?; Python::with_gil(|py| -> Result>> { @@ -101,7 +101,7 @@ pub fn tune( hyperparams: &JsonB, path: &Path, ) -> Result> { - crate::bindings::venv::activate()?; + crate::bindings::python::activate()?; let task = task.to_string(); let hyperparams = serde_json::to_string(&hyperparams.0)?; @@ -131,7 +131,7 @@ pub fn tune( } pub fn generate(model_id: i64, inputs: Vec<&str>, config: JsonB) -> Result> { - crate::bindings::venv::activate()?; + crate::bindings::python::activate()?; Python::with_gil(|py| -> Result> { let generate = get_module!(PY_MODULE) @@ -219,7 +219,7 @@ pub fn load_dataset( limit: Option, kwargs: &serde_json::Value, ) -> Result { - crate::bindings::venv::activate()?; + crate::bindings::python::activate()?; let kwargs = serde_json::to_string(kwargs)?; @@ -376,7 +376,7 @@ pub fn load_dataset( } pub fn clear_gpu_cache(memory_usage: Option) -> Result { - crate::bindings::venv::activate().unwrap(); + crate::bindings::python::activate().unwrap(); Python::with_gil(|py| -> Result { let clear_gpu_cache: Py = get_module!(PY_MODULE) diff --git a/pgml-extension/src/bindings/transformers/transformers.py b/pgml-extension/src/bindings/transformers/transformers.py index 0359085f5..af948e9ef 100644 --- a/pgml-extension/src/bindings/transformers/transformers.py +++ b/pgml-extension/src/bindings/transformers/transformers.py @@ -34,6 +34,8 @@ DataCollatorWithPadding, DefaultDataCollator, GenerationConfig, + PegasusForConditionalGeneration, + PegasusTokenizer, TrainingArguments, Trainer, ) diff --git a/pgml-extension/src/bindings/venv.rs b/pgml-extension/src/bindings/venv.rs deleted file mode 100644 index 458803a08..000000000 --- a/pgml-extension/src/bindings/venv.rs +++ /dev/null @@ -1,40 +0,0 @@ -//! Use virtualenv. - -use anyhow::Result; -use once_cell::sync::Lazy; -use pgrx::*; -use pyo3::prelude::*; -use pyo3::types::PyTuple; - -use crate::config::get_config; -use crate::{bindings::TracebackError, create_pymodule}; - -static CONFIG_NAME: &str = "pgml.venv"; - -create_pymodule!("/src/bindings/venv.py"); - -pub fn activate_venv(venv: &str) -> Result { - Python::with_gil(|py| { - let activate_venv: Py = get_module!(PY_MODULE).getattr(py, "activate_venv")?; - let result: Py = - activate_venv.call1(py, PyTuple::new(py, &[venv.to_string().into_py(py)]))?; - - Ok(result.extract(py)?) - }) -} - -pub fn activate() -> Result { - match get_config(CONFIG_NAME) { - Some(venv) => activate_venv(&venv), - None => Ok(false), - } -} - -pub fn freeze() -> Result> { - Python::with_gil(|py| { - let freeze = get_module!(PY_MODULE).getattr(py, "freeze")?; - let result = freeze.call0(py)?; - - Ok(result.extract(py)?) - }) -} diff --git a/pgml-extension/src/orm/model.rs b/pgml-extension/src/orm/model.rs index 46ef34821..d847d934d 100644 --- a/pgml-extension/src/orm/model.rs +++ b/pgml-extension/src/orm/model.rs @@ -89,7 +89,7 @@ impl Model { }; if runtime == Runtime::python { - crate::bindings::venv::activate().unwrap(); + crate::bindings::python::activate().unwrap(); } let dataset = snapshot.tabular_dataset(); diff --git a/pgml-extension/src/orm/task.rs b/pgml-extension/src/orm/task.rs index a5b47ea88..f0fe6b02f 100644 --- a/pgml-extension/src/orm/task.rs +++ b/pgml-extension/src/orm/task.rs @@ -48,7 +48,7 @@ impl Task { Task::text_generation => "perplexity", Task::text2text => "perplexity", Task::cluster => "silhouette", - Task::embedding => error!("No default target metric for embedding task") + Task::embedding => error!("No default target metric for embedding task"), } .to_string() } @@ -64,7 +64,7 @@ impl Task { Task::text_generation => false, Task::text2text => false, Task::cluster => true, - Task::embedding => error!("No default target metric positive for embedding task") + Task::embedding => error!("No default target metric positive for embedding task"), } }