text pair classification task support

postgresml · santiatpml · Mar 26, 2024 · Jan 31, 2024 · Feb 1, 2024 · Feb 6, 2024
commit 82cb4f7af4706c4f06a694305ca5a9d83bcd60a3
diff --git a/pgml-extension/src/bindings/transformers/mod.rs b/pgml-extension/src/bindings/transformers/mod.rs
@@ -10,7 +10,7 @@ use pyo3::types::PyTuple;
 use serde_json::Value;
 
 use crate::create_pymodule;
-use crate::orm::{Task, TextClassificationDataset};
+use crate::orm::{Task, TextClassificationDataset, TextPairClassificationDataset};
 
 use super::TracebackError;
 
@@ -106,6 +106,35 @@ pub fn finetune_text_classification(task: &Task, dataset: TextClassificationData
         output.extract(py).format_traceback(py)
     })
 }
+
+pub fn finetune_text_pair_classification(task: &Task, dataset: TextPairClassificationDataset, hyperparams: &JsonB, path: &Path) -> Result<HashMap<String, f64>> { // Calls the Python `finetune_text_pair_classification` and returns the map it produces; `dataset` is consumed.
+    let task = task.to_string(); // the task is handed to Python in its string form
+    let hyperparams = serde_json::to_string(&hyperparams.0)?; // hyperparameters cross the boundary as JSON text; a serialization error returns early
+
+    Python::with_gil(|py| -> Result<HashMap<String, f64>> { // the closure body runs while holding the Python GIL
+        let tune = get_module!(PY_MODULE).getattr(py, "finetune_text_pair_classification").format_traceback(py)?; // look the Python function up by name
+        let path = path.to_string_lossy(); // lossy conversion: invalid UTF-8 in the path is replaced with U+FFFD
+        let output = tune
+            .call1(
+                py,
+                ( // positional tuple: the order must match the Python function's parameter order
+                    &task,
+                    &hyperparams,
+                    path.as_ref(),
+                    dataset.text1_train, // columns are moved out of `dataset`; each column is passed as train, then test
+                    dataset.text1_test,
+                    dataset.text2_train,
+                    dataset.text2_test,
+                    dataset.class_train,
+                    dataset.class_test,
+                ),
+            )
+            .format_traceback(py)?; // NOTE(review): presumably converts a Python exception into a Rust error carrying the traceback; confirm in TracebackError
+
+        output.extract(py).format_traceback(py) // convert the Python return value into HashMap<String, f64>
+    })
+}
+
 pub fn generate(model_id: i64, inputs: Vec<&str>, config: JsonB) -> Result<Vec<String>> {
     Python::with_gil(|py| -> Result<Vec<String>> {
         let generate = get_module!(PY_MODULE).getattr(py, "generate").format_traceback(py)?;

diff --git a/pgml-extension/src/bindings/transformers/transformers.py b/pgml-extension/src/bindings/transformers/transformers.py
@@ -1084,6 +1084,96 @@ def tokenize_function(example):
         log.error(e)
     log.info("Training started")
 
+    # Train
+    trainer.train()
+    metrics = {"loss" : 0.0}
+    return metrics
+
+def finetune_text_pair_classification(task, hyperparams, path, text1_train, text1_test, text2_train, text2_test, class_train, class_test):
+    # Get model and tokenizer
+    hyperparams = orjson.loads(hyperparams)
+    model_name = hyperparams.pop("model_name")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    classes = list(set(class_train))
+    num_classes = len(classes)
+
+    id2label = {}
+    label2id = {}
+    for id, label in enumerate(classes):
+        label2id[label] = id
+        id2label[id] = label
+
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_name, num_labels=num_classes, id2label=id2label, label2id=label2id
+    )
+
+    model.config.id2label = id2label
+    model.config.label2id = label2id
+
+    y_train_label = [label2id[_class] for _class in class_train]
+    y_test_label = [label2id[_class] for _class in class_test]
+
+    # Prepare dataset
+    train_dataset = datasets.Dataset.from_dict(
+        {
+            "text1": text1_train,
+            "text2" : text2_train,
+            "label": y_train_label,
+        }
+    )
+    test_dataset = datasets.Dataset.from_dict(
+        {
+            "text1": text1_test,
+            "text2": text2_test,
+            "label": y_test_label,
+        }
+    )
+    # tokenization function
+    def tokenize_function(example):
+        tokenized_example = tokenizer(
+            example["text1"],
+            example["text2"],
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        )
+        return tokenized_example
+
+    # Generate tokens
+    train_tokenized_datasets = train_dataset.map(tokenize_function, batched=True)
+    test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
+    log.info("Tokenization done")
+    log.info("Train dataset")
+    log.info(train_tokenized_datasets[0:2])
+    log.info("Test dataset")
+    log.info(test_tokenized_datasets[0:2])
+    # Data collator
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+    # Training Args
+    log.info("Training args setup started path=%s"%path)
+    training_args=TrainingArguments(output_dir="/tmp/postgresml/models/", logging_dir="/tmp/postgresml/runs", **hyperparams["training_args"])
+    log.info("Trainer setup done")
+    # Trainer
+    log.info(model)
+    log.info(training_args)
+    log.info(train_tokenized_datasets)
+    log.info(test_tokenized_datasets)
+    log.info(tokenizer)
+    log.info(data_collator)
+    try:
+        trainer = Trainer(
+            model=model.to("cpu"),
+            args=training_args,
+            train_dataset=train_tokenized_datasets,
+            eval_dataset=test_tokenized_datasets,
+            tokenizer=tokenizer,
+            data_collator=data_collator,
+        )
+    except Exception as e:
+        log.error(e)
+    log.info("Training started")
+
     # Train
     trainer.train()
     metrics = {"loss" : 0.0}

diff --git a/pgml-extension/src/orm/dataset.rs b/pgml-extension/src/orm/dataset.rs
@@ -68,6 +68,20 @@ impl Dataset {
     }
 }
 
+pub enum TextDatasetType { // Wraps whichever text fine-tuning dataset was built for the project's task.
+    TextClassification(TextClassificationDataset), // built when the task is Task::text_classification
+    TextPairClassification(TextPairClassificationDataset), // built when the task is Task::text_pair_classification
+}
+
+impl TextDatasetType {
+    pub fn num_features(&self) -> usize { // Returns the `num_features` field of whichever dataset this variant holds.
+        match self { // no wildcard arm, so adding a variant forces this match to be updated
+            TextDatasetType::TextClassification(dataset) => dataset.num_features,
+            TextDatasetType::TextPairClassification(dataset) => dataset.num_features,
+        }
+    }
+}
+
 // TextClassificationDataset
 pub struct TextClassificationDataset {
     pub text_train: Vec<String>,
@@ -86,24 +100,38 @@ impl Display for TextClassificationDataset {
     fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
         write!(
             f,
-            "TextClassificationDataset {{ num_features: {}, num_labels: {}, num_distinct_labels: {}, num_rows: {}, num_train_rows: {}, num_test_rows: {} }}",
-            self.num_features, self.num_labels, self.num_distinct_labels, self.num_rows, self.num_train_rows, self.num_test_rows,
+            "TextClassificationDataset {{ num_distinct_labels: {}, num_rows: {}, num_train_rows: {}, num_test_rows: {} }}",
+            self.num_distinct_labels, self.num_rows, self.num_train_rows, self.num_test_rows,
         )
     }
 }
 
-pub enum TextDatasetType {
-    TextClassification(TextClassificationDataset),
+pub struct TextPairClassificationDataset { // Train/test columns and counters for the text pair classification fine-tuning task.
+    pub text1_train: Vec<String>, // passed to Python as `text1_train`; becomes the "text1" column of the training dataset
+    pub text2_train: Vec<String>, // passed to Python as `text2_train`; becomes the "text2" column of the training dataset
+    pub class_train: Vec<String>, // passed to Python as `class_train`; the label set and label ids are derived from these strings
+    pub text1_test: Vec<String>, // passed to Python as `text1_test`; becomes the "text1" column of the test dataset
+    pub text2_test: Vec<String>, // passed to Python as `text2_test`; becomes the "text2" column of the test dataset
+    pub class_test: Vec<String>, // passed to Python as `class_test`; mapped to ids using the labels seen in `class_train`
+    pub num_features: usize, // value returned by TextDatasetType::num_features for this variant
+    pub num_labels: usize, // NOTE(review): not read anywhere in the visible code; presumably the number of label columns, confirm
+    pub num_rows: usize, // printed by Display; presumably train + test rows, TODO confirm against the snapshot builder
+    pub num_train_rows: usize, // printed by Display; presumably the length of the *_train columns, TODO confirm
+    pub num_test_rows: usize, // printed by Display; presumably the length of the *_test columns, TODO confirm
+    pub num_distinct_labels: usize, // printed by Display; presumably the count of unique class strings, TODO confirm
 }
 
-impl TextDatasetType {
-    pub fn num_features(&self) -> usize {
-        match self {
-            TextDatasetType::TextClassification(dataset) => dataset.num_features,
-        }
+impl Display for TextPairClassificationDataset { // One-line summary of the dataset's counters; the text and class columns are never printed.
+    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(
+            f,
+            "TextPairClassificationDataset {{ num_distinct_labels: {}, num_rows: {}, num_train_rows: {}, num_test_rows: {} }}", // `{{` and `}}` emit literal braces; num_features and num_labels are not included
+            self.num_distinct_labels, self.num_rows, self.num_train_rows, self.num_test_rows, // argument order must match the placeholders above
+        )
     }
 }
 
+
 fn drop_table_if_exists(table_name: &str) {
     // Avoid the existence for DROP TABLE IF EXISTS warning by checking the schema for the table first
     let table_count = Spi::get_one_with_args::<i64>(

diff --git a/pgml-extension/src/orm/mod.rs b/pgml-extension/src/orm/mod.rs
@@ -15,6 +15,7 @@ pub use algorithm::Algorithm;
 pub use dataset::Dataset;
 pub use dataset::TextDatasetType;
 pub use dataset::TextClassificationDataset;
+pub use dataset::TextPairClassificationDataset;
 pub use model::Model;
 pub use project::Project;
 pub use runtime::Runtime;

diff --git a/pgml-extension/src/orm/model.rs b/pgml-extension/src/orm/model.rs
@@ -167,8 +167,10 @@ impl Model {
         // let dataset = snapshot.text_classification_dataset(dataset_args);
         let dataset = if project.task == Task::text_classification {
             TextDatasetType::TextClassification(snapshot.text_classification_dataset(dataset_args))
+        } else if project.task == Task::text_pair_classification {
+            TextDatasetType::TextPairClassification(snapshot.text_pair_classification_dataset(dataset_args))
         } else {
-            TextDatasetType::TextClassification(snapshot.text_classification_dataset(dataset_args))
+            panic!("Unsupported task for finetuning")
         };
 
         // Create the model record.
@@ -229,6 +231,13 @@ impl Model {
                 };
 
             }
+            TextDatasetType::TextPairClassification(dataset) => {
+                metrics = match transformers::finetune_text_pair_classification(&project.task, dataset, &model.hyperparams, &path) {
+                Ok(metrics) => metrics,
+                Err(e) => error!("{e}"),
+                };
+
+            }
         };
 
         model.metrics = Some(JsonB(json!(metrics)));