From 503dc7142f49e9dbd471134dcb7b9d6e3e64bc90 Mon Sep 17 00:00:00 2001 From: ns1000 Date: Thu, 23 Nov 2023 15:32:58 +0100 Subject: [PATCH 1/2] Added pyarrow==11.0.0 to requirements to solve issue where postgres would segfault after a client session which used pgml command closes. The issue can be identified in postgres log files with the line 'arrow::fs::FinalizeS3 was not called even though S3 was initialized. This could lead to a segmentation fault at exit' --- pgml-extension/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/pgml-extension/requirements.txt b/pgml-extension/requirements.txt index 235c726ef..ba86e28f5 100644 --- a/pgml-extension/requirements.txt +++ b/pgml-extension/requirements.txt @@ -29,3 +29,4 @@ pynvml==11.5.0 transformers-stream-generator==0.0.4 optimum==1.13.2 peft==0.6.2 +pyarrow==11.0.0 From 34628687d15b1e5e3ceb6bbdd70861b2def65154 Mon Sep 17 00:00:00 2001 From: ns1000 Date: Sat, 25 Nov 2023 04:59:40 +0100 Subject: [PATCH 2/2] Added embed_batch2, which will batch process multiple strings and then return the embeddings as a table --- pgml-extension/src/api.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs index bb97b31e8..224f21caa 100644 --- a/pgml-extension/src/api.rs +++ b/pgml-extension/src/api.rs @@ -558,6 +558,26 @@ pub fn embed_batch( } } +#[cfg(all(feature = "python", not(feature = "use_as_lib")))] +#[pg_extern(immutable, parallel_safe, name = "embed2")] +pub fn embed_batch2<'a>( + transformer: &str, + inputs: Vec<&'a str>, + kwargs: default!(JsonB, "'{}'"), +) -> TableIterator<'a, (name!(text, String), name!(embedding, Vec<f32>))> { + let rows = match crate::bindings::transformers::embed(transformer, inputs.clone(), &kwargs.0) { + Ok(rows) => rows, + Err(e) => { + error!("{e}"); + } + }; + TableIterator::new( + inputs.into_iter().zip(rows.into_iter()).map(|(text, embedding)| { + (text.to_string(), embedding) + }), + ) +} + /// Clears the GPU
cache. /// /// # Arguments