From 97398a2e1dc2c0e5d4ef6606d619645a1323d848 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 4 Jun 2024 16:46:55 -0700 Subject: [PATCH 1/4] Periodic commit --- pgml-cms/docs/api/client-sdk/README.md | 159 ++++++++++- pgml-cms/docs/api/client-sdk/collections.md | 233 +++++++++++++++++ pgml-cms/docs/api/client-sdk/pipelines.md | 196 +++++++++++++- pgml-cms/docs/api/client-sdk/search.md | 275 ++++++++++++++++++-- 4 files changed, 836 insertions(+), 27 deletions(-) diff --git a/pgml-cms/docs/api/client-sdk/README.md b/pgml-cms/docs/api/client-sdk/README.md index 866610b92..0ccddb9f0 100644 --- a/pgml-cms/docs/api/client-sdk/README.md +++ b/pgml-cms/docs/api/client-sdk/README.md @@ -12,17 +12,39 @@ The client SDK can be installed using standard package managers for JavaScript, Installing the SDK into your project is as simple as: {% tabs %} -{% tab title="JavaScript " %} +{% tab title="JavaScript" %} ```bash npm i pgml ``` {% endtab %} -{% tab title="Python " %} +{% tab title="Python" %} ```bash pip install pgml ``` {% endtab %} + +{% tab title="Rust" %} +```bash +cargo add pgml +``` +{% endtab %} + +{% tab title="C" %} + +First clone the `postgresml` repository and navigate to the `pgml-sdks/pgml/c` directory: +```bash +git clone https://github.com/postgresml/postgresml +cd postgresml/pgml-sdks/pgml/c +``` + +Then build the bindings +```bash +make bindings +``` + +This will generate the `pgml.h` file and a `.so` on Linux and `.dylib` on MacOS. +{% endtab %} {% endtabs %} ## Getting started To connect the SDK to your database, you need to set the `PGML_DATABASE_URL` environment variable: ```bash export PGML_DATABASE_URL=postgres://user:password@sql.cloud.postgresml.org:6432/your_database ``` ### Create a collection -The SDK is written in asynchronous code, so you need to run it inside an async runtime. Both Python and JavaScript support async functions natively. +The SDK is written in asynchronous code, so you need to run it inside an async runtime. Python, JavaScript, and Rust all support async functions natively. 
{% tabs %} -{% tab title="JavaScript " %} +{% tab title="JavaScript" %} ```javascript const pgml = require("pgml"); @@ -63,6 +85,28 @@ async def main(): collection = Collection("sample_collection") ``` {% endtab %} + +{% tab title="Rust" %} +```rust +use pgml::{Collection, Pipeline}; + +#[tokio::main] +async fn main() -> Result<(), Box> { + let mut collection = Collection::new("sample_collection", None)?; +} +``` +{% endtab %} + +{% tab title="C" %} +```c +#include +#include "pgml.h" + +int main() { + CollectionC * collection = pgml_collectionc_new("sample_collection", NULL); +} +``` +{% endtab %} {% endtabs %} The above example imports the `pgml` module and creates a collection object. By itself, the collection only tracks document contents and identifiers, but once we add a pipeline, we can instruct the SDK to perform additional tasks when documents and are inserted and retrieved. @@ -93,7 +137,7 @@ await collection.add_pipeline(pipeline); ```python # Add this code to the end of the main function from the above example. pipeline = Pipeline( - "test_pipeline", + "sample_pipeline", { "text": { "splitter": { "model": "recursive_character" }, @@ -107,6 +151,37 @@ pipeline = Pipeline( await collection.add_pipeline(pipeline) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +// Add this code to the end of the main function from the above example. +let mut pipeline = Pipeline::new( + "sample_pipeline", + Some( + serde_json::json!({ + "text": { + "splitter": { "model": "recursive_character" }, + "semantic_search": { + "model": "Alibaba-NLP/gte-base-en-v1.5", + }, + }, + }) + .into(), + ), +)?; + +collection.add_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +// Add this code to the end of the main function from the above example. 
+PipelineC * pipeline = pgml_pipelinec_new("sample_pipeline", "{\"text\": {\"splitter\": {\"model\": \"recursive_character\"},\"semantic_search\": {\"model\": \"Alibaba-NLP/gte-base-en-v1.5\"}}}"); + +pgml_collectionc_add_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} The pipeline configuration is a key/value object, where the key is the name of a column in a document, and the value is the action the SDK should perform on that column. @@ -153,9 +228,36 @@ documents = [ await collection.upsert_documents(documents) ``` {% endtab %} -{% endtabs %} -If the same document `id` is used, the SDK computes the difference between existing and new documents and only updates the chunks that have changed. +{% tab title="Rust" %} +```rust +// Add this code to the end of the main function in the above example. +let documents = vec![ + serde_json::json!({ + "id": "Document One", + "text": "document one contents...", + }) + .into(), + serde_json::json!({ + "id": "Document Two", + "text": "document two contents...", + }) + .into(), +]; + +collection.upsert_documents(documents, None).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +// Add this code to the end of the main function in the above example. +char * documents_to_upsert[2] = {"{\"id\": \"Document One\", \"text\": \"document one contents...\"}", "{\"id\": \"Document Two\", \"text\": \"document two contents...\"}"}; + +pgml_collectionc_upsert_documents(collection, documents_to_upsert, 2, NULL); +``` +{% endtab %} +{% endtabs %} ### Search documents @@ -203,6 +305,47 @@ results = await collection.vector_search( print(results) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +// Add this code to the end of the main function in the above example. 
+let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "text": { + "query": "Something about a document...", + }, + }, + }, + "limit": 2, + }) + .into(), + &mut pipeline, + ) + .await?; + +println!("{:?}", results); + +Ok(()) +``` +{% endtab %} + +{% tab title="C" %} +```c +// Add this code to the end of the main function in the above example. +r_size = 0; +char** results = pgml_collectionc_vector_search(collection, "{\"query\": {\"fields\": {\"text\": {\"query\": \"Something about a document...\"}}}, \"limit\": 2}", pipeline, &r_size); +printf("\n\nPrinting results:\n"); +for (i = 0; i < r_size; ++i) { + printf("Result %u -> %s\n", i, results[i]); +} + +pgml_pipelinec_delete(pipeline); +pgml_collectionc_delete(collection); +``` +{% endtab %} {% endtabs %} We are using built-in vector search, powered by embeddings and the PostgresML [pgml.embed()](../sql-extension/pgml.embed) function, which embeds the `query` argument, compares it to the embeddings stored in the database, and returns the top two results, ranked by cosine similarity. @@ -228,6 +371,8 @@ if __name__ == "__main__": {% endtab %} {% endtabs %} +Note that the `Rust` and `C` examples do not require any additional code to run correctly. 
+ Once you run the example, you should see something like this in the terminal: ```bash diff --git a/pgml-cms/docs/api/client-sdk/collections.md b/pgml-cms/docs/api/client-sdk/collections.md index 14c64ad5c..ebd63afca 100644 --- a/pgml-cms/docs/api/client-sdk/collections.md +++ b/pgml-cms/docs/api/client-sdk/collections.md @@ -26,6 +26,18 @@ const collection = pgml.newCollection("test_collection") collection = Collection("test_collection") ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut collection = Collection::new("test_collection", None)?; +``` +{% endtab %} + +{% tab title="C" %} +```c +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +``` +{% endtab %} {% endtabs %} ### Custom `PGML_DATABASE_URL` @@ -44,6 +56,18 @@ const collection = pgml.newCollection("test_collection", CUSTOM_DATABASE_URL) collection = Collection("test_collection", CUSTOM_DATABASE_URL) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut collection = Collection::new("test_collection", Some(CUSTOM_DATABASE_URL))?; +``` +{% endtab %} + +{% tab title="C" %} +```c +CollectionC * collection = pgml_collectionc_new("test_collection", CUSTOM_DATABASE_URL); +``` +{% endtab %} {% endtabs %} ## Upserting Documents @@ -90,6 +114,38 @@ documents = [ await collection.upsert_documents(documents) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents: Vec = vec![ + serde_json::json!({ + "id": "document_one", + "title": "Document One", + "text": "Here are the contents of Document 1", + "random_key": "here is some random data", + }) + .into(), + serde_json::json!({ + "id": "document_two", + "title": "Document Two", + "text": "Here are the contents of Document 2", + "random_key": "here is some random data", + }) + .into(), +]; +collection.upsert_documents(documents, None).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +char * documents[2] = { + "{\"id\": \"document_one\", \"title\": \"Document One\", \"text\": \"Here are the contents of Document 
1\", \"random_key\": \"here is some random data\"}", + "{\"id\": \"document_two\", \"title\": \"Document Two\", \"text\": \"Here are the contents of Document 2\", \"random_key\": \"here is some random data\"}" +}; +pgml_collectionc_upsert_documents(collection, documents, 2, NULL); +``` +{% endtab %} {% endtabs %} Documents can be replaced by upserting documents with the same `id`. @@ -134,6 +190,38 @@ documents = [ await collection.upsert_documents(documents) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents: Vec = vec![ + serde_json::json!({ + "id": "document_one", + "title": "Document One", + "text": "Here is some new text for document one", + "random_key": "here is some random data", + }) + .into(), + serde_json::json!({ + "id": "document_two", + "title": "Document Two", + "text": "Here is some new text for document two", + "random_key": "here is some random data", + }) + .into(), +]; +collection.upsert_documents(documents, None).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +char * documents[2] = { + "{\"id\": \"document_one\", \"title\": \"Document One\", \"text\": \"Here is some new text for document one\", \"random_key\": \"here is some random data\"}", + "{\"id\": \"document_two\", \"title\": \"Document Two\", \"text\": \"Here is some new text for document two\", \"random_key\": \"here is some random data\"}" +}; +pgml_collectionc_upsert_documents(collection, documents, 2, NULL); +``` +{% endtab %} {% endtabs %} Documents can be merged by setting the `merge` option. On conflict, new document keys will override old document keys. 
@@ -176,6 +264,38 @@ documents = [ await collection.upsert_documents(documents, {"merge": True}) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents: Vec = vec![ + serde_json::json!({ + "id": "document_one", + "new_key": "this will be a new key in document one", + "random_key": "this will replace old random_key" + }) + .into(), + serde_json::json!({ + "id": "document_two", + "new_key": "this will be a new key in document two", + "random_key": "this will replace old random_key" + }) + .into(), +]; +collection + .upsert_documents(documents, Some(serde_json::json!({"merge": true}).into())) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +char * documents[2] = { + "{\"id\": \"document_one\", \"new_key\": \"this will be a new key in document one\", \"random_key\": \"this will replace old random_key\"}", + "{\"id\": \"document_two\", \"new_key\": \"this will be a new key in document two\", \"random_key\": \"this will replace old random_key\"}" +}; +pgml_collectionc_upsert_documents(collection, documents, 2, "{\"merge\": true}"); +``` +{% endtab %} {% endtabs %} ## Getting Documents @@ -194,6 +314,21 @@ const documents = await collection.get_documents({limit: 100 }) documents = await collection.get_documents({ "limit": 100 }) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .get_documents(Some(serde_json::json!({"limit": 100}).into())) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +unsigned long r_size = 0; +char** documents = pgml_collectionc_get_documents(collection, "{\"limit\": 100}", &r_size); +``` +{% endtab %} {% endtabs %} ### Paginating Documents @@ -214,6 +349,21 @@ const documents = await collection.get_documents({ limit: 100, offset: 10 }) documents = await collection.get_documents({ "limit": 100, "offset": 10 }) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .get_documents(Some(serde_json::json!({"limit": 100, "offset": 10}).into())) + .await?; +``` +{% 
endtab %} + +{% tab title="C" %} +```c +unsigned long r_size = 0; +char** documents = pgml_collectionc_get_documents(collection, "{\"limit\": 100, \"offset\": 10}", &r_size); +``` +{% endtab %} {% endtabs %} #### Keyset Pagination @@ -230,6 +380,21 @@ const documents = await collection.get_documents({ limit: 100, last_row_id: 10 } documents = await collection.get_documents({ "limit": 100, "last_row_id": 10 }) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .get_documents(Some(serde_json::json!({"limit": 100, "last_row_id": 10}).into())) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +unsigned long r_size = 0; +char** documents = pgml_collectionc_get_documents(collection, "{\"limit\": 100, \"last_row_id\": 10}", &r_size); +``` +{% endtab %} {% endtabs %} The `last_row_id` can be taken from the `row_id` field in the returned document's dictionary. Keyset pagination does not currently work when specifying the `order_by` key. @@ -264,6 +429,29 @@ documents = await collection.get_documents( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .get_documents(Some( + serde_json::json!({ + "limit": 100, + "filter": { + "id": {"$eq": "document_one"}, + } + }) + .into(), + )) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +unsigned long r_size = 0; +char** documents = pgml_collectionc_get_documents(collection, "{\"limit\": 100, \"filter\": {\"id\": {\"$eq\": \"document_one\"}}}", &r_size); +``` +{% endtab %} {% endtabs %} ### Sorting Documents @@ -294,6 +482,30 @@ documents = await collection.get_documents({ }) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .get_documents(Some( + serde_json::json!({ + "limit": 100, + "offset": 10, + "order_by": { + "id": "desc" + } + }) + .into(), + )) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +unsigned long r_size = 0; +char** documents = pgml_collectionc_get_documents(collection, "{\"limit\": 100, 
\"offset\": 10, \"order_by\": {\"id\": \"desc\"}}", &r_size); +``` +{% endtab %} {% endtabs %} ### Deleting Documents @@ -320,4 +532,25 @@ documents = await collection.delete_documents( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .delete_documents( + serde_json::json!({ + "id": { + "$eq": 1 + } + }) + .into(), + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +pgml_collectionc_delete_documents(collection, "{\"id\": { \"$eq\": 1}}"); +``` +{% endtab %} {% endtabs %} diff --git a/pgml-cms/docs/api/client-sdk/pipelines.md b/pgml-cms/docs/api/client-sdk/pipelines.md index c51987cad..6c3ed57cd 100644 --- a/pgml-cms/docs/api/client-sdk/pipelines.md +++ b/pgml-cms/docs/api/client-sdk/pipelines.md @@ -57,6 +57,48 @@ pipeline = Pipeline( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new( + "test_pipeline", + Some( + serde_json::json!({ + "title": { + "full_text_search": {"configuration": "english"}, + }, + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "Alibaba-NLP/gte-base-en-v1.5", + }, + }, + }) + .into(), + ), +)?; + +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC * pipeline = pgml_pipelinec_new( + "test_pipeline", + "{\ + \"title\": {\ + \"full_text_search\": {\"configuration\": \"english\"},\ + },\ + \"body\": {\ + \"splitter\": {\"model\": \"recursive_character\"},\ + \"semantic_search\": {\ + \"model\": \"Alibaba-NLP/gte-base-en-v1.5\"\ + }\ + }\ + }" +); +``` +{% endtab %} {% endtabs %} This `Pipeline` does two things. For each document in the `Collection`, it converts all `title`s into tsvectors enabling full text search, and splits and embeds the `body` text enabling semantic search using vectors. This kind of `Pipeline` would be great for site search utilizing hybrid keyword and semantic search. 
@@ -92,6 +134,42 @@ pipeline = Pipeline( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new( + "test_pipeline", + Some( + serde_json::json!({ + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "Alibaba-NLP/gte-base-en-v1.5", + }, + }, + }) + .into(), + ), +)?; + +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC * pipeline = pgml_pipelinec_new( + "test_pipeline", + "{\ + \"body\": {\ + \"splitter\": {\"model\": \"recursive_character\"},\ + \"semantic_search\": {\ + \"model\": \"Alibaba-NLP/gte-base-en-v1.5\"\ + }\ + }\ + }" +); +``` +{% endtab %} {% endtabs %} This `Pipeline` splits and embeds the `body` text enabling semantic search using vectors. This is a very popular `Pipeline` for RAG. @@ -166,6 +244,44 @@ pipeline = Pipeline( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new( + "test_pipeline", + Some( + serde_json::json!({ + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "Alibaba-NLP/gte-base-en-v1.5", + "hnsw": {"m": 100, "ef_construction": 200} + }, + }, + }) + .into(), + ), +)?; + +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC * pipeline = pgml_pipelinec_new( + "test_pipeline", + "{\ + \"body\": {\ + \"splitter\": {\"model\": \"recursive_character\"},\ + \"semantic_search\": {\ + \"model\": \"Alibaba-NLP/gte-base-en-v1.5\",\ + \"hnsw\": {\"m\": 100, \"ef_construction\": 200}\ + }\ + }\ + }" +); +``` +{% endtab %} {% endtabs %} ## Adding Pipelines to a Collection @@ -184,6 +300,18 @@ await collection.add_pipeline(pipeline) await collection.add_pipeline(pipeline) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +collection.add_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +pgml_collectionc_add_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} > Note: After a `Pipeline` has been added to a `Collection` instances of the 
`Pipeline` object can be created without specifying a schema: @@ -200,6 +328,18 @@ const pipeline = pgml.newPipeline("test_pipeline") pipeline = Pipeline("test_pipeline") ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new("test_pipeline", None)?; +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC * pipeline = pgml_pipelinec_new("test_pipeline", NULL); +``` +{% endtab %} {% endtabs %} ## Searching with Pipelines @@ -231,6 +371,22 @@ collection = Collection("test_collection") await collection.disable_pipeline(pipeline) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut collection = Collection::new("test_collection", None)?; +let mut pipeline = Pipeline::new("test_pipeline", None)?; +collection.disable_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +PipelineC * pipeline = pgml_pipelinec_new("test_pipeline", NULL); +pgml_collectionc_disable_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} Disabling a `Pipeline` prevents it from running automatically, but leaves all tsvectors, chunks, and embeddings already created by that `Pipeline` in the database. @@ -255,6 +411,22 @@ collection = Collection("test_collection") await collection.enable_pipeline(pipeline) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut collection = Collection::new("test_collection", None)?; +let mut pipeline = Pipeline::new("test_pipeline", None)?; +collection.enable_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +PipelineC * pipeline = pgml_pipelinec_new("test_pipeline", NULL); +pgml_collectionc_enable_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} Enabling a `Pipeline` will cause it to automatically run on all documents it may have missed while disabled. 
@@ -263,10 +435,10 @@ Enabling a `Pipeline` will cause it to automatically run on all documents it may {% tabs %} {% tab title="JavaScript" %} -
const pipeline = pgml.newPipeline("test_pipeline")
-const collection = pgml.newCollection("test_collection")
-await collection.remove_pipeline(pipeline)
-
+```javascript +const pipeline = pgml.newPipeline("test_pipeline") +const collection = pgml.newCollection("test_collection") +await collection.remove_pipeline(pipeline) {% endtab %} {% tab title="Python" %} @@ -276,6 +448,22 @@ collection = Collection("test_collection") await collection.remove_pipeline(pipeline) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut collection = Collection::new("test_collection", None)?; +let mut pipeline = Pipeline::new("test_pipeline", None)?; +collection.remove_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +PipelineC * pipeline = pgml_pipelinec_new("test_pipeline", NULL); +pgml_collectionc_remove_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} Removing a `Pipeline` deletes it and all associated data from the database. Removed `Pipelines` cannot be re-enabled but can be recreated. diff --git a/pgml-cms/docs/api/client-sdk/search.md b/pgml-cms/docs/api/client-sdk/search.md index 8318a8bee..3fc564c55 100644 --- a/pgml-cms/docs/api/client-sdk/search.md +++ b/pgml-cms/docs/api/client-sdk/search.md @@ -10,14 +10,14 @@ This section will assume we have previously ran the following code: const pipeline = pgml.newPipeline("test_pipeline", { abstract: { semantic_search: { - model: "Alibaba-NLP/gte-base-en-v1.5", + model: "mixedbread-ai/mxbai-embed-large-v1", }, full_text_search: { configuration: "english" }, }, body: { splitter: { model: "recursive_character" }, semantic_search: { - model: "Alibaba-NLP/gte-base-en-v1.5", + model: "mixedbread-ai/mxbai-embed-large-v1", }, }, }); @@ -33,19 +33,70 @@ pipeline = Pipeline( { "abstract": { "semantic_search": { - "model": "Alibaba-NLP/gte-base-en-v1.5", + "model": "mixedbread-ai/mxbai-embed-large-v1", }, "full_text_search": {"configuration": "english"}, }, "body": { "splitter": {"model": "recursive_character"}, "semantic_search": { - "model": 
"Alibaba-NLP/gte-base-en-v1.5", + "model": "mixedbread-ai/mxbai-embed-large-v1", }, }, }, ) collection = Collection("test_collection") +await collection.add_pipeline(pipeline); +``` +{% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new( + "test_pipeline", + Some( + serde_json::json!( + { + "abstract": { + "semantic_search": { + "model": "mixedbread-ai/mxbai-embed-large-v1", + }, + "full_text_search": {"configuration": "english"}, + }, + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "mixedbread-ai/mxbai-embed-large-v1", + }, + }, + } + ) + .into(), + ), +)?; +let mut collection = Collection::new("test_collection", None)?; +collection.add_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC *pipeline = pgml_pipelinec_new("test_pipeline", "{\ + \"abstract\": {\ + \"semantic_search\": {\ + \"model\": \"mixedbread-ai/mxbai-embed-large-v1\"\ + },\ + \"full_text_search\": {\"configuration\": \"english\"}\ + },\ + \"body\": {\ + \"splitter\": {\"model\": \"recursive_character\"},\ + \"semantic_search\": {\ + \"model\": \"mixedbread-ai/mxbai-embed-large-v1\"\ + }\ + }\ +}"); +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +pgml_collectionc_add_pipeline(collection, pipeline); ``` {% endtab %} {% endtabs %} @@ -63,8 +114,8 @@ const results = await collection.vector_search( fields: { body: { query: "What is the best database?", parameters: { - instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", + prompt: + "Represent this sentence for searching relevant passages: ", } }, }, @@ -85,7 +136,7 @@ results = await collection.vector_search( "body": { "query": "What is the best database?", "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + "prompt": "Represent this sentence for searching relevant passages: ", }, }, }, @@ -96,9 +147,56 @@ results = await 
collection.vector_search( ) ``` {% endtab %} +{% tab title="Rust" %} +```rust +let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "body": { + "query": "What is the best database?", + "parameters": { + "prompt": "Represent this sentence for searching relevant passages: ", + }, + }, + }, + }, + "limit": 5, + }) + .into(), + &mut pipeline, + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +r_size = 0; +char **results = pgml_collectionc_vector_search(collection, "{\ + \"query\": {\ + \"fields\": {\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"parameters\": {\ + \"prompt\": \"Represent this sentence for searching relevant passages: \"\ + }\ + }\ + }\ + },\ + \"limit\": 5\ +}", +pipeline, &r_size); +``` +{% endtab %} {% endtabs %} -Let's break this down. `vector_search` takes in a `JSON` object and a `Pipeline`. The `JSON` object currently supports two keys: `query` and `limit` . The `limit` limits how many chunks should be returned, the `query` specifies the actual query to perform. Let's see another more complicated example: +Let's break this down. `vector_search` takes in a `JSON` object and a `Pipeline`. The `JSON` object currently supports two keys: `query` and `limit` . The `limit` limits how many chunks should be returned, the `query` specifies the actual query to perform. + +Note that `mixedbread-ai/mxbai-embed-large-v1` takes in a prompt when creating embeddings for searching against a corpus which we provide in the `parameters`. 
+ +Let's see another more complicated example: {% tabs %} {% tab title="JavaScript" %} @@ -115,7 +213,7 @@ const results = await collection.vector_search( body: { query: query, parameters: { instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", + "Represent this sentence for searching relevant passages: ", } }, }, @@ -141,7 +239,7 @@ results = await collection.vector_search( "body": { "query": query, "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + "instruction": "Represent this sentence for searching relevant passages: ", }, }, }, @@ -151,6 +249,59 @@ results = await collection.vector_search( pipeline, ) +``` +{% endtab %} + +{% endtab %} +{% tab title="Rust" %} +```rust +let query = "What is the best database?"; +let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "abstract": { + "query": query, + "full_text_filter": "database", + }, + "body": { + "query": query, + "parameters": { + "instruction": "Represent this sentence for searching relevant passages: ", + }, + }, + }, + }, + "limit": 5, + }) + .into(), + &mut pipeline, + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +r_size = 0; +char **results = pgml_collectionc_vector_search(collection, "{\ + \"query\": {\ + \"fields\": {\ + \"abstract\": {\ + \"query\": \"What is the best database?\",\ + \"full_text_filter\": \"database\"\ + },\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"parameters\": {\ + \"instruction\": \"Represent this sentence for searching relevant passages: \"\ + }\ + }\ + }\ + },\ + \"limit\": 5\ +}", pipeline, &r_size); ``` {% endtab %} {% endtabs %} @@ -173,7 +324,7 @@ const results = await collection.vector_search( body: { query: "What is the best database?", parameters: { instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", + "Represent this sentence for searching relevant passages: ", 
} }, }, @@ -199,7 +350,7 @@ results = await collection.vector_search( "body": { "query": "What is the best database?", "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + "instruction": "Represent this sentence for searching relevant passages: ", }, }, }, @@ -211,6 +362,52 @@ results = await collection.vector_search( ) ``` {% endtab %} + +{% endtab %} +{% tab title="Rust" %} +```rust +let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "body": { + "query": "What is the best database?", + "parameters": { + "instruction": "Represent this sentence for searching relevant passages: ", + }, + }, + }, + "filter": {"user_id": {"$eq": 1}}, + }, + "limit": 5, + }) + .into(), + &mut pipeline, + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +r_size = 0; +char **results = pgml_collectionc_vector_search(collection, "{\ + \"query\": {\ + \"fields\": {\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"parameters\": {\ + \"instruction\": \"Represent this sentence for searching relevant passages: \"\ + }\ + }\ + },\ + \"filter\": {\"user_id\": {\"$eq\": 1}}\ + },\ + \"limit\": 5\ +}", pipeline, &r_size); +``` +{% endtab %} {% endtabs %} The above query would filter out all chunks from documents that do not contain a key `user_id` equal to `1`. 
@@ -227,7 +424,7 @@ const results = await collection.vector_search( body: { query: "What is the best database?", parameters: { instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", + "Represent this sentence for searching relevant passages: ", } }, }, @@ -253,7 +450,7 @@ results = await collection.vector_search( "body": { "query": "What is the best database?", "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + "instruction": "Represent this sentence for searching relevant passages: ", }, }, }, @@ -265,6 +462,52 @@ results = await collection.vector_search( ) ``` {% endtab %} + +{% endtab %} +{% tab title="Rust" %} +```rust +let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "body": { + "query": "What is the best database?", + "parameters": { + "instruction": "Represent this sentence for searching relevant passages: ", + }, + }, + }, + "filter": {"user_id": {"$gte": 1}}, + }, + "limit": 5, + }) + .into(), + &mut pipeline, + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +r_size = 0; +char **results = pgml_collectionc_vector_search(collection, "{\ + \"query\": {\ + \"fields\": {\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"parameters\": {\ + \"instruction\": \"Represent this sentence for searching relevant passages: \"\ + }\ + }\ + },\ + \"filter\": {\"user_id\": {\"$eq\": 1}}\ + },\ + \"limit\": 5\ +}", pipeline, &r_size); +``` +{% endtab %} {% endtabs %} The above query would filter out all documents that do not contain a key `user_id` with a value greater than or equal to `1`. 
@@ -281,7 +524,7 @@ const results = await collection.vector_search( body: { query: "What is the best database?", parameters: { instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", + "Represent this sentence for searching relevant passages: ", } }, }, @@ -325,7 +568,7 @@ results = await collection.vector_search( "body": { "query": "What is the best database?", "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + "instruction": "Represent this sentence for searching relevant passages: ", }, }, }, From 7efe6d9fc8d973452381140e9c5852119033a360 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:48:35 -0700 Subject: [PATCH 2/4] Updated everything to have rust and c --- .../docs/api/client-sdk/document-search.md | 117 +++++++++++++++++- pgml-cms/docs/api/client-sdk/search.md | 59 ++++++++- 2 files changed, 169 insertions(+), 7 deletions(-) diff --git a/pgml-cms/docs/api/client-sdk/document-search.md b/pgml-cms/docs/api/client-sdk/document-search.md index cf91f95ee..4ada75d7f 100644 --- a/pgml-cms/docs/api/client-sdk/document-search.md +++ b/pgml-cms/docs/api/client-sdk/document-search.md @@ -10,14 +10,14 @@ This section will assume we have previously ran the following code: const pipeline = pgml.newPipeline("test_pipeline", { abstract: { semantic_search: { - model: "Alibaba-NLP/gte-base-en-v1.5", + model: "mixedbread-ai/mxbai-embed-large-v1", }, full_text_search: { configuration: "english" }, }, body: { splitter: { model: "recursive_character" }, semantic_search: { - model: "Alibaba-NLP/gte-base-en-v1.5", + model: "mixedbread-ai/mxbai-embed-large-v1", }, }, }); @@ -33,14 +33,14 @@ pipeline = Pipeline( { "abstract": { "semantic_search": { - "model": "Alibaba-NLP/gte-base-en-v1.5", + "model": "mixedbread-ai/mxbai-embed-large-v1", }, "full_text_search": {"configuration": "english"}, }, "body": { "splitter": {"model": 
"recursive_character"}, "semantic_search": { - "model": "Alibaba-NLP/gte-base-en-v1.5", + "model": "mixedbread-ai/mxbai-embed-large-v1", }, }, }, @@ -48,8 +48,60 @@ pipeline = Pipeline( collection = Collection("test_collection") ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new( + "test_pipeline", + Some( + serde_json::json!( + { + "abstract": { + "semantic_search": { + "model": "mixedbread-ai/mxbai-embed-large-v1", + }, + "full_text_search": {"configuration": "english"}, + }, + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "mixedbread-ai/mxbai-embed-large-v1", + }, + }, + } + ) + .into(), + ), +)?; +let mut collection = Collection::new("test_collection", None)?; +collection.add_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC *pipeline = pgml_pipelinec_new("test_pipeline", "{\ + \"abstract\": {\ + \"semantic_search\": {\ + \"model\": \"mixedbread-ai/mxbai-embed-large-v1\"\ + },\ + \"full_text_search\": {\"configuration\": \"english\"}\ + },\ + \"body\": {\ + \"splitter\": {\"model\": \"recursive_character\"},\ + \"semantic_search\": {\ + \"model\": \"mixedbread-ai/mxbai-embed-large-v1\"\ + }\ + }\ +}"); +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +pgml_collectionc_add_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} +This creates a `Pipeline` that is capable of full text search and semantic search on the `abstract` and semantic search on the `body` of documents. 
+ ## Doing Document Search {% tabs %} @@ -108,6 +160,63 @@ results = await collection.search( ) ``` {% endtab %} + + +{% tab title="Rust" %} +```rust +let results = collection + .search(serde_json::json!({ + "query": { + "full_text_search": { + "abstract": {"query": "What is the best database?", "boost": 1.2} + }, + "semantic_search": { + "abstract": { + "query": "What is the best database?", + "boost": 2.0, + }, + "body": { + "query": "What is the best database?", + "boost": 1.25, + "parameters": { + "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + }, + }, + }, + "filter": {"user_id": {"$eq": 1}}, + }, + "limit": 10, + }).into(), &mut pipeline) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +char * results = pgml_collectionc_search(collection, "{\ + \"query\": {\ + \"full_text_search\": {\ + \"abstract\": {\"query\": \"What is the best database?\", \"boost\": 1.2}\ + },\ + \"semantic_search\": {\ + \"abstract\": {\ + \"query\": \"What is the best database?\",\ + \"boost\": 2.0\ + },\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"boost\": 1.25,\ + \"parameters\": {\ + \"instruction\": \"Represent the Wikipedia question for retrieving supporting documents: \"\ + }\ + }\ + },\ + \"filter\": {\"user_id\": {\"$eq\": 1}}\ + },\ + \"limit\": 10\ +}", pipeline); +``` +{% endtab %} {% endtabs %} Just like `vector_search`, `search` takes in two arguments. The first is a `JSON` object specifying the `query` and `limit` and the second is the `Pipeline`. The `query` object can have three fields: `full_text_search`, `semantic_search` and `filter`. Both `full_text_search` and `semantic_search` function similarly. They take in the text to compare against, titled`query`, an optional `boost` parameter used to boost the effectiveness of the ranking, and `semantic_search` also takes in an optional `parameters` key which specify parameters to pass to the embedding model when embedding the passed in text. 
diff --git a/pgml-cms/docs/api/client-sdk/search.md b/pgml-cms/docs/api/client-sdk/search.md index 3fc564c55..2d5b5ce41 100644 --- a/pgml-cms/docs/api/client-sdk/search.md +++ b/pgml-cms/docs/api/client-sdk/search.md @@ -147,6 +147,7 @@ results = await collection.vector_search( ) ``` {% endtab %} + {% tab title="Rust" %} ```rust let results = collection @@ -252,7 +253,6 @@ results = await collection.vector_search( ``` {% endtab %} -{% endtab %} {% tab title="Rust" %} ```rust let query = "What is the best database?"; @@ -363,7 +363,6 @@ results = await collection.vector_search( ``` {% endtab %} -{% endtab %} {% tab title="Rust" %} ```rust let results = collection @@ -463,7 +462,6 @@ results = await collection.vector_search( ``` {% endtab %} -{% endtab %} {% tab title="Rust" %} ```rust let results = collection @@ -585,6 +583,61 @@ results = await collection.vector_search( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "body": { + "query": "What is the best database?", + "parameters": { + "instruction": "Represent this sentence for searching relevant passages: ", + }, + }, + }, + "filter": { + "$or": [ + {"$and": [{"$eq": {"user_id": 1}}, {"$lt": {"user_score": 100}}]}, + {"special": {"$ne": true}}, + ], + }, + }, + "limit": 5, + }) + .into(), + &mut pipeline, + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +r_size = 0; +char **results = pgml_collectionc_vector_search(collection, "{\ + \"query\": {\ + \"fields\": {\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"parameters\": {\ + \"instruction\": \"Represent this sentence for searching relevant passages: \"\ + }\ + }\ + },\ + \"filter\": {\ + \"$or\": [\ + {\"$and\": [{\"$eq\": {\"user_id\": 1}}, {\"$lt\": {\"user_score\": 100}}]},\ + {\"special\": {\"$ne\": true}}\ + ]\ + }\ + },\ + \"limit\": 5\ +}", pipeline, &r_size); +``` +{% endtab %} {% endtabs %} The above query would filter 
out all documents that do not have a key `special` with a value `True` or (have a key `user_id` equal to 1 and a key `user_score` less than 100). From b494857dc00f6935740f24e214fbbcd4b226ec95 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 5 Jun 2024 12:05:43 -0700 Subject: [PATCH 3/4] Rust and c docs ready to go --- pgml-cms/docs/api/client-sdk/README.md | 3 ++- pgml-cms/docs/api/client-sdk/pipelines.md | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pgml-cms/docs/api/client-sdk/README.md b/pgml-cms/docs/api/client-sdk/README.md index 0ccddb9f0..5e6fc56a0 100644 --- a/pgml-cms/docs/api/client-sdk/README.md +++ b/pgml-cms/docs/api/client-sdk/README.md @@ -89,9 +89,10 @@ async def main(): {% tab title="Rust" %} ```rust use pgml::{Collection, Pipeline}; +use anyhow::Error; #[tokio::main] -async fn main() -> Result<(), Box> { +async fn main() -> Result<(), Error> { let mut collection = Collection::new("sample_collection", None)?; } ``` diff --git a/pgml-cms/docs/api/client-sdk/pipelines.md b/pgml-cms/docs/api/client-sdk/pipelines.md index 6c3ed57cd..dccf3f2b7 100644 --- a/pgml-cms/docs/api/client-sdk/pipelines.md +++ b/pgml-cms/docs/api/client-sdk/pipelines.md @@ -439,6 +439,7 @@ Enabling a `Pipeline` will cause it to automatically run on all documents it may const pipeline = pgml.newPipeline("test_pipeline") const collection = pgml.newCollection("test_collection") await collection.remove_pipeline(pipeline) +``` {% endtab %} {% tab title="Python" %} From f9803076957752a69c3d49de3792df94ad661ac7 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 5 Jun 2024 12:24:38 -0700 Subject: [PATCH 4/4] Updated to make highlighting work --- pgml-dashboard/package-lock.json | 20 +++++++++++++++++++ pgml-dashboard/package.json | 1 + .../code_block/code_block_controller.js | 3 +++ pgml-dashboard/src/utils/markdown.rs | 2 ++ 4 files changed, 26 insertions(+) diff 
--git a/pgml-dashboard/package-lock.json b/pgml-dashboard/package-lock.json index 4fe4783c7..1da57fd91 100644 --- a/pgml-dashboard/package-lock.json +++ b/pgml-dashboard/package-lock.json @@ -5,6 +5,7 @@ "packages": { "": { "dependencies": { + "@codemirror/lang-cpp": "^6.0.2", "@codemirror/lang-javascript": "^6.2.1", "@codemirror/lang-json": "^6.0.1", "@codemirror/lang-python": "^6.1.3", @@ -46,6 +47,15 @@ "@lezer/common": "^1.1.0" } }, + "node_modules/@codemirror/lang-cpp": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/@codemirror/lang-cpp/-/lang-cpp-6.0.2.tgz", + "integrity": "sha512-6oYEYUKHvrnacXxWxYa6t4puTlbN3dgV662BDfSH8+MfjQjVmP697/KYTDOqpxgerkvoNm7q5wlFMBeX8ZMocg==", + "dependencies": { + "@codemirror/language": "^6.0.0", + "@lezer/cpp": "^1.0.0" + } + }, "node_modules/@codemirror/lang-javascript": { "version": "6.2.2", "resolved": "https://registry.npmjs.org/@codemirror/lang-javascript/-/lang-javascript-6.2.2.tgz", @@ -143,6 +153,16 @@ "resolved": "https://registry.npmjs.org/@lezer/common/-/common-1.2.1.tgz", "integrity": "sha512-yemX0ZD2xS/73llMZIK6KplkjIjf2EvAHcinDi/TfJ9hS25G0388+ClHt6/3but0oOxinTcQHJLDXh6w1crzFQ==" }, + "node_modules/@lezer/cpp": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@lezer/cpp/-/cpp-1.1.2.tgz", + "integrity": "sha512-macwKtyeUO0EW86r3xWQCzOV9/CF8imJLpJlPv3sDY57cPGeUZ8gXWOWNlJr52TVByMV3PayFQCA5SHEERDmVQ==", + "dependencies": { + "@lezer/common": "^1.2.0", + "@lezer/highlight": "^1.0.0", + "@lezer/lr": "^1.0.0" + } + }, "node_modules/@lezer/highlight": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/@lezer/highlight/-/highlight-1.2.0.tgz", diff --git a/pgml-dashboard/package.json b/pgml-dashboard/package.json index bc2860eaa..be19da478 100644 --- a/pgml-dashboard/package.json +++ b/pgml-dashboard/package.json @@ -3,6 +3,7 @@ "@codemirror/lang-javascript": "^6.2.1", "@codemirror/lang-python": "^6.1.3", "@codemirror/lang-rust": "^6.0.1", + "@codemirror/lang-cpp": "^6.0.2", 
"postgresml-lang-sql": "^6.6.3-5", "@codemirror/lang-json": "^6.0.1", "@codemirror/state": "^6.2.1", diff --git a/pgml-dashboard/src/components/code_block/code_block_controller.js b/pgml-dashboard/src/components/code_block/code_block_controller.js index 8817ea08c..25b06a97e 100644 --- a/pgml-dashboard/src/components/code_block/code_block_controller.js +++ b/pgml-dashboard/src/components/code_block/code_block_controller.js @@ -4,6 +4,7 @@ import { sql } from "postgresml-lang-sql"; import { python } from "@codemirror/lang-python"; import { javascript } from "@codemirror/lang-javascript"; import { rust } from "@codemirror/lang-rust"; +import { cpp } from "@codemirror/lang-cpp"; import { json } from "@codemirror/lang-json"; import { EditorView, ViewPlugin, Decoration } from "@codemirror/view"; import { RangeSetBuilder, Facet } from "@codemirror/state"; @@ -84,6 +85,8 @@ const language = (element) => { return rust; case "json": return json; + case "cpp": + return cpp; default: return null; } diff --git a/pgml-dashboard/src/utils/markdown.rs b/pgml-dashboard/src/utils/markdown.rs index 3863dae2e..f55e0ee7a 100644 --- a/pgml-dashboard/src/utils/markdown.rs +++ b/pgml-dashboard/src/utils/markdown.rs @@ -208,6 +208,8 @@ impl<'a> From<&str> for CodeFence<'a> { "postgresql-line-nums" } else if options.starts_with("rust") { "rust" + } else if options.starts_with("cpp") { + "cpp" } else if options.starts_with("json") { "json" } else {