Almost ready for 0.9.1

postgresml · SilasMarvin · Aug 30, 2023 · Aug 30, 2023 · Aug 30, 2023 · Aug 30, 2023
commit 4462308f6ed976dbffe0cc30c88ce93d10d591be
diff --git a/pgml-sdks/pgml/Cargo.lock b/pgml-sdks/pgml/Cargo.lock
diff --git a/pgml-sdks/pgml/Cargo.toml b/pgml-sdks/pgml/Cargo.toml
@@ -15,7 +15,7 @@ crate-type = ["lib", "cdylib"]
 
 [dependencies]
 rust_bridge = {path = "../rust-bridge/rust-bridge", version = "0.1.0"}
-sqlx = { version = "0.6", features = [ "runtime-tokio-rustls", "postgres", "json", "time", "uuid", "chrono"] }
+sqlx = { version = "0.7", features = [ "runtime-tokio-rustls", "postgres", "json", "time", "uuid", "chrono"] }
 serde_json = "1.0.9"
 anyhow = "1.0.9"
 tokio = { version = "1.28.2", features = [ "macros" ] }
@@ -26,10 +26,10 @@ neon = { version = "0.10", optional = true, default-features = false, features =
 itertools = "0.10.5"
 uuid = {version = "1.3.3", features = ["v4", "serde"] }
 md5 = "0.7.0"
-sea-query = { version = "0.28.5", features = ["attr", "thread-safe", "with-json", "postgres-array"] }
-sea-query-binder = { version = "0.3.1", features = ["sqlx-postgres", "with-json", "postgres-array"] }
+sea-query = { version = "0.30.1", features = ["attr", "thread-safe", "with-json", "postgres-array"] }
+sea-query-binder = { version = "0.5.0", features = ["sqlx-postgres", "with-json", "postgres-array"] }
 regex = "1.8.4"
-reqwest = { version = "0.11", features = ["json"] }
+reqwest = { version = "0.11", features = ["json", "native-tls-vendored"] }
 async-trait = "0.1.71"
 tracing = { version = "0.1.37" }
 tracing-subscriber = { version = "0.3.17", features = ["json"] }

diff --git a/pgml-sdks/pgml/javascript/README.md b/pgml-sdks/pgml/javascript/README.md
@@ -208,9 +208,11 @@ const collection = pgml.newCollection("test_collection", CUSTOM_DATABASE_URL)
 
 ### Upserting Documents
 
-Documents are dictionaries with two required keys: `id` and `text`. All other keys/value pairs are stored as metadata for the document.
+The `upsert_documents` method can be used to insert new documents and update existing documents.
 
-**Upsert documents with metadata**
+New documents are dictionaries with two required keys: `id` and `text`. All other key/value pairs are stored as metadata for the document.
+
+**Upsert new documents with metadata**
 ```javascript
 const documents = [
     {
@@ -228,6 +230,98 @@ const collection = pgml.newCollection("test_collection")
 await collection.upsert_documents(documents)
 ```
 
+Document metadata can be updated by upserting the document without the `text` key.
+
+**Update document metadata**
+```javascript
+documents = [
+    {
+        id: "Document 1",
+        random_key: "this will be NEW metadata for the document"
+    },
+    {
+        id: "Document 2",
+        random_key: "this will be NEW metadata for the document"
+    }
+]
+collection = pgml.newCollection("test_collection")
+await collection.upsert_documents(documents)
+```
+
+### Getting Documents
+
+Documents can be retrieved using the `get_documents` method on the collection object.
+
+**Get the first 100 documents**
+```javascript
+collection = pgml.newCollection("test_collection")
+documents = await collection.get_documents({ limit: 100 })
+```
+
+#### Pagination
+
+The JavaScript SDK supports limit-offset pagination and keyset pagination.
+
+**Limit-Offset pagination**
+```javascript
+collection = pgml.newCollection("test_collection")
+documents = await collection.get_documents({ limit: 100, offset: 10 })
+```
+
+**Keyset pagination**
+```javascript
+collection = pgml.newCollection("test_collection")
+documents = await collection.get_documents({ limit: 100, last_row_id: 10 })
+```
+
+The `last_row_id` can be taken from the `row_id` field in the returned document's dictionary.
+
+#### Filtering
+
+Metadata and full text filtering are supported just like they are in vector recall.
+
+**Metadata and full text filtering**
+```javascript
+collection = pgml.newCollection("test_collection")
+documents = await collection.get_documents({
+    limit: 100,
+    offset: 10,
+    filter: {
+        metadata: {
+            id: {
+                $eq: 1
+            }
+        },
+        full_text_search: {
+            configuration: "english",
+            text: "Some full text query"
+        }
+    }
+})
+
+```
+
+### Deleting Documents
+
+Documents can be deleted with the `delete_documents` method on the collection object.
+
+Metadata and full text filtering are supported just like they are in vector recall.
+
+```javascript
+collection = pgml.newCollection("test_collection")
+documents = await collection.delete_documents({
+    metadata: {
+        id: {
+            $eq: 1
+        }
+    },
+    full_text_search: {
+        configuration: "english",
+        text: "Some full text query"
+    }
+})
+```
+
 ### Searching Collections
 
 The JavaScript SDK is specifically designed to provide powerful, flexible vector search.
@@ -326,7 +420,7 @@ const results = await collection.query()
     .fetch_all()
 ```
 
-The above query would filter out all documents that do not have a key `special` with a value `True` or (have a key `uuid` equal to 1 and a key `index` less than 100).
+The above query would filter out all documents that do not have a key `special` with a value `true` or (have a key `uuid` equal to 1 and a key `index` less than 100).
 
 #### Full Text Filtering
 
@@ -418,7 +512,7 @@ const model = pgml.newModel()
 const splitter = pgml.newSplitter()
 const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
     "full_text_search": {
-        active: True,
+        active: true,
         configuration: "english"
     }
 })

diff --git a/pgml-sdks/pgml/javascript/package.json b/pgml-sdks/pgml/javascript/package.json
@@ -1,6 +1,6 @@
 {
   "name": "pgml",
-  "version": "0.9.0",
+  "version": "0.9.1",
   "description": "Open Source Alternative for Building End-to-End Vector Search Applications without OpenAI & Pinecone",
   "keywords": ["postgres", "machine learning", "vector databases", "embeddings"],
   "main": "index.js",

diff --git a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts
@@ -21,6 +21,7 @@ const generate_dummy_documents = (count: number) => {
       project: "a10",
       uuid: i * 10,
       floating_uuid: i * 1.1,
+      test: null,
       name: `Test Document ${i}`,
     });
   }
@@ -156,3 +157,66 @@ it("pipeline to dict", async () => {
   expect(pipeline_dict["name"]).toBe("test_j_p_ptd_0");
   await collection.archive();
 });
+
+///////////////////////////////////////////////////
+// Test document related functions ////////////////
+///////////////////////////////////////////////////
+
+it("can upsert and get documents", async () => {
+  let model = pgml.newModel();
+  let splitter = pgml.newSplitter();
+  let pipeline = pgml.newPipeline("test_p_p_cuagd_0", model, splitter, {
+    full_text_search: { active: true, configuration: "english" },
+  });
+  let collection = pgml.newCollection("test_p_c_cuagd_1");
+  await collection.add_pipeline(pipeline);
+  await collection.upsert_documents(generate_dummy_documents(10));
+
+  let documents = await collection.get_documents();
+  expect(documents).toHaveLength(10);
+
+  documents = await collection.get_documents({
+    offset: 1,
+    limit: 2,
+    filter: { metadata: { id: { $gt: 0 } } },
+  });
+  expect(documents).toHaveLength(2);
+  expect(documents[0]["document"]["id"]).toBe(2);
+  let last_row_id = documents[1]["row_id"];
+
+  documents = await collection.get_documents({
+    filter: {
+      metadata: { id: { $gt: 3 } },
+      full_text_search: { configuration: "english", text: "4" },
+    },
+    last_row_id: last_row_id,
+  });
+  expect(documents).toHaveLength(1);
+  expect(documents[0]["document"]["id"]).toBe(4);
+
+  await collection.archive();
+});
+
+it("can delete documents", async () => {
+  let model = pgml.newModel();
+  let splitter = pgml.newSplitter();
+  let pipeline = pgml.newPipeline(
+    "test_p_p_cdd_0",
+    model,
+    splitter,
+
+    { full_text_search: { active: true, configuration: "english" } },
+  );
+  let collection = pgml.newCollection("test_p_c_cdd_2");
+  await collection.add_pipeline(pipeline);
+  await collection.upsert_documents(generate_dummy_documents(3));
+  await collection.delete_documents({
+    metadata: { id: { $gte: 0 } },
+    full_text_search: { configuration: "english", text: "0" },
+  });
+  let documents = await collection.get_documents();
+  expect(documents).toHaveLength(2);
+  expect(documents[0]["document"]["id"]).toBe(1);
+
+  await collection.archive();
+});
diff --git a/pgml-sdks/pgml/pyproject.toml b/pgml-sdks/pgml/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "maturin"
 [project]
 name = "pgml"
 requires-python = ">=3.7"
-version = "0.9.0"
+version = "0.9.1"
 description = "Python SDK is designed to facilitate the development of scalable vector search applications on PostgreSQL databases."
 authors = [
   {name = "PostgresML",  email = "team@postgresml.org"},

diff --git a/pgml-sdks/pgml/python/README.md b/pgml-sdks/pgml/python/README.md
@@ -213,9 +213,11 @@ collection = Collection("test_collection", CUSTOM_DATABASE_URL)
 
 ### Upserting Documents
 
-Documents are dictionaries with two required keys: `id` and `text`. All other keys/value pairs are stored as metadata for the document.
+The `upsert_documents` method can be used to insert new documents and update existing documents.
 
-**Upsert documents with metadata**
+New documents are dictionaries with two required keys: `id` and `text`. All other key/value pairs are stored as metadata for the document.
+
+**Upsert new documents with metadata**
 ```python
 documents = [
     {
@@ -233,6 +235,97 @@ collection = Collection("test_collection")
 await collection.upsert_documents(documents)
 ```
 
+Document metadata can be updated by upserting the document without the `text` key.
+
+**Update document metadata**
+```python
+documents = [
+    {
+        "id": "Document 1",
+        "random_key": "this will be NEW metadata for the document"
+    },
+    {
+        "id": "Document 2",
+        "random_key": "this will be NEW metadata for the document"
+    }
+]
+collection = Collection("test_collection")
+await collection.upsert_documents(documents)
+```
+
+### Getting Documents
+
+Documents can be retrieved using the `get_documents` method on the collection object.
+
+**Get the first 100 documents**
+```python
+collection = Collection("test_collection")
+documents = await collection.get_documents({ "limit": 100 })
+```
+
+#### Pagination
+
+The Python SDK supports limit-offset pagination and keyset pagination.
+
+**Limit-Offset pagination**
+```python
+collection = Collection("test_collection")
+documents = await collection.get_documents({ "limit": 100, "offset": 10 })
+```
+
+**Keyset pagination**
+```python
+collection = Collection("test_collection")
+documents = await collection.get_documents({ "limit": 100, "last_row_id": 10 })
+```
+
+The `last_row_id` can be taken from the `row_id` field in the returned document's dictionary.
+
+#### Filtering
+
+Metadata and full text filtering are supported just like they are in vector recall.
+
+**Metadata and full text filtering**
+```python
+collection = Collection("test_collection")
+documents = await collection.get_documents({
+    "limit": 100,
+    "offset": 10,
+    "filter": {
+        "metadata": {
+            "id": {
+                "$eq": 1
+            }
+        },
+        "full_text_search": {
+            "configuration": "english",
+            "text": "Some full text query"
+        }
+    }
+})
+
+```
+
+### Deleting Documents
+
+Documents can be deleted with the `delete_documents` method on the collection object.
+
+Metadata and full text filtering are supported just like they are in vector recall.
+
+```python
+documents = await collection.delete_documents({
+    "metadata": {
+        "id": {
+            "$eq": 1
+        }
+    },
+    "full_text_search": {
+        "configuration": "english",
+        "text": "Some full text query"
+    }
+})
+```
+
 ### Searching Collections
 
 The Python SDK is specifically designed to provide powerful, flexible vector search.
@@ -350,7 +443,7 @@ results = (
     .vector_recall("Here is some query", pipeline)
     .limit(10)
     .filter({
-        "full_text": {
+        "full_text_search": {
             "configuration": "english",
             "text": "Match Me"
         }