Added rag-retrieval-timing-tests

SilasMarvin · SilasMarvin · commit 5f1a2dc6e800 · 2024-03-07T15:19:05.000-08:00
diff --git a/pgml-apps/rag-retrieval-timing-tests/.env.development b/pgml-apps/rag-retrieval-timing-tests/.env.development
@@ -0,0 +1,6 @@
+PINECONE_API_KEY=
+QDRANT_API_KEY=
+ZILLIZ_API_KEY=
+WCS_API_KEY=
+OPENAI_API_KEY=
+HF_TOKEN=
diff --git a/pgml-apps/rag-retrieval-timing-tests/README.md b/pgml-apps/rag-retrieval-timing-tests/README.md
@@ -0,0 +1,7 @@
+# Rag Timing Tests
+
+This script runs timing tests for common rag systems.
+
+To run it, copy `.env.development` to `.env` and set the appropriate variables in the `.env` file, install the dependencies in `requirements.txt`, and run `python3 __main__.py`.
+
+Note that this script assumes certain actions to create databases or set up "collections" have already been performed for each cloud provider. See the script for more details.
diff --git a/pgml-apps/rag-retrieval-timing-tests/__main__.py b/pgml-apps/rag-retrieval-timing-tests/__main__.py
@@ -0,0 +1,161 @@
+import time
+import asyncio
+
+import postgresml as pgl
+import zilliz_local as zl
+import pinecone_local as pl
+import qdrant_local as ql
+import openai_local as al
+import huggingface as hf
+import weaviate_local as wl
+
+TRIAL_COUNT = 2
+
+# The pairs we are testing with
+tests = [
+    {
+        "name": "PostgresML",
+        "vector_store": pgl,
+        "rag+": True,
+        "chatbot_service": al,
+        "async": True,
+    },
+    {"name": "Weaviate", "vector_store": wl, "chatbot_service": al, "rag++": True},
+    {
+        "name": "Zilliz",
+        "vector_store": zl,
+        "embedding_service": hf,
+        "chatbot_service": al,
+    },
+    {
+        "name": "Pinecone",
+        "vector_store": pl,
+        "embedding_service": hf,
+        "chatbot_service": al,
+    },
+    {
+        "name": "Qdrant",
+        "vector_store": ql,
+        "embedding_service": hf,
+        "chatbot_service": al,
+    },
+]
+
+
+# Our documents
+# We only really need to test on 2. When we search we are trying to get the first document back
+documents = [
+    {"id": "0", "metadata": {"text": "The hidden value is 1000"}},
+    {
+        "id": "1",
+        "metadata": {"text": "This is just some random text"},
+    },
+]
+
+
+def maybe_do_async(func, check_dict, *args):
+    """Call func(*args); run it via asyncio.run when check_dict["async"] is truthy."""
+    if check_dict.get("async"):
+        return asyncio.run(func(*args))
+    return func(*args)
+
+
+def do_data_upsert(name, vector_store, **kwargs):
+    """Upsert the shared documents into vector_store, embedding them first if needed."""
+    print(f"Doing Data Upsert For: {name}")
+    upsert_args = [documents]
+    if "rag++" not in kwargs and "rag+" not in kwargs:
+        texts = [doc["metadata"]["text"] for doc in documents]
+        upsert_args.append(kwargs["embedding_service"].get_embeddings(texts)[0])
+    maybe_do_async(vector_store.upsert_data, kwargs, *upsert_args)
+    print(f"Done Doing Data Upsert For: {name}\n")
+
+
+def do_normal_rag_test(name, vector_store, **kwargs):
+    """Run one timed RAG query against vector_store and return its stage timings."""
+    print(f"Doing RAG Test For: {name}")
+    query = "What is the hidden value?"
+    time_to_embed = 0
+    time_to_search = 0
+    if "rag++" in kwargs:
+        # The store produces the answer in one call; embed/search times stay 0
+        (result, time_to_complete) = maybe_do_async(
+            vector_store.get_llm_response, kwargs, query
+        )
+    else:
+        if "rag+" in kwargs:
+            # The store is searched with the raw query text; embed time stays 0
+            (context, time_to_search) = maybe_do_async(
+                vector_store.do_search, kwargs, query
+            )
+        else:
+            # Embed the query with the external service, then search by vector
+            embedder = kwargs["embedding_service"]
+            (embeddings, time_to_embed) = embedder.get_embeddings([query])
+            (context, time_to_search) = vector_store.do_search(embeddings[0])
+        chatbot = kwargs["chatbot_service"]
+        (result, time_to_complete) = chatbot.get_llm_response(query, context)
+    print(f"\tThe LLM Said: {result}")
+    time_for_retrieval = time_to_embed + time_to_search
+    total_time = time_for_retrieval + time_to_complete
+    print(f"Done Doing RAG Test For: {name}")
+    print(f"- Time to Embed: {time_to_embed}")
+    print(f"- Time to Search: {time_to_search}")
+    print(f"- Total Time for Retrieval: {time_for_retrieval}")
+    print(f"- Time for Chatbot Completion: {time_to_complete}")
+    print(f"- Total Time Taken: {total_time}\n")
+    timings = {
+        "time_to_embed": time_to_embed,
+        "time_to_search": time_to_search,
+        "time_for_retrieval": time_for_retrieval,
+        "time_to_complete": time_to_complete,
+        "total_time": total_time,
+    }
+    return timings
+
+
+if __name__ == "__main__":
+    print("----------Doing Data Setup-------------------------\n")
+    for test in tests:
+        do_data_upsert(**test)
+    print("\n----------Done Doing Data Setup------------------\n\n")
+
+    print("----------Doing Rag Tests-------------------------\n")
+    # Maps each test name to its list of per-trial timing dicts
+    stats = {}
+    # Run every test once per trial, so each trial covers all stores in order
+    for _ in range(TRIAL_COUNT):
+        for test in tests:
+            times = do_normal_rag_test(**test)
+            stats.setdefault(test["name"], []).append(times)
+    print("\n----------Done Doing Rag Tests---------------------\n")
+
+    print("------------Final Results---------------------------\n")
+    for test in tests:
+        trials = stats[test["name"]]
+        (
+            time_to_embed,
+            time_to_search,
+            time_for_retrieval,
+            time_to_complete,
+            total_time,
+        ) = [
+            sum(trial[key] for trial in trials)
+            for key in [
+                "time_to_embed",
+                "time_to_search",
+                "time_for_retrieval",
+                "time_to_complete",
+                "total_time",
+            ]
+        ]
+        print(f'Done Doing RAG Test For: {test["name"]}')
+        print(f"- Average Time to Embed: {(time_to_embed / TRIAL_COUNT):0.4f}")
+        print(f"- Average Time to Search: {(time_to_search / TRIAL_COUNT):0.4f}")
+        print(
+            f"- Average Total Time for Retrieval: {(time_for_retrieval / TRIAL_COUNT):0.4f}"
+        )
+        print(
+            f"- Average Time for Chatbot Completion: {(time_to_complete / TRIAL_COUNT):0.4f}"
+        )
+        print(f"- Average Total Time Taken: {(total_time / TRIAL_COUNT):0.4f}\n")
diff --git a/pgml-apps/rag-retrieval-timing-tests/huggingface.py b/pgml-apps/rag-retrieval-timing-tests/huggingface.py
@@ -0,0 +1,29 @@
+import requests
+import time
+import os
+import sys
+from dotenv import load_dotenv
+
+# Load our environment variables
+load_dotenv()
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+
+# Get the embedding from HuggingFace
+def get_embeddings(inputs):
+    """Request embeddings for inputs from HuggingFace; return (parsed JSON, seconds)."""
+    print("\tGetting embeddings from HuggingFace")
+    start = time.perf_counter()
+    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+    response = requests.post(
+        "https://api-inference.huggingface.co/pipeline/feature-extraction/intfloat/e5-small",
+        headers=headers,
+        json={"inputs": inputs},
+    )
+    time_taken = time.perf_counter() - start
+    print(f"\tDone getting embeddings: {time_taken:0.4f}\n")
+    parsed = response.json()
+    # An "error" key in the parsed body is treated as fatal for the whole script
+    if "error" in parsed:
+        sys.exit(parsed)
+    return (parsed, time_taken)
diff --git a/pgml-apps/rag-retrieval-timing-tests/openai_local.py b/pgml-apps/rag-retrieval-timing-tests/openai_local.py
@@ -0,0 +1,26 @@
+from openai import OpenAI
+import time
+
+# Create our OpenAI client
+client = OpenAI()
+
+
+# Get LLM response from OpenAI
+def get_llm_response(query, context):
+    """Ask gpt-3.5-turbo to answer query given context; return (answer, seconds)."""
+    print("\tGetting LLM response from OpenAI")
+    start = time.perf_counter()
+    system_prompt = (
+        f"You are a helpful assistant. Given the context, provide an answer to the user: \n{context}"
+    )
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": query},
+    ]
+    completion = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=messages,
+    )
+    time_taken = time.perf_counter() - start
+    print(f"\tDone getting the LLM response: {time_taken:0.4f}")
+    return (completion.choices[0].message.content, time_taken)
diff --git a/pgml-apps/rag-retrieval-timing-tests/pinecone_local.py b/pgml-apps/rag-retrieval-timing-tests/pinecone_local.py
@@ -0,0 +1,43 @@
+from pinecone import Pinecone, ServerlessSpec
+from dotenv import load_dotenv
+import time
+import os
+
+# Load our environment variables
+load_dotenv()
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+
+# Create our Pinecone client
+# Note we created their default index using their gcp-start region and us-central1 region
+pc = Pinecone(api_key=PINECONE_API_KEY)
+index = pc.Index("test")
+
+
+# Store some initial documents to retrieve
+def upsert_data(documents, embeddings):
+    """Upsert copies of documents (with embeddings attached) to Pinecone; return seconds taken."""
+    vectors = [{**doc, "values": vec} for doc, vec in zip(documents, embeddings)]
+    print("\tStarting PineCone upsert")
+    tic = time.perf_counter()
+    index.upsert(vectors, namespace="ns1")
+    toc = time.perf_counter()
+    time_taken_to_upsert = toc - tic
+    print(f"\tDone PineCone upsert: {time_taken_to_upsert:0.4f}")
+    return time_taken_to_upsert
+
+
+# Do cosine similarity search over our pinecone index
+def do_search(vector):
+    """Query namespace "ns1" for the closest match; return (its text, seconds taken)."""
+    print("\tDoing cosine similarity search with PineCone")
+    start = time.perf_counter()
+    results = index.query(
+        namespace="ns1",
+        vector=vector,
+        top_k=1,
+        include_metadata=True,
+    )
+    time_done = time.perf_counter() - start
+    print(f"\tDone doing cosine similarity search: {time_done:0.4f}\n")
+    top_match = results["matches"][0]
+    return (top_match["metadata"]["text"], time_done)
diff --git a/pgml-apps/rag-retrieval-timing-tests/postgresml.py b/pgml-apps/rag-retrieval-timing-tests/postgresml.py
@@ -0,0 +1,62 @@
+from pgml import Collection, Pipeline
+from dotenv import load_dotenv
+import time
+
+# Load our environment variables
+load_dotenv()
+
+# Initialize our Collection and Pipeline
+collection = Collection("test_collection")
+pipeline = Pipeline(
+    "test_pipeline",
+    {
+        "text": {
+            "semantic_search": {
+                "model": "intfloat/e5-small",
+            },
+        }
+    },
+)
+
+
+# Add the Pipeline to our collection.
+# This only needs to be done once; nothing in this module calls it automatically.
+async def setup_pipeline():
+    await collection.add_pipeline(pipeline)
+
+
+async def upsert_data(documents):
+    """Upsert the documents into the PostgresML collection as {"id", "text"} rows."""
+    rows = []
+    for document in documents:
+        rows.append({"id": document["id"], "text": document["metadata"]["text"]})
+    print("Starting PostgresML upsert")
+    start = time.perf_counter()
+    await collection.upsert_documents(rows)
+    end = time.perf_counter()
+    time_taken = end - start
+    print(f"Done PostgresML upsert: {time_taken:0.4f}\n")
+
+
+async def do_search(query):
+    """Run a vector search for query over the collection with the module-level pipeline.
+
+    Returns (the top result's "chunk" value, seconds taken).
+    """
+    print(
+        "\tDoing embedding and cosine similarity search over our PostgresML Collection"
+    )
+    start = time.perf_counter()
+    search_spec = {
+        "query": {
+            "fields": {
+                "text": {"query": query},
+            }
+        },
+        "limit": 1,
+    }
+    results = await collection.vector_search(search_spec, pipeline)
+    time_taken = time.perf_counter() - start
+    print(f"\tDone doing embedding and cosine similarity search: {time_taken:0.4f}\n")
+    best_match = results[0]
+    return (best_match["chunk"], time_taken)
diff --git a/pgml-apps/rag-retrieval-timing-tests/qdrant_local.py b/pgml-apps/rag-retrieval-timing-tests/qdrant_local.py
@@ -0,0 +1,49 @@
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, PointStruct
+from dotenv import load_dotenv
+import time
+import os
+
+# Load our environment variables
+load_dotenv()
+QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
+
+# Create our Qdrant client
+qdrant = QdrantClient(
+    url="https://059364f6-62c5-4f80-9f19-cf6d6394caae.us-east4-0.gcp.cloud.qdrant.io:6333",
+    api_key=QDRANT_API_KEY,
+)
+
+# Create our Qdrant collection
+qdrant.recreate_collection(
+    collection_name="test",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+)
+
+
+# Store some initial documents to retrieve
+def upsert_data(documents, embeddings):
+    """Upsert one point per document into the "test" collection; return seconds taken."""
+    points = []
+    for document, embedding in zip(documents, embeddings):
+        point_id = int(document["id"])
+        points.append(
+            PointStruct(id=point_id, vector=embedding, payload=document["metadata"])
+        )
+    print("\tStarting Qdrant upsert")
+    start = time.perf_counter()
+    qdrant.upsert(collection_name="test", points=points)
+    time_taken_to_upsert = time.perf_counter() - start
+    print(f"\tDone Qdrant upsert: {time_taken_to_upsert:0.4f}")
+    return time_taken_to_upsert
+
+
+# Do cosine similarity search over our Qdrant collection
+def do_search(vector):
+    """Return (the closest point's payload "text", seconds taken) for vector."""
+    print("\tDoing cosine similarity search with Qdrant")
+    tic = time.perf_counter()
+    results = qdrant.search(collection_name="test", query_vector=vector, limit=1)
+    time_done = time.perf_counter() - tic
+    print(f"\tDone doing cosine similarity search: {time_done:0.4f}\n")
+    return (results[0].payload["text"], time_done)
diff --git a/pgml-apps/rag-retrieval-timing-tests/requirements.txt b/pgml-apps/rag-retrieval-timing-tests/requirements.txt
diff --git a/pgml-apps/rag-retrieval-timing-tests/weaviate_local.py b/pgml-apps/rag-retrieval-timing-tests/weaviate_local.py
diff --git a/pgml-apps/rag-retrieval-timing-tests/zilliz_local.py b/pgml-apps/rag-retrieval-timing-tests/zilliz_local.py