Data docs #1418
@@ -0,0 +1,4 @@

```
.terraform
*.lock.hcl
*.tfstate
*.tfstate.backup
```
@@ -0,0 +1,7 @@

# Terraform configuration for pgml-rds-proxy on EC2

This is a sample Terraform deployment for running pgml-rds-proxy on EC2. It will spin up an EC2 instance with a public IP and a working security group, and install the community Docker runtime.

Once the instance is running, you can connect to it using the root key and run the pgml-rds-proxy Docker container with the correct PostgresML `DATABASE_URL`.
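As a rough sketch of that last step, run on the instance after SSHing in with the root key. The image name, tag, and flags below are assumptions rather than something this README pins down; the port matches the security group in the Terraform config, and `DATABASE_URL` is a placeholder:

```bash
# Start the proxy and point it at your PostgresML database.
# NOTE: the image name/tag and flags are illustrative assumptions; check the
# pgml-rds-proxy documentation for the published image and its options.
docker run -d \
    --name pgml-rds-proxy \
    -p 6432:6432 \
    -e DATABASE_URL="postgres://user:password@your-postgresml-host:6432/your_database" \
    ghcr.io/postgresml/pgml-rds-proxy:latest
```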
@@ -0,0 +1,84 @@

```hcl
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.46"
    }
  }

  required_version = ">= 1.2.0"
}

provider "aws" {
  region = "us-west-2"
}

data "aws_ami" "ubuntu" {
  most_recent = true

  filter {
    name   = "name"
    values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"]
  }

  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }

  owners = ["099720109477"] # Canonical
}

resource "aws_security_group" "pgml-rds-proxy" {
  egress {
    from_port        = 0
    to_port          = 0
    protocol         = "-1"
    cidr_blocks      = ["0.0.0.0/0"]
    ipv6_cidr_blocks = ["::/0"]
  }

  ingress {
    from_port        = 6432
    to_port          = 6432
    protocol         = "tcp"
    cidr_blocks      = ["0.0.0.0/0"]
    ipv6_cidr_blocks = ["::/0"]
  }

  ingress {
    from_port        = 22
    to_port          = 22
    protocol         = "tcp"
    cidr_blocks      = ["0.0.0.0/0"]
    ipv6_cidr_blocks = ["::/0"]
  }
}

resource "aws_instance" "pgml-rds-proxy" {
  ami           = data.aws_ami.ubuntu.id
  instance_type = "t3.micro"
  key_name      = var.root_key

  root_block_device {
    volume_size           = 30
    delete_on_termination = true
  }

  vpc_security_group_ids = [
    aws_security_group.pgml-rds-proxy.id,
  ]

  associate_public_ip_address = true
  user_data                   = file("${path.module}/user_data.sh")
  user_data_replace_on_change = false

  tags = {
    Name = "pgml-rds-proxy"
  }
}

variable "root_key" {
  type        = string
  description = "The name of the SSH Root Key you'd like to assign to this EC2 instance. Make sure it's a key you have access to."
}
```
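To create the instance, the standard Terraform workflow applies. The `root_key` variable must name an EC2 key pair that already exists in your AWS account; the key pair name below is only an example:

```bash
# Download the AWS provider, then create the security group and EC2 instance.
terraform init
terraform apply -var "root_key=my-existing-key-pair"
```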
@@ -0,0 +1,21 @@

```bash
#!/bin/bash
#
# Cloud init script to install Docker on an EC2 instance running Ubuntu 22.04.
#

sudo apt-get update
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc

# Add the repository to Apt sources:
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
  $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update

sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
sudo groupadd docker
sudo usermod -aG docker ubuntu
```
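Once cloud-init has finished, a quick sanity check that Docker is usable by the `ubuntu` user; the key path and instance IP are placeholders:

```bash
# The ubuntu user was added to the docker group above, so no sudo is needed.
ssh -i ~/.ssh/your-root-key.pem ubuntu@<instance-public-ip> docker info
```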
@@ -0,0 +1 @@

```
*.md.bak
```
@@ -7,8 +7,9 @@

* [Create your database](introduction/getting-started/create-your-database.md)
* [Connect your app](introduction/getting-started/connect-your-app.md)
* [Import your data](introduction/getting-started/import-your-data/README.md)
* [CSV](introduction/getting-started/import-your-data/csv.md)
* [Foreign Data Wrapper](introduction/getting-started/import-your-data/foreign-data-wrapper.md)
* [Logical replication](introduction/getting-started/import-your-data/logical-replication/README.md)
* [Foreign Data Wrappers](introduction/getting-started/import-your-data/foreign-data-wrappers.md)
* [COPY](introduction/getting-started/import-your-data/copy.md)

## API

@@ -50,7 +51,7 @@

## Product

* [Cloud Database](product/cloud-database/README.md)
* [AI Database](product/cloud-database/README.md)
* [Serverless databases](product/cloud-database/serverless-databases.md)
* [Dedicated](product/cloud-database/dedicated.md)
* [Enterprise](product/cloud-database/plans.md)

@@ -79,7 +80,7 @@

## Resources

* [FAQs](resources/faqs.md)
* [Data Storage & Retrieval](resources/data-storage-and-retrieval/README.md)
* [Data Storage & Retrieval](resources/data-storage-and-retrieval/tabular-data.md)

**Contributor:** This is a dup?

**Author:** Not a dup, README.md is currently empty.

* [Tabular data](resources/data-storage-and-retrieval/tabular-data.md)

**Contributor:** Suggested change

* [Documents](resources/data-storage-and-retrieval/documents.md)
* [Partitioning](resources/data-storage-and-retrieval/partitioning.md)
@@ -4,11 +4,11 @@ description: Setup a database and connect your application to PostgresML

# Getting Started

A PostgresML deployment consists of multiple components working in concert to provide a complete Machine Learning platform. We provide a fully managed solution in our cloud.
A PostgresML deployment consists of multiple components working in concert to provide a complete Machine Learning platform. We provide a fully managed solution in our cloud, and document a self-hosted installation in our docs.

* A PostgreSQL database, with pgml and pgvector extensions installed, including backups, metrics, logs, replicas and high availability configurations
* A PgCat pooling proxy to provide secure access and model load balancing across tens of thousands of clients
* A web application to manage deployed models and host SQL notebooks
* PostgreSQL database, with `pgml`, `pgvector` and many other extensions installed, including backups, metrics, logs, replicas and high availability
* PgCat pooler to provide secure access and model load balancing across thousands of clients
* A web application to manage deployed models and write experiments in SQL notebooks

Suggested change:

* Before: A web application to manage deployed models and write experiments in SQL notebooks
* After: A web application to manage deployed models and share experiments and analysis in SQL notebooks
@@ -4,13 +4,13 @@ description: PostgresML is compatible with all standard PostgreSQL clients

# Connect your app

You can connect to your database from any Postgres compatible client. PostgresML is intended to serve in the traditional role of an application database, along with it's extended role as an MLOps platform to make it easy to build and maintain AI applications.
You can connect to your database from any PostgreSQL-compatible client. PostgresML is intended to serve in the traditional role of an application database, along with its extended role as an MLOps platform to make it easy to build and maintain AI applications together with your application data.

## Application SDKs
## SDK
Suggested change:

* Before: ## SDK
* After: ## Client SDKs

We've waffled between singular and plural. If we want to change to singular, we'll need to update in several other places, including urls.

Not going to write "Client SDKs" ever again; that wording is just wrong. An SDK is only usable on the client by definition.

It's going to be singular going forward.

New PRs should use the existing style guide, for code or copy. We should make that update globally in a separate PR to preserve consistency.

On the separate issue: many users frequently ask about the difference between the SDK and the Extension, and how they interact. I think you're confused about the difference between an SDK and an API as commonly used, but those terms are not always 100% consistently used in industry, so we need some additional education about how we're using them.
Suggested change (outdated):

* Before: We provide a client SDK for JavaScript, Python and Rust. The SDK manages connections to the Postgres database and makes it easy to construct efficient queries for AI use cases, like managing a document collection for RAG, or building a chatbot. All of the ML & AI still happenening inside the database, with centralized operations, hardware and dependency management.
* After: We provide a client SDK for JavaScript, Python and Rust. The SDK manages connections to the Postgres database and makes it easy to construct efficient queries for AI use cases, like managing a document collection for RAG, or building a chatbot. All of the ML & AI still happens inside the database, with centralized operations, hardware and dependency management.
Suggested change (outdated):

* Before: The SDK are under rapid development to add new features and use cases, but we release non breaking changes with minor version updates in accordance with SemVer. It's easy to install into your existing application.
* After: The SDKs are under rapid development to add new features and use cases, but we release non breaking changes with minor version updates in accordance with SemVer. It's easy to install into your existing application.
Suggested change:

* Before: Our SDK comes with zero additional dependencies. The core of the SDK is written in Rust, and we provide language bindings and native packaging & distribution.
* After: Our SDK comes with zero additional dependencies, to provide the simplest and safest ML application deployment and maintenance possible. The core of the SDK is written in Rust, and we provide language bindings and native packaging & distribution.
Suggested change:

* Before: If you need to write ad-hoc queries, you can use any of these popular tools to execute SQL queries directly on your database:
* After: If you need to write ad-hoc queries, or perform administrative functions, you can use any of these popular tools to execute SQL queries directly on your database:
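For example, a minimal ad-hoc session with `psql` might look like this; the connection string is a placeholder, so use the credentials from your PostgresML deployment:

```bash
# Connect and run a one-off query directly against the database.
psql postgres://user:password@sql.cloud.postgresml.org/your_pgml_database \
    -c "SELECT version();"
```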
@@ -1,22 +1,26 @@

# Import your data

Machine learning always depends on input data, whether it's generating text with pretrained LLMs, training a retention model on customer data, or predicting session abandonment in real time. Just like any PostgreSQL database, PostgresML can be configured as the authoritative application data store, a streaming replica from some other primary, or use foreign data wrappers to query another data host on demand. Depending on how frequently your data changes and where your authoritative data resides, different methodologies imply different tradeoffs.
AI needs data, whether it's generating text with LLMs, creating embeddings, or training regression or classification models on customer data.

PostgresML can easily ingest data from your existing data stores.
Just like any PostgreSQL database, PostgresML can be configured as the primary application database, a logical replica of your primary database, or with foreign data wrappers to query your primary database on demand. Depending on how frequently your data changes and your latency requirements, one approach is better than the other.

## Static data
## Primary database

Data that changes infrequently can be easily imported into PostgresML using `COPY`. All you have to do is export your data as a CSV file, create a table in Postgres to store it, and import it using the command line.
If you're intention is to use PostgresML as your primary database, your job here is done. You can use the connection credentials provided and start building your application on top of in-database AI right away.

Suggested change:

* Before: If you're intention is to use PostgresML as your primary database, your job here is done. You can use the connection credentials provided and start building your application on top of in-database AI right away.
* After: If your intention is to use PostgresML as your primary database, your job here is done. You can use the connection credentials provided and start building your application on top of in-database AI right away.
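As a quick illustration of what "in-database AI right away" can look like, here is a sketch assuming the `pgml` extension and its `pgml.embed()` function are available on your deployment; the connection string is a placeholder and the model name is purely an example:

```bash
# Generate an embedding inside the database; no data leaves Postgres.
psql postgres://user:password@sql.cloud.postgresml.org/your_pgml_database \
    -c "SELECT pgml.embed('intfloat/e5-small-v2', 'PostgresML keeps the ML next to your data');"
```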
@@ -0,0 +1,75 @@

# COPY

Data that changes infrequently can be easily imported into PostgresML (and any other Postgres database) using `COPY`. All you have to do is export your data as a file, create a table in Postgres to store it, and import it using the command line (or your IDE of choice).

## Getting started

We'll be using CSV as our data format of choice. CSV is a supported mechanism for data transport in pretty much every database and system in existence, so you won't have any trouble finding the CSV export functionality in your current data store.

Let's use a simple CSV file with 3 columns as an example:

| Column           | Data type | Example data |
|------------------|-----------|--------------|
| name             | text      | John         |
| age              | integer   | 30           |
| is\_paying\_user | boolean   | true         |

### Export data

If you're using a Postgres database already, you can export any table as CSV with just one command:

```bash
psql \
    postgres://user:password@your-production-db.amazonaws.com \
    -c "\copy (SELECT * FROM users) TO '~/users.csv' CSV HEADER"
```

If you're using another data store, it will almost always provide a CSV export functionality.

### Create table in PostgresML

Create a table in PostgresML with the correct schema:

{% tabs %}
{% tab title="SQL" %}

```postgresql
CREATE TABLE users(
    name TEXT,
    age INTEGER,
    is_paying_user BOOLEAN
);
```

{% endtab %}
{% tab title="Output" %}

```
CREATE TABLE
```

{% endtab %}
{% endtabs %}

Data types should roughly match what you have in your CSV file. If a data type is not known, you can always use `TEXT` and figure it out later with a few queries. Postgres also supports converting data types, as long as the values are formatted correctly.
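For example, if `age` arrived as `TEXT`, it can be converted in place once the values are clean. This is a sketch reusing the table and column names from the example above and the same placeholder connection string:

```bash
# Rewrite the column as INTEGER, casting each existing value.
psql \
    postgres://user:password@sql.cloud.postgresml.org/your_pgml_database \
    -c "ALTER TABLE users ALTER COLUMN age TYPE INTEGER USING age::INTEGER"
```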
### Import data

Once you have a table and your data exported as CSV, importing it can also be done with just one command:

```bash
psql \
    postgres://user:password@sql.cloud.postgresml.org/your_pgml_database \
    -c "\copy your_table FROM '~/your_table.csv' CSV HEADER"
```

We took our export command and changed `TO` to `FROM`, and that's it. Make sure you're connecting to your PostgresML database when importing data.

## Refresh data

If your data has changed, repeat this process. To avoid duplicate entries in your table, you can truncate (or delete) all rows beforehand:

```sql
TRUNCATE your_table;
```
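If you want the refresh to be atomic, `psql` can wrap both steps in a single transaction so the truncate and the import either both succeed or both roll back. A sketch using the same placeholder connection string:

```bash
# Truncate and re-import in one transaction; readers never see a half-empty table.
psql \
    postgres://user:password@sql.cloud.postgresml.org/your_pgml_database \
    --single-transaction \
    -c "TRUNCATE your_table" \
    -c "\copy your_table FROM '~/your_table.csv' CSV HEADER"
```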
This file was deleted.

> This is the Cloud, as opposed to Vector, section.