diff --git a/.cursor b/.cursor index 80f2889..735d2bf 160000 --- a/.cursor +++ b/.cursor @@ -1 +1 @@ -Subproject commit 80f2889769e32f560029ac5ed62baca45b67fcbd +Subproject commit 735d2bfaffdf25528dd53cc1ccc782bd4979852c diff --git a/.gitignore b/.gitignore index 3e3d452..ced3b08 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,14 @@ pids *.seed *.pid.lock +# Python artifacts +__pycache__/ +*.py[cod] + +# Python virtual environments +.venv/ +venv/ + # Node artifacts node_modules/ cli/node_modules/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c112146..8237a03 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,5 +1,25 @@ stages: - test + - publish + +reporter:tests: + stage: test + image: python:3.11-bullseye + variables: + GIT_STRATEGY: fetch + PIP_DISABLE_PIP_VERSION_CHECK: "1" + PIP_NO_CACHE_DIR: "1" + before_script: + - python --version + - pip install --upgrade pip + - apt-get update + - apt-get install -y --no-install-recommends postgresql postgresql-client && rm -rf /var/lib/apt/lists/* + - pip install -r reporter/requirements-dev.txt + script: + - chown -R postgres:postgres "$CI_PROJECT_DIR" + - su - postgres -c "cd \"$CI_PROJECT_DIR\" && python -m pytest --run-integration tests/reporter" + rules: + - if: '$CI_COMMIT_BRANCH' cli:smoke:test: stage: test @@ -27,6 +47,7 @@ cli:e2e:dind: variables: DOCKER_HOST: tcp://docker:2375 DOCKER_TLS_CERTDIR: "" + DOCKER_API_VERSION: "1.43" GIT_STRATEGY: fetch before_script: - apk add --no-cache bash curl git coreutils docker-cli docker-compose openssl @@ -75,7 +96,7 @@ cli:node:smoke: - npm install -g ./cli - echo "prefix=$(npm config get prefix)" && echo "PATH=$PATH" - command -v postgres-ai && postgres-ai --help - - command -v pgai && pgai --help + - command -v postgresai && postgresai --help - rm -f .pgwatch-config - node ./cli/dist/bin/postgres-ai.js add-key "test_key_1234567890" - node ./cli/dist/bin/postgres-ai.js show-key | grep -E "\*{2,}|[0-9]{4}$" @@ -89,6 +110,104 @@ cli:node:smoke: rules: - if: '$CI_COMMIT_BRANCH' +cli:node:tests: + stage: test + image: node:20-bullseye + variables: + GIT_STRATEGY: fetch + NPM_CONFIG_AUDIT: "false" + NPM_CONFIG_FUND: "false" + before_script: + - corepack enable || true + - apt-get update + - apt-get install -y --no-install-recommends postgresql postgresql-client && rm -rf /var/lib/apt/lists/* + # initdb refuses to run as root; run CLI tests as an unprivileged user + - useradd -m -s /bin/bash pgtest || true + - chown -R pgtest:pgtest "$CI_PROJECT_DIR" + - su - pgtest -c "cd \"$CI_PROJECT_DIR\" && node -v && npm -v && npm --prefix cli ci" + script: + - su - pgtest -c "cd \"$CI_PROJECT_DIR\" && npm --prefix cli test" + rules: + - if: '$CI_COMMIT_BRANCH' + +cli:npm:publish: + stage: publish + image: node:20-bullseye + variables: + GIT_STRATEGY: fetch + NPM_CONFIG_AUDIT: "false" + NPM_CONFIG_FUND: "false" + before_script: + - corepack enable || true + - node -v && npm -v + script: + - | + set -euo pipefail + : "${NPM_TOKEN:?NPM_TOKEN is required to publish}" + + # Supported tag formats (examples): + # - v0.14.0-dev.1 + # - 0.14.0-dev.1 + # - 0.14-dev.1 (normalized to 0.14.0-dev.1) + # - 0.14.0-rc.1 + # - v0.14.0 (stable, published to latest) + RAW_TAG="${CI_COMMIT_TAG:-${CI_COMMIT_REF_NAME:-}}" + if [ -z "$RAW_TAG" ]; then + echo "CI_COMMIT_TAG is empty" + exit 1 + fi + + TAG_VERSION="$RAW_TAG" + TAG_VERSION="${TAG_VERSION#v}" + + # Normalize 0.14-dev.1 -> 0.14.0-dev.1 (also for beta/rc) + if printf '%s' "$TAG_VERSION" | grep -Eq '^[0-9]+\.[0-9]+-(dev|beta|rc)\.[0-9]+$'; then + 
TAG_VERSION="$(printf '%s' "$TAG_VERSION" | sed -E 's/^([0-9]+\.[0-9]+)-((dev|beta|rc)\.[0-9]+)$/\1.0-\2/')" + fi + + DIST_TAG="" + if printf '%s' "$TAG_VERSION" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+-(dev|beta|rc)\.[0-9]+$'; then + DIST_TAG="$(printf '%s' "$TAG_VERSION" | sed -E 's/^.*-(dev|beta|rc)\.[0-9]+$/\1/')" + elif printf '%s' "$TAG_VERSION" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+$'; then + DIST_TAG="latest" + fi + + if [ "$DIST_TAG" != "dev" ] && [ "$DIST_TAG" != "beta" ] && [ "$DIST_TAG" != "rc" ] && [ "$DIST_TAG" != "latest" ]; then + echo "Unsupported tag/version: $TAG_VERSION (expected X.Y.Z, X.Y.Z-dev.N, X.Y.Z-beta.N, or X.Y.Z-rc.N)" + exit 1 + fi + + # Configure npm auth without committing credentials + printf "//registry.npmjs.org/:_authToken=%s\n" "$NPM_TOKEN" > ~/.npmrc + + echo "Publishing postgresai@${TAG_VERSION} to dist-tag ${DIST_TAG}" + cd cli + npm ci --no-audit --no-fund + # Make the git tag the source of truth without committing version bumps. + npm version --no-git-tag-version "$TAG_VERSION" + npm run build + npm publish --tag "$DIST_TAG" --access public + + echo "Publishing pgai@${TAG_VERSION} (wrapper) to dist-tag ${DIST_TAG}" + cd ../pgai + # Update version + dependency so `npx pgai@` pulls the matching postgresai version. + npm pkg set "dependencies.postgresai=$TAG_VERSION" + npm version --no-git-tag-version "$TAG_VERSION" + # No lockfile: use npm install here. + npm install --no-audit --no-fund + npm publish --tag "$DIST_TAG" --access public + after_script: + - rm -f ~/.npmrc + rules: + # prereleases (also allow patchless X.Y-dev.N, normalized in script) + - if: '$CI_COMMIT_TAG =~ /^v?\d+\.\d+(\.\d+)?-(dev|beta|rc)\.\d+$/' + # stable releases + - if: '$CI_COMMIT_TAG =~ /^v?\d+\.\d+\.\d+$/' + # GitLab "Run pipeline" UI ("web" pipelines) often doesn't populate tag vars consistently. + # Keep this job available manually so the pipeline is never empty; the script validates the ref name. + - if: '$CI_PIPELINE_SOURCE == "web"' + when: manual + cli:node:e2e:dind: stage: test image: node:20-alpine @@ -98,6 +217,7 @@ cli:node:e2e:dind: variables: DOCKER_HOST: tcp://docker:2375 DOCKER_TLS_CERTDIR: "" + DOCKER_API_VERSION: "1.43" GIT_STRATEGY: fetch before_script: - corepack enable || true @@ -120,6 +240,7 @@ cli:node:full:dind: variables: DOCKER_HOST: tcp://docker:2375 DOCKER_TLS_CERTDIR: "" + DOCKER_API_VERSION: "1.43" GIT_STRATEGY: fetch before_script: - corepack enable || true diff --git a/README.md b/README.md index 5ab8bc2..0e49b7a 100644 --- a/README.md +++ b/README.md @@ -86,55 +86,84 @@ Failure to secure these ports may expose sensitive database information! 
## πŸš€ Quick start -Create a new DB user in the database to be monitored (skip this if you want to just check out `postgres_ai` monitoring with a synthetic `demo` database): -```sql --- Create a user for postgres_ai monitoring -begin; -create user postgres_ai_mon with password ''; - -grant connect on database to postgres_ai_mon; - -grant pg_monitor to postgres_ai_mon; -grant select on pg_index to postgres_ai_mon; - --- Create a public view for pg_statistic access (optional, for bloat analysis) -create view public.pg_statistic as -select - n.nspname as schemaname, - c.relname as tablename, - a.attname, - s.stanullfrac as null_frac, - s.stawidth as avg_width, - false as inherited -from pg_statistic s -join pg_class c on c.oid = s.starelid -join pg_namespace n on n.oid = c.relnamespace -join pg_attribute a on a.attrelid = s.starelid and a.attnum = s.staattnum -where a.attnum > 0 and not a.attisdropped; - -grant select on public.pg_statistic to postgres_ai_mon; -alter user postgres_ai_mon set search_path = "$user", public, pg_catalog; -commit; +Create a database user for monitoring (skip this if you want to just check out `postgres_ai` monitoring with a synthetic `demo` database). + +Use the CLI to create/update the monitoring role and grant all required permissions (idempotent): + +```bash +# Connect as an admin/superuser and apply required permissions. +# Admin password comes from PGPASSWORD (libpq standard) unless you pass --admin-password. +# +# Monitoring password: +# - by default, postgresai generates a strong password automatically +# - it is printed only in interactive (TTY) mode, or if you opt in via --print-password +PGPASSWORD='...' npx postgresai init postgresql://admin@host:5432/dbname +``` + +Optional permissions (RDS/self-managed extras) are enabled by default. To skip them: + +```bash +PGPASSWORD='...' npx postgresai init postgresql://admin@host:5432/dbname --skip-optional-permissions +``` + +Verify everything is in place (no changes): + +```bash +PGPASSWORD='...' npx postgresai init postgresql://admin@host:5432/dbname --verify +``` + +If you want to reset the monitoring password only (no other changes), you can rely on auto-generation: + +```bash +PGPASSWORD='...' npx postgresai init postgresql://admin@host:5432/dbname --reset-password ``` -### Optional permissions to analyze risks of certain performance cliffs +By default, `postgresai init` auto-generates a strong password (see above). -For RDS Postgres and Aurora: +If you want to set a specific password instead: -```sql -create extension if not exists rds_tools; -grant execute on function rds_tools.pg_ls_multixactdir() to postgres_ai_mon; +```bash +PGPASSWORD='...' npx postgresai init postgresql://admin@host:5432/dbname --reset-password --password 'new_password' ``` -For self-managed Postgres: +If you want to see what will be executed first, use `--print-sql` (prints the SQL plan and exits; passwords redacted by default). This can be done without a DB connection: + +```bash +npx postgresai init --print-sql +``` + +Optionally, to render the plan for a specific database and/or show the password literal: + +```bash +# Pick database (default is PGDATABASE or "postgres"): +npx postgresai init --print-sql -d dbname + +# Provide an explicit monitoring password (still redacted unless you opt in): +npx postgresai init --print-sql -d dbname --password '...' 
-```sql -grant execute on function pg_stat_file(text) to postgres_ai_mon; -grant execute on function pg_stat_file(text, boolean) to postgres_ai_mon; -grant execute on function pg_ls_dir(text) to postgres_ai_mon; -grant execute on function pg_ls_dir(text, boolean, boolean) to postgres_ai_mon; +# Dangerous: print secrets in the SQL output: +npx postgresai init --print-sql -d dbname --password '...' --show-secrets ``` +### Troubleshooting + +**Permission denied errors** + +If you see errors like `permission denied` / `insufficient_privilege` / code `42501`, you are not connected with enough privileges to create roles, grant permissions, or create extensions/views. + +- **How to fix**: + - Connect as a **superuser**, or a role with **CREATEROLE** and sufficient **GRANT/DDL** privileges + - On RDS/Aurora: use a user with the `rds_superuser` role (typically `postgres`, the most highly privileged user on RDS for PostgreSQL) + - On Cloud SQL: use a user with the `cloudsqlsuperuser` role (often `postgres`) + - On Supabase: use the `postgres` user (default administrator with elevated privileges for role/permission management) + - On managed providers: use the provider’s **admin** role/user + +- **Review SQL before running** (audit-friendly): + + ```bash + npx postgresai init --print-sql -d mydb --password '...' --show-secrets + ``` + **One command setup:** ```bash @@ -202,6 +231,52 @@ Get a complete monitoring setup with demo data in under 2 minutes. ./postgres_ai health ``` +## πŸ“‹ Checkup reports + +postgres_ai monitoring generates automated health check reports based on [postgres-checkup](https://gitlab.com/postgres-ai/postgres-checkup). Each report has a unique check ID and title: + +### A. General / Infrastructural +| Check ID | Title | +|----------|-------| +| A001 | System information | +| A002 | Version information | +| A003 | Postgres settings | +| A004 | Cluster information | +| A005 | Extensions | +| A006 | Postgres setting deviations | +| A007 | Altered settings | +| A008 | Disk usage and file system type | + +### D. Monitoring / Troubleshooting +| Check ID | Title | +|----------|-------| +| D004 | pg_stat_statements and pg_stat_kcache settings | + +### F. Autovacuum, Bloat +| Check ID | Title | +|----------|-------| +| F001 | Autovacuum: current settings | +| F004 | Autovacuum: heap bloat (estimated) | +| F005 | Autovacuum: index bloat (estimated) | + +### G. Performance / Connections / Memory-related settings +| Check ID | Title | +|----------|-------| +| G001 | Memory-related settings | + +### H. Index analysis +| Check ID | Title | +|----------|-------| +| H001 | Invalid indexes | +| H002 | Unused indexes | +| H004 | Redundant indexes | + +### K. SQL query analysis +| Check ID | Title | +|----------|-------| +| K001 | Globally aggregated query metrics | +| K003 | Top-50 queries by total_time | + ## 🌐 Access points After running quickstart: @@ -229,12 +304,12 @@ node ./cli/bin/postgres-ai.js --help npm --prefix cli install --no-audit --no-fund npm link ./cli postgres-ai --help -pgai --help +postgresai --help # or install globally after publish (planned) # npm i -g @postgresai/cli # postgres-ai --help -# pgai --help +# postgresai --help ``` ## πŸ”‘ PostgresAI access token @@ -248,6 +323,54 @@ Get your access token at [PostgresAI](https://postgres.ai) for automated report - Query plan analysis and automated recommendations - Enhanced AI integration capabilities +## πŸ§ͺ Testing + +Python-based report generation lives under `reporter/` and now ships with a pytest suite. 
+ +### Installation + +Install dev dependencies (includes `pytest`, `pytest-postgresql`, `psycopg`, etc.): +```bash +python3 -m pip install -r reporter/requirements-dev.txt +``` + +### Running Tests + +#### Unit Tests Only (Fast, No External Services Required) + +Run only unit tests with mocked Prometheus interactions: +```bash +pytest tests/reporter +``` + +This automatically skips integration tests. Or run specific test files: +```bash +pytest tests/reporter/test_generators_unit.py -v +pytest tests/reporter/test_formatters.py -v +``` + +#### All Tests: Unit + Integration (Requires PostgreSQL) + +Run the complete test suite (both unit and integration tests): +```bash +pytest tests/reporter --run-integration +``` + +Integration tests create a temporary PostgreSQL instance automatically and require PostgreSQL binaries (`initdb`, `postgres`) on your PATH. No manual database setup or environment variables are required - the tests create and destroy their own temporary PostgreSQL instances. + +**Summary:** +- `pytest tests/reporter` β†’ **Unit tests only** (integration tests skipped) +- `pytest tests/reporter --run-integration` β†’ **Both unit and integration tests** + +### Test Coverage + +Generate coverage report: +```bash +pytest tests/reporter -m unit --cov=reporter --cov-report=html +``` + +View the coverage report by opening `htmlcov/index.html` in your browser. + ## 🀝 Contributing We welcome contributions from Postgres experts! Please check our [GitLab repository](https://gitlab.com/postgres-ai/postgres_ai) for: diff --git a/cli/README.md b/cli/README.md index c1eeb72..49cc78e 100644 --- a/cli/README.md +++ b/cli/README.md @@ -10,11 +10,13 @@ Command-line interface for PostgresAI monitoring and database management. npm install -g postgresai ``` -Or install the latest alpha release explicitly: +Or install the latest beta release explicitly: ```bash -npm install -g postgresai@alpha +npm install -g postgresai@beta ``` +Note: in this repository, `cli/package.json` uses a placeholder version (`0.0.0-dev.0`). The real published version is set by the git tag in CI when publishing to npm. + ### From Homebrew (macOS) ```bash @@ -31,7 +33,70 @@ The CLI provides three command aliases: ```bash postgres-ai --help postgresai --help -pgai --help # short alias +``` + +You can also run it without installing via `npx`: + +```bash +npx postgresai --help +``` + +## init (create monitoring user in Postgres) + +This command creates (or updates) the `postgres_ai_mon` user and grants the permissions described in the root `README.md` (it is idempotent). + +Run without installing (positional connection string): + +```bash +npx postgresai init postgresql://admin@host:5432/dbname +``` + +It also accepts libpq β€œconninfo” syntax: + +```bash +npx postgresai init "dbname=dbname host=host user=admin" +``` + +And psql-like options: + +```bash +npx postgresai init -h host -p 5432 -U admin -d dbname +``` + +Password input options (in priority order): +- `--password ` +- `PGAI_MON_PASSWORD` environment variable +- if not provided: a strong password is generated automatically + +By default, the generated password is printed **only in interactive (TTY) mode**. In non-interactive mode, you must either provide the password explicitly, or opt-in to printing it: +- `--print-password` (dangerous in CI logs) + +Optional permissions (RDS/self-managed extras from the root `README.md`) are enabled by default. 
To skip them: + +```bash +npx postgresai init postgresql://admin@host:5432/dbname --skip-optional-permissions +``` + +### Print SQL / dry run + +To see what SQL would be executed (passwords redacted by default): + +```bash +npx postgresai init postgresql://admin@host:5432/dbname --print-sql +``` + +### Verify and password reset + +Verify that everything is configured as expected (no changes): + +```bash +npx postgresai init postgresql://admin@host:5432/dbname --verify +``` + +Reset monitoring user password only (no other changes): + +```bash +npx postgresai init postgresql://admin@host:5432/dbname --reset-password --password 'new_password' ``` ## Quick start @@ -40,7 +105,7 @@ pgai --help # short alias Authenticate via browser to obtain API key: ```bash -pgai auth +postgresai auth ``` This will: @@ -122,7 +187,7 @@ postgres-ai mon shell # Open shell to monitoring servic ### MCP server (`mcp` group) ```bash -pgai mcp start # Start MCP stdio server exposing tools +postgresai mcp start # Start MCP stdio server exposing tools ``` Cursor configuration example (Settings β†’ MCP): @@ -131,7 +196,7 @@ Cursor configuration example (Settings β†’ MCP): { "mcpServers": { "PostgresAI": { - "command": "pgai", + "command": "postgresai", "args": ["mcp", "start"], "env": { "PGAI_API_BASE_URL": "https://postgres.ai/api/general/" @@ -142,16 +207,16 @@ Cursor configuration example (Settings β†’ MCP): ``` Tools exposed: -- list_issues: returns the same JSON as `pgai issues list`. +- list_issues: returns the same JSON as `postgresai issues list`. - view_issue: view a single issue with its comments (args: { issue_id, debug? }) - post_issue_comment: post a comment (args: { issue_id, content, parent_comment_id?, debug? }) ### Issues management (`issues` group) ```bash -pgai issues list # List issues (shows: id, title, status, created_at) -pgai issues view # View issue details and comments -pgai issues post_comment # Post a comment to an issue +postgresai issues list # List issues (shows: id, title, status, created_at) +postgresai issues view # View issue details and comments +postgresai issues post_comment # Post a comment to an issue # Options: # --parent Parent comment ID (for replies) # --debug Enable debug output @@ -165,13 +230,13 @@ By default, issues commands print human-friendly YAML when writing to a terminal - Use `--json` to force JSON output: ```bash -pgai issues list --json | jq '.[] | {id, title}' +postgresai issues list --json | jq '.[] | {id, title}' ``` - Rely on auto-detection: when stdout is not a TTY (e.g., piped or redirected), output is JSON automatically: ```bash -pgai issues view > issue.json +postgresai issues view > issue.json ``` #### Grafana management @@ -235,7 +300,7 @@ Linux/macOS (bash/zsh): ```bash export PGAI_API_BASE_URL=https://v2.postgres.ai/api/general/ export PGAI_UI_BASE_URL=https://console-dev.postgres.ai -pgai auth --debug +postgresai auth --debug ``` Windows PowerShell: @@ -243,13 +308,13 @@ Windows PowerShell: ```powershell $env:PGAI_API_BASE_URL = "https://v2.postgres.ai/api/general/" $env:PGAI_UI_BASE_URL = "https://console-dev.postgres.ai" -pgai auth --debug +postgresai auth --debug ``` Via CLI options (overrides env): ```bash -pgai auth --debug \ +postgresai auth --debug \ --api-base-url https://v2.postgres.ai/api/general/ \ --ui-base-url https://console-dev.postgres.ai ``` diff --git a/cli/bin/postgres-ai.ts b/cli/bin/postgres-ai.ts index a408d50..500212d 100644 --- a/cli/bin/postgres-ai.ts +++ b/cli/bin/postgres-ai.ts @@ -12,9 +12,11 @@ import { promisify } from 
"util"; import * as readline from "readline"; import * as http from "https"; import { URL } from "url"; +import { Client } from "pg"; import { startMcpServer } from "../lib/mcp-server"; import { fetchIssues, fetchIssueComments, createIssueComment, fetchIssue } from "../lib/issues"; import { resolveBaseUrls } from "../lib/util"; +import { applyInitPlan, buildInitPlan, DEFAULT_MONITORING_USER, redactPasswordsInSql, resolveAdminConnection, resolveMonitoringPassword, verifyInitSetup } from "../lib/init"; const execPromise = promisify(exec); const execFilePromise = promisify(execFile); @@ -116,6 +118,337 @@ program "UI base URL for browser routes (overrides PGAI_UI_BASE_URL)" ); +program + .command("init [conn]") + .description("Create a monitoring user and grant all required permissions (idempotent)") + .option("--db-url ", "PostgreSQL connection URL (admin) to run the setup against (deprecated; pass it as positional arg)") + .option("-h, --host ", "PostgreSQL host (psql-like)") + .option("-p, --port ", "PostgreSQL port (psql-like)") + .option("-U, --username ", "PostgreSQL user (psql-like)") + .option("-d, --dbname ", "PostgreSQL database name (psql-like)") + .option("--admin-password ", "Admin connection password (otherwise uses PGPASSWORD if set)") + .option("--monitoring-user ", "Monitoring role name to create/update", DEFAULT_MONITORING_USER) + .option("--password ", "Monitoring role password (overrides PGAI_MON_PASSWORD)") + .option("--skip-optional-permissions", "Skip optional permissions (RDS/self-managed extras)", false) + .option("--verify", "Verify that monitoring role/permissions are in place (no changes)", false) + .option("--reset-password", "Reset monitoring role password only (no other changes)", false) + .option("--print-sql", "Print SQL plan and exit (no changes applied)", false) + .option("--show-secrets", "When printing SQL, do not redact secrets (DANGEROUS)", false) + .option("--print-password", "Print generated monitoring password (DANGEROUS in CI logs)", false) + .addHelpText( + "after", + [ + "", + "Examples:", + " postgresai init postgresql://admin@host:5432/dbname", + " postgresai init \"dbname=dbname host=host user=admin\"", + " postgresai init -h host -p 5432 -U admin -d dbname", + "", + "Admin password:", + " --admin-password or PGPASSWORD=... (libpq standard)", + "", + "Monitoring password:", + " --password or PGAI_MON_PASSWORD=... (otherwise auto-generated)", + " If auto-generated, it is printed only on TTY by default.", + " To print it in non-interactive mode: --print-password", + "", + "Environment variables (libpq standard):", + " PGHOST, PGPORT, PGUSER, PGDATABASE β€” connection defaults", + " PGPASSWORD β€” admin password", + " PGAI_MON_PASSWORD β€” monitoring password", + "", + "Inspect SQL without applying changes:", + " postgresai init --print-sql", + "", + "Verify setup (no changes):", + " postgresai init --verify", + "", + "Reset monitoring password only:", + " postgresai init --reset-password --password '...'", + "", + "Offline SQL plan (no DB connection):", + " postgresai init --print-sql -d dbname --password '...' 
--show-secrets", + ].join("\n") + ) + .action(async (conn: string | undefined, opts: { + dbUrl?: string; + host?: string; + port?: string; + username?: string; + dbname?: string; + adminPassword?: string; + monitoringUser: string; + password?: string; + skipOptionalPermissions?: boolean; + verify?: boolean; + resetPassword?: boolean; + printSql?: boolean; + showSecrets?: boolean; + printPassword?: boolean; + }, cmd: Command) => { + if (opts.verify && opts.resetPassword) { + console.error("βœ— Provide only one of --verify or --reset-password"); + process.exitCode = 1; + return; + } + if (opts.verify && opts.printSql) { + console.error("βœ— --verify cannot be combined with --print-sql"); + process.exitCode = 1; + return; + } + + const shouldPrintSql = !!opts.printSql; + const shouldRedactSecrets = !opts.showSecrets; + const redactPasswords = (sql: string): string => { + if (!shouldRedactSecrets) return sql; + // Replace PASSWORD '' (handles doubled quotes inside). + return redactPasswordsInSql(sql); + }; + + // Offline mode: allow printing SQL without providing/using an admin connection. + // Useful for audits/reviews; caller can provide -d/PGDATABASE and an explicit monitoring password. + if (!conn && !opts.dbUrl && !opts.host && !opts.port && !opts.username && !opts.adminPassword) { + if (shouldPrintSql) { + const database = (opts.dbname ?? process.env.PGDATABASE ?? "postgres").trim(); + const includeOptionalPermissions = !opts.skipOptionalPermissions; + + // Use explicit password/env if provided; otherwise use a placeholder (will be redacted unless --show-secrets). + const monPassword = + (opts.password ?? process.env.PGAI_MON_PASSWORD ?? "CHANGE_ME").toString(); + + const plan = await buildInitPlan({ + database, + monitoringUser: opts.monitoringUser, + monitoringPassword: monPassword, + includeOptionalPermissions, + }); + + console.log("\n--- SQL plan (offline; not connected) ---"); + console.log(`-- database: ${database}`); + console.log(`-- monitoring user: ${opts.monitoringUser}`); + console.log(`-- optional permissions: ${includeOptionalPermissions ? "enabled" : "skipped"}`); + for (const step of plan.steps) { + console.log(`\n-- ${step.name}${step.optional ? " (optional)" : ""}`); + console.log(redactPasswords(step.sql)); + } + console.log("\n--- end SQL plan ---\n"); + if (shouldRedactSecrets) { + console.log("Note: passwords are redacted in the printed SQL (use --show-secrets to print them)."); + } + return; + } + } + + let adminConn; + try { + adminConn = resolveAdminConnection({ + conn, + dbUrlFlag: opts.dbUrl, + // Allow libpq standard env vars as implicit defaults (common UX). + host: opts.host ?? process.env.PGHOST, + port: opts.port ?? process.env.PGPORT, + username: opts.username ?? process.env.PGUSER, + dbname: opts.dbname ?? process.env.PGDATABASE, + adminPassword: opts.adminPassword, + envPassword: process.env.PGPASSWORD, + }); + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + console.error(`Error: init: ${msg}`); + // When connection details are missing, show full init help (options + examples). + if (typeof msg === "string" && msg.startsWith("Connection is required.")) { + console.error(""); + cmd.outputHelp({ error: true }); + } + process.exitCode = 1; + return; + } + + const includeOptionalPermissions = !opts.skipOptionalPermissions; + + console.log(`Connecting to: ${adminConn.display}`); + console.log(`Monitoring user: ${opts.monitoringUser}`); + console.log(`Optional permissions: ${includeOptionalPermissions ? 
"enabled" : "skipped"}`); + + // Use native pg client instead of requiring psql to be installed + let client: Client | undefined; + try { + client = new Client(adminConn.clientConfig); + await client.connect(); + + const dbRes = await client.query("select current_database() as db"); + const database = dbRes.rows?.[0]?.db; + if (typeof database !== "string" || !database) { + throw new Error("Failed to resolve current database name"); + } + + if (opts.verify) { + const v = await verifyInitSetup({ + client, + database, + monitoringUser: opts.monitoringUser, + includeOptionalPermissions, + }); + if (v.ok) { + console.log("βœ“ init verify: OK"); + if (v.missingOptional.length > 0) { + console.log("⚠ Optional items missing:"); + for (const m of v.missingOptional) console.log(`- ${m}`); + } + return; + } + console.error("βœ— init verify failed: missing required items"); + for (const m of v.missingRequired) console.error(`- ${m}`); + if (v.missingOptional.length > 0) { + console.error("Optional items missing:"); + for (const m of v.missingOptional) console.error(`- ${m}`); + } + process.exitCode = 1; + return; + } + + let monPassword: string; + try { + const resolved = await resolveMonitoringPassword({ + passwordFlag: opts.password, + passwordEnv: process.env.PGAI_MON_PASSWORD, + monitoringUser: opts.monitoringUser, + }); + monPassword = resolved.password; + if (resolved.generated) { + const canPrint = process.stdout.isTTY || !!opts.printPassword; + if (canPrint) { + // Print secrets to stderr to reduce the chance they end up in piped stdout logs. + const shellSafe = monPassword.replace(/'/g, "'\\''"); + console.error(""); + console.error(`Generated monitoring password for ${opts.monitoringUser} (copy/paste):`); + // Quote for shell copy/paste safety. + console.error(`PGAI_MON_PASSWORD='${shellSafe}'`); + console.error(""); + console.log("Store it securely (or rerun with --password / PGAI_MON_PASSWORD to set your own)."); + } else { + console.error( + [ + `βœ— Monitoring password was auto-generated for ${opts.monitoringUser} but not printed in non-interactive mode.`, + "", + "Provide it explicitly:", + " --password or PGAI_MON_PASSWORD=...", + "", + "Or (NOT recommended) print the generated password:", + " --print-password", + ].join("\n") + ); + process.exitCode = 1; + return; + } + } + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + console.error(`βœ— ${msg}`); + process.exitCode = 1; + return; + } + + const plan = await buildInitPlan({ + database, + monitoringUser: opts.monitoringUser, + monitoringPassword: monPassword, + includeOptionalPermissions, + }); + + const effectivePlan = opts.resetPassword + ? { ...plan, steps: plan.steps.filter((s) => s.name === "01.role") } + : plan; + + if (shouldPrintSql) { + console.log("\n--- SQL plan ---"); + for (const step of effectivePlan.steps) { + console.log(`\n-- ${step.name}${step.optional ? " (optional)" : ""}`); + console.log(redactPasswords(step.sql)); + } + console.log("\n--- end SQL plan ---\n"); + if (shouldRedactSecrets) { + console.log("Note: passwords are redacted in the printed SQL (use --show-secrets to print them)."); + } + return; + } + + const { applied, skippedOptional } = await applyInitPlan({ client, plan: effectivePlan }); + + console.log(opts.resetPassword ? 
"βœ“ init password reset completed" : "βœ“ init completed"); + if (skippedOptional.length > 0) { + console.log("⚠ Some optional steps were skipped (not supported or insufficient privileges):"); + for (const s of skippedOptional) console.log(`- ${s}`); + } + // Keep output compact but still useful + if (process.stdout.isTTY) { + console.log(`Applied ${applied.length} steps`); + } + } catch (error) { + const errAny = error as any; + let message = ""; + if (error instanceof Error && error.message) { + message = error.message; + } else if (errAny && typeof errAny === "object" && typeof errAny.message === "string" && errAny.message) { + message = errAny.message; + } else { + message = String(error); + } + if (!message || message === "[object Object]") { + message = "Unknown error"; + } + console.error(`Error: init: ${message}`); + // If this was a plan step failure, surface the step name explicitly to help users diagnose quickly. + const stepMatch = + typeof message === "string" ? message.match(/Failed at step "([^"]+)":/i) : null; + const failedStep = stepMatch?.[1]; + if (failedStep) { + console.error(` Step: ${failedStep}`); + } + if (errAny && typeof errAny === "object") { + if (typeof errAny.code === "string" && errAny.code) { + console.error(` Code: ${errAny.code}`); + } + if (typeof errAny.detail === "string" && errAny.detail) { + console.error(` Detail: ${errAny.detail}`); + } + if (typeof errAny.hint === "string" && errAny.hint) { + console.error(` Hint: ${errAny.hint}`); + } + } + if (errAny && typeof errAny === "object" && typeof errAny.code === "string") { + if (errAny.code === "42501") { + if (failedStep === "01.role") { + console.error(" Context: role creation/update requires CREATEROLE or superuser"); + } else if (failedStep === "02.permissions") { + console.error(" Context: grants/view/search_path require sufficient GRANT/DDL privileges"); + } + console.error(" Fix: connect as a superuser (or a role with CREATEROLE and sufficient GRANT privileges)"); + console.error(" Fix: on managed Postgres, use the provider's admin/master user"); + console.error(" Tip: run with --print-sql to review the exact SQL plan"); + } + if (errAny.code === "ECONNREFUSED") { + console.error(" Hint: check host/port and ensure Postgres is reachable from this machine"); + } + if (errAny.code === "ENOTFOUND") { + console.error(" Hint: DNS resolution failed; double-check the host name"); + } + if (errAny.code === "ETIMEDOUT") { + console.error(" Hint: connection timed out; check network/firewall rules"); + } + } + process.exitCode = 1; + } finally { + if (client) { + try { + await client.end(); + } catch { + // ignore + } + } + } + }); + /** * Stub function for not implemented commands */ diff --git a/cli/lib/init.ts b/cli/lib/init.ts new file mode 100644 index 0000000..66e4be5 --- /dev/null +++ b/cli/lib/init.ts @@ -0,0 +1,629 @@ +import * as readline from "readline"; +import { randomBytes } from "crypto"; +import { URL } from "url"; +import type { ConnectionOptions as TlsConnectionOptions } from "tls"; +import type { Client as PgClient } from "pg"; +import * as fs from "fs"; +import * as path from "path"; + +export const DEFAULT_MONITORING_USER = "postgres_ai_mon"; + +export type PgClientConfig = { + connectionString?: string; + host?: string; + port?: number; + user?: string; + password?: string; + database?: string; + ssl?: boolean | TlsConnectionOptions; +}; + +export type AdminConnection = { + clientConfig: PgClientConfig; + display: string; +}; + +export type InitStep = { + name: string; + sql: 
string; + params?: unknown[]; + optional?: boolean; +}; + +export type InitPlan = { + monitoringUser: string; + database: string; + steps: InitStep[]; +}; + +function packageRootDirFromCompiled(): string { + // dist/lib/init.js -> /dist/lib ; package root is ../.. + return path.resolve(__dirname, "..", ".."); +} + +function sqlDir(): string { + return path.join(packageRootDirFromCompiled(), "sql"); +} + +function loadSqlTemplate(filename: string): string { + const p = path.join(sqlDir(), filename); + return fs.readFileSync(p, "utf8"); +} + +function applyTemplate(sql: string, vars: Record): string { + return sql.replace(/\{\{([A-Z0-9_]+)\}\}/g, (_, key) => { + const v = vars[key]; + if (v === undefined) throw new Error(`Missing SQL template var: ${key}`); + return v; + }); +} + +function quoteIdent(ident: string): string { + // Always quote. Escape embedded quotes by doubling. + if (ident.includes("\0")) { + throw new Error("Identifier cannot contain null bytes"); + } + return `"${ident.replace(/"/g, "\"\"")}"`; +} + +function quoteLiteral(value: string): string { + // Single-quote and escape embedded quotes by doubling. + // This is used where Postgres grammar requires a literal (e.g., CREATE/ALTER ROLE PASSWORD). + if (value.includes("\0")) { + throw new Error("Literal cannot contain null bytes"); + } + return `'${value.replace(/'/g, "''")}'`; +} + +export function redactPasswordsInSql(sql: string): string { + // Replace PASSWORD '' (handles doubled quotes inside). + return sql.replace(/password\s+'(?:''|[^'])*'/gi, "password ''"); +} + +export function maskConnectionString(dbUrl: string): string { + // Hide password if present (postgresql://user:pass@host/db). + try { + const u = new URL(dbUrl); + if (u.password) u.password = "*****"; + return u.toString(); + } catch { + return dbUrl.replace(/\/\/([^:/?#]+):([^@/?#]+)@/g, "//$1:*****@"); + } +} + +function isLikelyUri(value: string): boolean { + return /^postgres(ql)?:\/\//i.test(value.trim()); +} + +function tokenizeConninfo(input: string): string[] { + const s = input.trim(); + const tokens: string[] = []; + let i = 0; + + const isSpace = (ch: string) => ch === " " || ch === "\t" || ch === "\n" || ch === "\r"; + + while (i < s.length) { + while (i < s.length && isSpace(s[i]!)) i++; + if (i >= s.length) break; + + let tok = ""; + let inSingle = false; + while (i < s.length) { + const ch = s[i]!; + if (!inSingle && isSpace(ch)) break; + + if (ch === "'" && !inSingle) { + inSingle = true; + i++; + continue; + } + if (ch === "'" && inSingle) { + inSingle = false; + i++; + continue; + } + + if (ch === "\\" && i + 1 < s.length) { + tok += s[i + 1]!; + i += 2; + continue; + } + + tok += ch; + i++; + } + + tokens.push(tok); + while (i < s.length && isSpace(s[i]!)) i++; + } + + return tokens; +} + +export function parseLibpqConninfo(input: string): PgClientConfig { + const tokens = tokenizeConninfo(input); + const cfg: PgClientConfig = {}; + + for (const t of tokens) { + const eq = t.indexOf("="); + if (eq <= 0) continue; + const key = t.slice(0, eq).trim(); + const rawVal = t.slice(eq + 1); + const val = rawVal.trim(); + if (!key) continue; + + switch (key) { + case "host": + cfg.host = val; + break; + case "port": { + const p = Number(val); + if (Number.isFinite(p)) cfg.port = p; + break; + } + case "user": + cfg.user = val; + break; + case "password": + cfg.password = val; + break; + case "dbname": + case "database": + cfg.database = val; + break; + // ignore everything else (sslmode, options, application_name, etc.) 
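+      // Illustrative (hypothetical values): "host=db.example.com port=5433 user=admin dbname=app password='p w'"
+      //   -> { host: "db.example.com", port: 5433, user: "admin", database: "app", password: "p w" }
+      // Unrecognized keys such as sslmode or application_name are dropped without error.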
+ default: + break; + } + } + + return cfg; +} + +export function describePgConfig(cfg: PgClientConfig): string { + if (cfg.connectionString) return maskConnectionString(cfg.connectionString); + const user = cfg.user ? cfg.user : ""; + const host = cfg.host ? cfg.host : ""; + const port = cfg.port ? String(cfg.port) : ""; + const db = cfg.database ? cfg.database : ""; + // Don't include password + return `postgresql://${user}:*****@${host}:${port}/${db}`; +} + +export function resolveAdminConnection(opts: { + conn?: string; + dbUrlFlag?: string; + host?: string; + port?: string | number; + username?: string; + dbname?: string; + adminPassword?: string; + envPassword?: string; +}): AdminConnection { + const conn = (opts.conn || "").trim(); + const dbUrlFlag = (opts.dbUrlFlag || "").trim(); + + // NOTE: passwords alone (PGPASSWORD / --admin-password) do NOT constitute a connection. + // We require at least some connection addressing (host/port/user/db) if no positional arg / --db-url is provided. + const hasConnDetails = !!(opts.host || opts.port || opts.username || opts.dbname); + + if (conn && dbUrlFlag) { + throw new Error("Provide either positional connection string or --db-url, not both"); + } + + if (conn || dbUrlFlag) { + const v = conn || dbUrlFlag; + if (isLikelyUri(v)) { + return { clientConfig: { connectionString: v }, display: maskConnectionString(v) }; + } + // libpq conninfo (dbname=... host=...) + const cfg = parseLibpqConninfo(v); + if (opts.envPassword && !cfg.password) cfg.password = opts.envPassword; + return { clientConfig: cfg, display: describePgConfig(cfg) }; + } + + if (!hasConnDetails) { + throw new Error( + [ + "Connection is required.", + "", + "Examples:", + " postgresai init postgresql://admin@host:5432/dbname", + " postgresai init \"dbname=dbname host=host user=admin\"", + " postgresai init -h host -p 5432 -U admin -d dbname", + "", + "Admin password:", + " --admin-password (or set PGPASSWORD)", + ].join("\n") + ); + } + + const cfg: PgClientConfig = {}; + if (opts.host) cfg.host = opts.host; + if (opts.port !== undefined && opts.port !== "") { + const p = Number(opts.port); + if (!Number.isFinite(p) || !Number.isInteger(p) || p <= 0 || p > 65535) { + throw new Error(`Invalid port value: ${String(opts.port)}`); + } + cfg.port = p; + } + if (opts.username) cfg.user = opts.username; + if (opts.dbname) cfg.database = opts.dbname; + if (opts.adminPassword) cfg.password = opts.adminPassword; + if (opts.envPassword && !cfg.password) cfg.password = opts.envPassword; + return { clientConfig: cfg, display: describePgConfig(cfg) }; +} + +export async function promptHidden(prompt: string): Promise { + // Implement our own hidden input reader so: + // - prompt text is visible + // - only user input is masked + // - we don't rely on non-public readline internals + if (!process.stdin.isTTY) { + throw new Error("Cannot prompt for password in non-interactive mode"); + } + + const stdin = process.stdin; + const stdout = process.stdout as NodeJS.WriteStream; + + stdout.write(prompt); + + return await new Promise((resolve, reject) => { + let value = ""; + + const cleanup = () => { + try { + stdin.setRawMode(false); + } catch { + // ignore + } + stdin.removeListener("keypress", onKeypress); + }; + + const onKeypress = (str: string, key: any) => { + if (key?.ctrl && key?.name === "c") { + stdout.write("\n"); + cleanup(); + reject(new Error("Cancelled")); + return; + } + + if (key?.name === "return" || key?.name === "enter") { + stdout.write("\n"); + cleanup(); + resolve(value); + 
return; + } + + if (key?.name === "backspace") { + if (value.length > 0) { + value = value.slice(0, -1); + // Erase one mask char. + stdout.write("\b \b"); + } + return; + } + + // Ignore other control keys. + if (key?.ctrl || key?.meta) return; + + if (typeof str === "string" && str.length > 0) { + value += str; + stdout.write("*"); + } + }; + + readline.emitKeypressEvents(stdin); + stdin.setRawMode(true); + stdin.on("keypress", onKeypress); + stdin.resume(); + }); +} + +function generateMonitoringPassword(): string { + // URL-safe and easy to copy/paste; 24 bytes => 32 base64url chars (no padding). + // Note: randomBytes() throws on failure; we add a tiny sanity check for unexpected output. + const password = randomBytes(24).toString("base64url"); + if (password.length < 30) { + throw new Error("Password generation failed: unexpected output length"); + } + return password; +} + +export async function resolveMonitoringPassword(opts: { + passwordFlag?: string; + passwordEnv?: string; + prompt?: (prompt: string) => Promise; + monitoringUser: string; +}): Promise<{ password: string; generated: boolean }> { + const fromFlag = (opts.passwordFlag || "").trim(); + if (fromFlag) return { password: fromFlag, generated: false }; + + const fromEnv = (opts.passwordEnv || "").trim(); + if (fromEnv) return { password: fromEnv, generated: false }; + + // Default: auto-generate (safer than prompting; works in non-interactive mode). + return { password: generateMonitoringPassword(), generated: true }; +} + +export async function buildInitPlan(params: { + database: string; + monitoringUser?: string; + monitoringPassword: string; + includeOptionalPermissions: boolean; +}): Promise { + // NOTE: kept async for API stability / potential future async template loading. + const monitoringUser = params.monitoringUser || DEFAULT_MONITORING_USER; + const database = params.database; + + const qRole = quoteIdent(monitoringUser); + const qDb = quoteIdent(database); + const qPw = quoteLiteral(params.monitoringPassword); + const qRoleNameLit = quoteLiteral(monitoringUser); + + const steps: InitStep[] = []; + + const vars: Record = { + ROLE_IDENT: qRole, + DB_IDENT: qDb, + }; + + // Role creation/update is done in one template file. + // Always use a single DO block to avoid race conditions between "role exists?" checks and CREATE USER. + // We: + // - create role if missing (and handle duplicate_object in case another session created it concurrently), + // - then ALTER ROLE to ensure the password is set to the desired value. 
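+  // The unconditional ALTER also keeps this step idempotent: re-running init (or
+  // `init --reset-password`, which applies only this step) simply sets the password again.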
+ const roleStmt = `do $$ begin + if not exists (select 1 from pg_catalog.pg_roles where rolname = ${qRoleNameLit}) then + begin + create user ${qRole} with password ${qPw}; + exception when duplicate_object then + null; + end; + end if; + alter user ${qRole} with password ${qPw}; +end $$;`; + + const roleSql = applyTemplate(loadSqlTemplate("01.role.sql"), { ...vars, ROLE_STMT: roleStmt }); + steps.push({ name: "01.role", sql: roleSql }); + + steps.push({ + name: "02.permissions", + sql: applyTemplate(loadSqlTemplate("02.permissions.sql"), vars), + }); + + if (params.includeOptionalPermissions) { + steps.push( + { + name: "03.optional_rds", + sql: applyTemplate(loadSqlTemplate("03.optional_rds.sql"), vars), + optional: true, + }, + { + name: "04.optional_self_managed", + sql: applyTemplate(loadSqlTemplate("04.optional_self_managed.sql"), vars), + optional: true, + } + ); + } + + return { monitoringUser, database, steps }; +} + +export async function applyInitPlan(params: { + client: PgClient; + plan: InitPlan; + verbose?: boolean; +}): Promise<{ applied: string[]; skippedOptional: string[] }> { + const applied: string[] = []; + const skippedOptional: string[] = []; + + // Apply non-optional steps in a single transaction. + await params.client.query("begin;"); + try { + for (const step of params.plan.steps.filter((s) => !s.optional)) { + try { + await params.client.query(step.sql, step.params as any); + applied.push(step.name); + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + const errAny = e as any; + const wrapped: any = new Error(`Failed at step "${step.name}": ${msg}`); + // Preserve useful Postgres error fields so callers can provide better hints / diagnostics. + const pgErrorFields = [ + "code", + "detail", + "hint", + "position", + "internalPosition", + "internalQuery", + "where", + "schema", + "table", + "column", + "dataType", + "constraint", + "file", + "line", + "routine", + ] as const; + if (errAny && typeof errAny === "object") { + for (const field of pgErrorFields) { + if (errAny[field] !== undefined) wrapped[field] = errAny[field]; + } + } + if (e instanceof Error && e.stack) { + wrapped.stack = e.stack; + } + throw wrapped; + } + } + await params.client.query("commit;"); + } catch (e) { + // Rollback errors should never mask the original failure. + try { + await params.client.query("rollback;"); + } catch { + // ignore + } + throw e; + } + + // Apply optional steps outside of the transaction so a failure doesn't abort everything. + for (const step of params.plan.steps.filter((s) => s.optional)) { + try { + // Run each optional step in its own mini-transaction to avoid partial application. + await params.client.query("begin;"); + try { + await params.client.query(step.sql, step.params as any); + await params.client.query("commit;"); + applied.push(step.name); + } catch { + try { + await params.client.query("rollback;"); + } catch { + // ignore rollback errors + } + skippedOptional.push(step.name); + // best-effort: ignore + } + } catch { + // If we can't even begin/commit, treat as skipped. + skippedOptional.push(step.name); + } + } + + return { applied, skippedOptional }; +} + +export type VerifyInitResult = { + ok: boolean; + missingRequired: string[]; + missingOptional: string[]; +}; + +export async function verifyInitSetup(params: { + client: PgClient; + database: string; + monitoringUser: string; + includeOptionalPermissions: boolean; +}): Promise { + // Use a repeatable-read snapshot so all checks see a consistent view. 
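+  // The transaction is read-only and is always rolled back in the finally block below,
+  // so --verify never modifies anything on the server.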
+ await params.client.query("begin isolation level repeatable read;"); + try { + const missingRequired: string[] = []; + const missingOptional: string[] = []; + + const role = params.monitoringUser; + const db = params.database; + + const roleRes = await params.client.query("select 1 from pg_catalog.pg_roles where rolname = $1", [role]); + const roleExists = (roleRes.rowCount ?? 0) > 0; + if (!roleExists) { + missingRequired.push(`role "${role}" does not exist`); + // If role is missing, other checks will error or be meaningless. + return { ok: false, missingRequired, missingOptional }; + } + + const connectRes = await params.client.query( + "select has_database_privilege($1, $2, 'CONNECT') as ok", + [role, db] + ); + if (!connectRes.rows?.[0]?.ok) { + missingRequired.push(`CONNECT on database "${db}"`); + } + + const pgMonitorRes = await params.client.query( + "select pg_has_role($1, 'pg_monitor', 'member') as ok", + [role] + ); + if (!pgMonitorRes.rows?.[0]?.ok) { + missingRequired.push("membership in role pg_monitor"); + } + + const pgIndexRes = await params.client.query( + "select has_table_privilege($1, 'pg_catalog.pg_index', 'SELECT') as ok", + [role] + ); + if (!pgIndexRes.rows?.[0]?.ok) { + missingRequired.push("SELECT on pg_catalog.pg_index"); + } + + const viewExistsRes = await params.client.query("select to_regclass('public.pg_statistic') is not null as ok"); + if (!viewExistsRes.rows?.[0]?.ok) { + missingRequired.push("view public.pg_statistic exists"); + } else { + const viewPrivRes = await params.client.query( + "select has_table_privilege($1, 'public.pg_statistic', 'SELECT') as ok", + [role] + ); + if (!viewPrivRes.rows?.[0]?.ok) { + missingRequired.push("SELECT on view public.pg_statistic"); + } + } + + const schemaUsageRes = await params.client.query( + "select has_schema_privilege($1, 'public', 'USAGE') as ok", + [role] + ); + if (!schemaUsageRes.rows?.[0]?.ok) { + missingRequired.push("USAGE on schema public"); + } + + const rolcfgRes = await params.client.query("select rolconfig from pg_catalog.pg_roles where rolname = $1", [role]); + const rolconfig = rolcfgRes.rows?.[0]?.rolconfig; + const spLine = Array.isArray(rolconfig) ? rolconfig.find((v: any) => String(v).startsWith("search_path=")) : undefined; + if (typeof spLine !== "string" || !spLine) { + missingRequired.push("role search_path is set"); + } else { + // We accept any ordering as long as public and pg_catalog are included. + const sp = spLine.toLowerCase(); + if (!sp.includes("public") || !sp.includes("pg_catalog")) { + missingRequired.push("role search_path includes public and pg_catalog"); + } + } + + if (params.includeOptionalPermissions) { + // Optional RDS/Aurora extras + { + const extRes = await params.client.query("select 1 from pg_extension where extname = 'rds_tools'"); + if ((extRes.rowCount ?? 
0) === 0) { + missingOptional.push("extension rds_tools"); + } else { + const fnRes = await params.client.query( + "select has_function_privilege($1, 'rds_tools.pg_ls_multixactdir()', 'EXECUTE') as ok", + [role] + ); + if (!fnRes.rows?.[0]?.ok) { + missingOptional.push("EXECUTE on rds_tools.pg_ls_multixactdir()"); + } + } + } + + // Optional self-managed extras + const optionalFns = [ + "pg_catalog.pg_stat_file(text)", + "pg_catalog.pg_stat_file(text, boolean)", + "pg_catalog.pg_ls_dir(text)", + "pg_catalog.pg_ls_dir(text, boolean, boolean)", + ]; + for (const fn of optionalFns) { + const fnRes = await params.client.query("select has_function_privilege($1, $2, 'EXECUTE') as ok", [role, fn]); + if (!fnRes.rows?.[0]?.ok) { + missingOptional.push(`EXECUTE on ${fn}`); + } + } + } + + return { ok: missingRequired.length === 0, missingRequired, missingOptional }; + } finally { + // Read-only: rollback to release snapshot; do not mask original errors. + try { + await params.client.query("rollback;"); + } catch { + // ignore + } + } +} + + diff --git a/cli/lib/issues.ts b/cli/lib/issues.ts index 4231778..356885c 100644 --- a/cli/lib/issues.ts +++ b/cli/lib/issues.ts @@ -2,13 +2,62 @@ import * as https from "https"; import { URL } from "url"; import { maskSecret, normalizeBaseUrl } from "./util"; +export interface IssueActionItem { + id: string; + issue_id: string; + title: string; + description: string | null; + severity: number; + is_done: boolean; + done_by: number | null; + done_at: string | null; + created_at: string; + updated_at: string; +} + +export interface Issue { + id: string; + title: string; + description: string | null; + created_at: string; + updated_at: string; + status: number; + url_main: string | null; + urls_extra: string[] | null; + data: unknown | null; + author_id: number; + org_id: number; + project_id: number | null; + is_ai_generated: boolean; + assigned_to: number[] | null; + labels: string[] | null; + is_edited: boolean; + author_display_name: string; + comment_count: number; + action_items: IssueActionItem[]; +} + +export interface IssueComment { + id: string; + issue_id: string; + author_id: number; + parent_comment_id: string | null; + content: string; + created_at: string; + updated_at: string; + data: unknown | null; +} + +export type IssueListItem = Pick; + +export type IssueDetail = Pick; export interface FetchIssuesParams { apiKey: string; apiBaseUrl: string; debug?: boolean; } -export async function fetchIssues(params: FetchIssuesParams): Promise { +export async function fetchIssues(params: FetchIssuesParams): Promise { const { apiKey, apiBaseUrl, debug } = params; if (!apiKey) { throw new Error("API key is required"); @@ -16,6 +65,7 @@ export async function fetchIssues(params: FetchIssuesParams): Promise { const base = normalizeBaseUrl(apiBaseUrl); const url = new URL(`${base}/issues`); + url.searchParams.set("select", "id,title,status,created_at"); const headers: Record = { "access-token": apiKey, @@ -54,10 +104,10 @@ export async function fetchIssues(params: FetchIssuesParams): Promise { } if (res.statusCode && res.statusCode >= 200 && res.statusCode < 300) { try { - const parsed = JSON.parse(data); + const parsed = JSON.parse(data) as IssueListItem[]; resolve(parsed); } catch { - resolve(data); + reject(new Error(`Failed to parse issues response: ${data}`)); } } else { let errMsg = `Failed to fetch issues: HTTP ${res.statusCode}`; @@ -88,7 +138,7 @@ export interface FetchIssueCommentsParams { debug?: boolean; } -export async function 
fetchIssueComments(params: FetchIssueCommentsParams): Promise { +export async function fetchIssueComments(params: FetchIssueCommentsParams): Promise { const { apiKey, apiBaseUrl, issueId, debug } = params; if (!apiKey) { throw new Error("API key is required"); @@ -137,10 +187,10 @@ export async function fetchIssueComments(params: FetchIssueCommentsParams): Prom } if (res.statusCode && res.statusCode >= 200 && res.statusCode < 300) { try { - const parsed = JSON.parse(data); + const parsed = JSON.parse(data) as IssueComment[]; resolve(parsed); } catch { - resolve(data); + reject(new Error(`Failed to parse issue comments response: ${data}`)); } } else { let errMsg = `Failed to fetch issue comments: HTTP ${res.statusCode}`; @@ -170,7 +220,7 @@ export interface FetchIssueParams { debug?: boolean; } -export async function fetchIssue(params: FetchIssueParams): Promise { +export async function fetchIssue(params: FetchIssueParams): Promise { const { apiKey, apiBaseUrl, issueId, debug } = params; if (!apiKey) { throw new Error("API key is required"); @@ -181,6 +231,7 @@ export async function fetchIssue(params: FetchIssueParams): Promise { const base = normalizeBaseUrl(apiBaseUrl); const url = new URL(`${base}/issues`); + url.searchParams.set("select", "id,title,description,status,created_at,author_display_name"); url.searchParams.set("id", `eq.${issueId}`); url.searchParams.set("limit", "1"); @@ -223,12 +274,12 @@ export async function fetchIssue(params: FetchIssueParams): Promise { try { const parsed = JSON.parse(data); if (Array.isArray(parsed)) { - resolve(parsed[0] ?? null); + resolve((parsed[0] as IssueDetail) ?? null); } else { - resolve(parsed); + resolve(parsed as IssueDetail); } } catch { - resolve(data); + reject(new Error(`Failed to parse issue response: ${data}`)); } } else { let errMsg = `Failed to fetch issue: HTTP ${res.statusCode}`; @@ -260,7 +311,7 @@ export interface CreateIssueCommentParams { debug?: boolean; } -export async function createIssueComment(params: CreateIssueCommentParams): Promise { +export async function createIssueComment(params: CreateIssueCommentParams): Promise { const { apiKey, apiBaseUrl, issueId, content, parentCommentId, debug } = params; if (!apiKey) { throw new Error("API key is required"); @@ -324,10 +375,10 @@ export async function createIssueComment(params: CreateIssueCommentParams): Prom } if (res.statusCode && res.statusCode >= 200 && res.statusCode < 300) { try { - const parsed = JSON.parse(data); + const parsed = JSON.parse(data) as IssueComment; resolve(parsed); } catch { - resolve(data); + reject(new Error(`Failed to parse create comment response: ${data}`)); } } else { let errMsg = `Failed to create issue comment: HTTP ${res.statusCode}`; diff --git a/cli/lib/mcp-server.ts b/cli/lib/mcp-server.ts index ede1f17..5532532 100644 --- a/cli/lib/mcp-server.ts +++ b/cli/lib/mcp-server.ts @@ -109,16 +109,8 @@ export async function startMcpServer(rootOpts?: RootOptsLike, extra?: { debug?: try { if (toolName === "list_issues") { - const result = await fetchIssues({ apiKey, apiBaseUrl, debug }); - const trimmed = Array.isArray(result) - ? 
(result as any[]).map((r) => ({ - id: (r as any).id, - title: (r as any).title, - status: (r as any).status, - created_at: (r as any).created_at, - })) - : result; - return { content: [{ type: "text", text: JSON.stringify(trimmed, null, 2) }] }; + const issues = await fetchIssues({ apiKey, apiBaseUrl, debug }); + return { content: [{ type: "text", text: JSON.stringify(issues, null, 2) }] }; } if (toolName === "view_issue") { diff --git a/cli/package-lock.json b/cli/package-lock.json index 7466c86..c998fea 100644 --- a/cli/package-lock.json +++ b/cli/package-lock.json @@ -1,12 +1,12 @@ { "name": "postgresai", - "version": "0.12.0-beta.6", + "version": "0.0.0-dev.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "postgresai", - "version": "0.12.0-beta.6", + "version": "0.0.0-dev.0", "license": "Apache-2.0", "dependencies": { "@modelcontextprotocol/sdk": "^1.20.2", diff --git a/cli/package.json b/cli/package.json index 261555a..835b703 100644 --- a/cli/package.json +++ b/cli/package.json @@ -1,6 +1,6 @@ { "name": "postgresai", - "version": "0.12.0-beta.6", + "version": "0.0.0-dev.0", "description": "postgres_ai CLI (Node.js)", "license": "Apache-2.0", "private": false, @@ -25,7 +25,8 @@ "build": "tsc", "prepare": "npm run build", "start": "node ./dist/bin/postgres-ai.js --help", - "dev": "tsc --watch" + "dev": "tsc --watch", + "test": "npm run build && node --test test/*.test.cjs" }, "dependencies": { "@modelcontextprotocol/sdk": "^1.20.2", diff --git a/cli/sql/01.role.sql b/cli/sql/01.role.sql new file mode 100644 index 0000000..9b99910 --- /dev/null +++ b/cli/sql/01.role.sql @@ -0,0 +1,16 @@ +-- Role creation / password update (template-filled by cli/lib/init.ts) +-- +-- Always uses a race-safe pattern (create if missing, then always alter to set the password): +-- do $$ begin +-- if not exists (select 1 from pg_catalog.pg_roles where rolname = '...') then +-- begin +-- create user "..." with password '...'; +-- exception when duplicate_object then +-- null; +-- end; +-- end if; +-- alter user "..." 
with password '...'; +-- end $$; +{{ROLE_STMT}} + + diff --git a/cli/sql/02.permissions.sql b/cli/sql/02.permissions.sql new file mode 100644 index 0000000..7e760c1 --- /dev/null +++ b/cli/sql/02.permissions.sql @@ -0,0 +1,33 @@ +-- Required permissions for postgres_ai monitoring user (template-filled by cli/lib/init.ts) + +-- Allow connect +grant connect on database {{DB_IDENT}} to {{ROLE_IDENT}}; + +-- Standard monitoring privileges +grant pg_monitor to {{ROLE_IDENT}}; +grant select on pg_catalog.pg_index to {{ROLE_IDENT}}; + +-- Optional, for bloat analysis: expose pg_statistic via a view +create or replace view public.pg_statistic as +select + n.nspname as schemaname, + c.relname as tablename, + a.attname, + s.stanullfrac as null_frac, + s.stawidth as avg_width, + false as inherited +from pg_catalog.pg_statistic s +join pg_catalog.pg_class c on c.oid = s.starelid +join pg_catalog.pg_namespace n on n.oid = c.relnamespace +join pg_catalog.pg_attribute a on a.attrelid = s.starelid and a.attnum = s.staattnum +where a.attnum > 0 and not a.attisdropped; + +grant select on public.pg_statistic to {{ROLE_IDENT}}; + +-- Hardened clusters sometimes revoke PUBLIC on schema public +grant usage on schema public to {{ROLE_IDENT}}; + +-- Keep search_path predictable +alter user {{ROLE_IDENT}} set search_path = "$user", public, pg_catalog; + + diff --git a/cli/sql/03.optional_rds.sql b/cli/sql/03.optional_rds.sql new file mode 100644 index 0000000..44036dd --- /dev/null +++ b/cli/sql/03.optional_rds.sql @@ -0,0 +1,6 @@ +-- Optional permissions for RDS Postgres / Aurora (best effort) + +create extension if not exists rds_tools; +grant execute on function rds_tools.pg_ls_multixactdir() to {{ROLE_IDENT}}; + + diff --git a/cli/sql/04.optional_self_managed.sql b/cli/sql/04.optional_self_managed.sql new file mode 100644 index 0000000..590c3b3 --- /dev/null +++ b/cli/sql/04.optional_self_managed.sql @@ -0,0 +1,8 @@ +-- Optional permissions for self-managed Postgres (best effort) + +grant execute on function pg_catalog.pg_stat_file(text) to {{ROLE_IDENT}}; +grant execute on function pg_catalog.pg_stat_file(text, boolean) to {{ROLE_IDENT}}; +grant execute on function pg_catalog.pg_ls_dir(text) to {{ROLE_IDENT}}; +grant execute on function pg_catalog.pg_ls_dir(text, boolean, boolean) to {{ROLE_IDENT}}; + + diff --git a/cli/test/init.integration.test.cjs b/cli/test/init.integration.test.cjs new file mode 100644 index 0000000..d465fcc --- /dev/null +++ b/cli/test/init.integration.test.cjs @@ -0,0 +1,382 @@ +const test = require("node:test"); +const assert = require("node:assert/strict"); +const fs = require("node:fs"); +const os = require("node:os"); +const path = require("node:path"); +const net = require("node:net"); +const { spawn, spawnSync } = require("node:child_process"); + +function sqlLiteral(value) { + return `'${String(value).replace(/'/g, "''")}'`; +} + +function findOnPath(cmd) { + const which = spawnSync("sh", ["-lc", `command -v ${cmd}`], { encoding: "utf8" }); + if (which.status === 0) return String(which.stdout || "").trim(); + return null; +} + +function findPgBin(cmd) { + const p = findOnPath(cmd); + if (p) return p; + + // Debian/Ubuntu (GitLab CI node:*-bullseye images): binaries usually live here. + // We avoid filesystem globbing in JS and just ask the shell. 
+ const probe = spawnSync( + "sh", + [ + "-lc", + `ls -1 /usr/lib/postgresql/*/bin/${cmd} 2>/dev/null | head -n 1 || true`, + ], + { encoding: "utf8" } + ); + const out = String(probe.stdout || "").trim(); + if (out) return out; + + return null; +} + +function havePostgresBinaries() { + return !!(findPgBin("initdb") && findPgBin("postgres")); +} + +async function getFreePort() { + return await new Promise((resolve, reject) => { + const srv = net.createServer(); + srv.listen(0, "127.0.0.1", () => { + const addr = srv.address(); + srv.close((err) => { + if (err) return reject(err); + resolve(addr.port); + }); + }); + srv.on("error", reject); + }); +} + +async function waitFor(fn, { timeoutMs = 10000, intervalMs = 100 } = {}) { + const start = Date.now(); + // eslint-disable-next-line no-constant-condition + while (true) { + try { + return await fn(); + } catch (e) { + if (Date.now() - start > timeoutMs) throw e; + await new Promise((r) => setTimeout(r, intervalMs)); + } + } +} + +async function withTempPostgres(t) { + const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), "postgresai-init-")); + const dataDir = path.join(tmpRoot, "data"); + const socketDir = path.join(tmpRoot, "sock"); + fs.mkdirSync(socketDir, { recursive: true }); + + const initdb = findPgBin("initdb"); + const postgresBin = findPgBin("postgres"); + assert.ok(initdb && postgresBin, "PostgreSQL binaries not found (need initdb and postgres)"); + + const init = spawnSync(initdb, ["-D", dataDir, "-U", "postgres", "-A", "trust"], { + encoding: "utf8", + }); + assert.equal(init.status, 0, init.stderr || init.stdout); + + // Configure: local socket trust, TCP scram. + const hbaPath = path.join(dataDir, "pg_hba.conf"); + fs.appendFileSync( + hbaPath, + "\n# Added by postgresai init integration tests\nlocal all all trust\nhost all all 127.0.0.1/32 scram-sha-256\nhost all all ::1/128 scram-sha-256\n", + "utf8" + ); + + const port = await getFreePort(); + + let postgresProc; + try { + postgresProc = spawn( + postgresBin, + ["-D", dataDir, "-k", socketDir, "-h", "127.0.0.1", "-p", String(port)], + { + stdio: ["ignore", "pipe", "pipe"], + } + ); + + // Register cleanup immediately so failures below don't leave a running postgres and hang CI. + t.after(async () => { + postgresProc.kill("SIGTERM"); + try { + await waitFor( + async () => { + if (postgresProc.exitCode === null) throw new Error("still running"); + }, + { timeoutMs: 5000, intervalMs: 100 } + ); + } catch { + postgresProc.kill("SIGKILL"); + } + fs.rmSync(tmpRoot, { recursive: true, force: true }); + }); + } catch (e) { + // If anything goes wrong before cleanup is registered, ensure we don't leak a running postgres. + try { + if (postgresProc) postgresProc.kill("SIGKILL"); + } catch { + // ignore + } + fs.rmSync(tmpRoot, { recursive: true, force: true }); + throw e; + } + + const { Client } = require("pg"); + + const connectLocal = async (database = "postgres") => { + // IMPORTANT: must match the port Postgres is started with; otherwise pg defaults to 5432 and the socket path won't exist. 
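+      // (node-postgres treats an absolute `host` path as a Unix-socket directory and connects to
+      // `${host}/.s.PGSQL.${port}`, so the port is part of the socket file name, not only of TCP.)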
+ const c = new Client({ host: socketDir, port, user: "postgres", database }); + await c.connect(); + return c; + }; + + await waitFor(async () => { + const c = await connectLocal(); + await c.end(); + }); + + const postgresPassword = "postgrespw"; + { + const c = await connectLocal(); + await c.query(`alter user postgres password ${sqlLiteral(postgresPassword)};`); + await c.query("create database testdb"); + await c.end(); + } + + const adminUri = `postgresql://postgres:${postgresPassword}@127.0.0.1:${port}/testdb`; + return { port, socketDir, adminUri, postgresPassword }; +} + +async function runCliInit(args, env = {}) { + const node = process.execPath; + const cliPath = path.resolve(__dirname, "..", "dist", "bin", "postgres-ai.js"); + const res = spawnSync(node, [cliPath, "init", ...args], { + encoding: "utf8", + env: { ...process.env, ...env }, + }); + return res; +} + +test( + "integration: init supports URI / conninfo / psql-like connection styles", + { skip: !havePostgresBinaries() }, + async (t) => { + const pg = await withTempPostgres(t); + + // 1) positional URI + { + const r = await runCliInit([pg.adminUri, "--password", "monpw", "--skip-optional-permissions"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + } + + // 2) conninfo + { + const conninfo = `dbname=testdb host=127.0.0.1 port=${pg.port} user=postgres password=${pg.postgresPassword}`; + const r = await runCliInit([conninfo, "--password", "monpw2", "--skip-optional-permissions"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + } + + // 3) psql-like options (+ PGPASSWORD) + { + const r = await runCliInit( + [ + "-h", + "127.0.0.1", + "-p", + String(pg.port), + "-U", + "postgres", + "-d", + "testdb", + "--password", + "monpw3", + "--skip-optional-permissions", + ], + { PGPASSWORD: pg.postgresPassword } + ); + assert.equal(r.status, 0, r.stderr || r.stdout); + } + } +); + +test( + "integration: init requires explicit monitoring password in non-interactive mode (unless --print-password)", + { skip: !havePostgresBinaries() }, + async (t) => { + const pg = await withTempPostgres(t); + + // spawnSync captures stdout/stderr (non-TTY). We should not print a generated password unless explicitly requested. + { + const r = await runCliInit([pg.adminUri, "--skip-optional-permissions"]); + assert.notEqual(r.status, 0); + assert.match(r.stderr, /not printed in non-interactive mode/i); + assert.match(r.stderr, /--print-password/); + } + + // With explicit opt-in, it should succeed (and will print the generated password). + { + const r = await runCliInit([pg.adminUri, "--print-password", "--skip-optional-permissions"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + assert.match(r.stderr, /Generated monitoring password for postgres_ai_mon/i); + assert.match(r.stderr, /PGAI_MON_PASSWORD=/); + } + } +); + +test( + "integration: init fixes slightly-off permissions idempotently", + { skip: !havePostgresBinaries() }, + async (t) => { + const pg = await withTempPostgres(t); + const { Client } = require("pg"); + + // Create monitoring role with wrong password, no grants. + { + const c = new Client({ connectionString: pg.adminUri }); + await c.connect(); + await c.query( + "do $$ begin if not exists (select 1 from pg_roles where rolname='postgres_ai_mon') then create role postgres_ai_mon login password 'wrong'; end if; end $$;" + ); + await c.end(); + } + + // Run init (should grant everything). 
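+    // ("Everything" = the required grants from cli/sql/02.permissions.sql: CONNECT on the database,
+    // pg_monitor membership, SELECT on pg_catalog.pg_index and public.pg_statistic, and the
+    // search_path role setting; each of these is asserted below.)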
+ { + const r = await runCliInit([pg.adminUri, "--password", "correctpw", "--skip-optional-permissions"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + } + + // Verify privileges. + { + const c = new Client({ connectionString: pg.adminUri }); + await c.connect(); + const dbOk = await c.query( + "select has_database_privilege('postgres_ai_mon', current_database(), 'CONNECT') as ok" + ); + assert.equal(dbOk.rows[0].ok, true); + const roleOk = await c.query("select pg_has_role('postgres_ai_mon', 'pg_monitor', 'member') as ok"); + assert.equal(roleOk.rows[0].ok, true); + const idxOk = await c.query( + "select has_table_privilege('postgres_ai_mon', 'pg_catalog.pg_index', 'SELECT') as ok" + ); + assert.equal(idxOk.rows[0].ok, true); + const viewOk = await c.query( + "select has_table_privilege('postgres_ai_mon', 'public.pg_statistic', 'SELECT') as ok" + ); + assert.equal(viewOk.rows[0].ok, true); + const sp = await c.query("select rolconfig from pg_roles where rolname='postgres_ai_mon'"); + assert.ok(Array.isArray(sp.rows[0].rolconfig)); + assert.ok(sp.rows[0].rolconfig.some((v) => String(v).includes("search_path="))); + await c.end(); + } + + // Run init again (idempotent). + { + const r = await runCliInit([pg.adminUri, "--password", "correctpw", "--skip-optional-permissions"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + } + } +); + +test("integration: init reports nicely when lacking permissions", { skip: !havePostgresBinaries() }, async (t) => { + const pg = await withTempPostgres(t); + const { Client } = require("pg"); + + // Create limited user that can connect but cannot create roles / grant. + const limitedPw = "limitedpw"; + { + const c = new Client({ connectionString: pg.adminUri }); + await c.connect(); + await c.query(`do $$ begin + if not exists (select 1 from pg_roles where rolname='limited') then + begin + create role limited login password ${sqlLiteral(limitedPw)}; + exception when duplicate_object then + null; + end; + end if; + end $$;`); + await c.query("grant connect on database testdb to limited"); + await c.end(); + } + + const limitedUri = `postgresql://limited:${limitedPw}@127.0.0.1:${pg.port}/testdb`; + const r = await runCliInit([limitedUri, "--password", "monpw", "--skip-optional-permissions"]); + assert.notEqual(r.status, 0); + assert.match(r.stderr, /Error: init:/); + // Should include step context and hint. + assert.match(r.stderr, /Failed at step "/); + assert.match(r.stderr, /Fix: connect as a superuser/i); +}); + +test("integration: init --verify returns 0 when ok and non-zero when missing", { skip: !havePostgresBinaries() }, async (t) => { + const pg = await withTempPostgres(t); + const { Client } = require("pg"); + + // Prepare: run init + { + const r = await runCliInit([pg.adminUri, "--password", "monpw", "--skip-optional-permissions"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + } + + // Verify should pass + { + const r = await runCliInit([pg.adminUri, "--verify", "--skip-optional-permissions"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + assert.match(r.stdout, /init verify: OK/i); + } + + // Break a required privilege and ensure verify fails + { + const c = new Client({ connectionString: pg.adminUri }); + await c.connect(); + // pg_catalog tables are often readable via PUBLIC by default; revoke from PUBLIC too so the failure is deterministic. 
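+    // (has_table_privilege() also counts privileges available via PUBLIC, so both revokes are
+    // needed before --verify can report SELECT on pg_catalog.pg_index as missing.)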
+ await c.query("revoke select on pg_catalog.pg_index from public"); + await c.query("revoke select on pg_catalog.pg_index from postgres_ai_mon"); + await c.end(); + } + { + const r = await runCliInit([pg.adminUri, "--verify", "--skip-optional-permissions"]); + assert.notEqual(r.status, 0); + assert.match(r.stderr, /init verify failed/i); + assert.match(r.stderr, /pg_catalog\.pg_index/i); + } +}); + +test("integration: init --reset-password updates the monitoring role login password", { skip: !havePostgresBinaries() }, async (t) => { + const pg = await withTempPostgres(t); + const { Client } = require("pg"); + + // Initial setup with password pw1 + { + const r = await runCliInit([pg.adminUri, "--password", "pw1", "--skip-optional-permissions"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + } + + // Reset to pw2 + { + const r = await runCliInit([pg.adminUri, "--reset-password", "--password", "pw2", "--skip-optional-permissions"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + assert.match(r.stdout, /password reset/i); + } + + // Connect as monitoring user with new password should work + { + const c = new Client({ + connectionString: `postgresql://postgres_ai_mon:pw2@127.0.0.1:${pg.port}/testdb`, + }); + await c.connect(); + const ok = await c.query("select 1 as ok"); + assert.equal(ok.rows[0].ok, 1); + await c.end(); + } +}); + + diff --git a/cli/test/init.test.cjs b/cli/test/init.test.cjs new file mode 100644 index 0000000..f93354b --- /dev/null +++ b/cli/test/init.test.cjs @@ -0,0 +1,323 @@ +const test = require("node:test"); +const assert = require("node:assert/strict"); + +// These tests intentionally import the compiled JS output. +// Run via: npm --prefix cli test +const init = require("../dist/lib/init.js"); +const DEFAULT_MONITORING_USER = init.DEFAULT_MONITORING_USER; + +function runCli(args, env = {}) { + const { spawnSync } = require("node:child_process"); + const path = require("node:path"); + const node = process.execPath; + const cliPath = path.resolve(__dirname, "..", "dist", "bin", "postgres-ai.js"); + return spawnSync(node, [cliPath, ...args], { + encoding: "utf8", + env: { ...process.env, ...env }, + }); +} + +function runPgai(args, env = {}) { + const { spawnSync } = require("node:child_process"); + const path = require("node:path"); + const node = process.execPath; + const pgaiPath = path.resolve(__dirname, "..", "..", "pgai", "bin", "pgai.js"); + return spawnSync(node, [pgaiPath, ...args], { + encoding: "utf8", + env: { ...process.env, ...env }, + }); +} + +test("maskConnectionString hides password when present", () => { + const masked = init.maskConnectionString("postgresql://user:secret@localhost:5432/mydb"); + assert.match(masked, /postgresql:\/\/user:\*{5}@localhost:5432\/mydb/); + assert.doesNotMatch(masked, /secret/); +}); + +test("parseLibpqConninfo parses basic host/dbname/user/port/password", () => { + const cfg = init.parseLibpqConninfo("dbname=mydb host=localhost user=alice port=5432 password=secret"); + assert.equal(cfg.database, "mydb"); + assert.equal(cfg.host, "localhost"); + assert.equal(cfg.user, "alice"); + assert.equal(cfg.port, 5432); + assert.equal(cfg.password, "secret"); +}); + +test("parseLibpqConninfo supports quoted values", () => { + const cfg = init.parseLibpqConninfo("dbname='my db' host='local host'"); + assert.equal(cfg.database, "my db"); + assert.equal(cfg.host, "local host"); +}); + +test("buildInitPlan includes a race-safe role DO block", async () => { + const plan = await init.buildInitPlan({ + database: "mydb", + 
monitoringUser: DEFAULT_MONITORING_USER, + monitoringPassword: "pw", + includeOptionalPermissions: false, + }); + + assert.equal(plan.database, "mydb"); + const roleStep = plan.steps.find((s) => s.name === "01.role"); + assert.ok(roleStep); + assert.match(roleStep.sql, /do\s+\$\$/i); + assert.match(roleStep.sql, /create\s+user/i); + assert.match(roleStep.sql, /alter\s+user/i); + assert.ok(!plan.steps.some((s) => s.optional)); +}); + +test("buildInitPlan handles special characters in monitoring user and database identifiers", async () => { + const monitoringUser = 'user "with" quotes βœ“'; + const database = 'db name "with" quotes βœ“'; + const plan = await init.buildInitPlan({ + database, + monitoringUser, + monitoringPassword: "pw", + includeOptionalPermissions: false, + }); + + const roleStep = plan.steps.find((s) => s.name === "01.role"); + assert.ok(roleStep); + // Double quotes inside identifiers must be doubled. + assert.match(roleStep.sql, /create\s+user\s+"user ""with"" quotes βœ“"/i); + assert.match(roleStep.sql, /alter\s+user\s+"user ""with"" quotes βœ“"/i); + + const permStep = plan.steps.find((s) => s.name === "02.permissions"); + assert.ok(permStep); + assert.match(permStep.sql, /grant connect on database "db name ""with"" quotes βœ“" to "user ""with"" quotes βœ“"/i); +}); + +test("buildInitPlan keeps backslashes in passwords (no unintended escaping)", async () => { + const pw = String.raw`pw\with\backslash`; + const plan = await init.buildInitPlan({ + database: "mydb", + monitoringUser: DEFAULT_MONITORING_USER, + monitoringPassword: pw, + includeOptionalPermissions: false, + }); + const roleStep = plan.steps.find((s) => s.name === "01.role"); + assert.ok(roleStep); + assert.ok(roleStep.sql.includes(`password '${pw}'`)); +}); + +test("buildInitPlan rejects identifiers with null bytes", async () => { + await assert.rejects( + () => + init.buildInitPlan({ + database: "mydb", + monitoringUser: "bad\0user", + monitoringPassword: "pw", + includeOptionalPermissions: false, + }), + /Identifier cannot contain null bytes/ + ); +}); + +test("buildInitPlan rejects literals with null bytes", async () => { + await assert.rejects( + () => + init.buildInitPlan({ + database: "mydb", + monitoringUser: DEFAULT_MONITORING_USER, + monitoringPassword: "pw\0bad", + includeOptionalPermissions: false, + }), + /Literal cannot contain null bytes/ + ); +}); + +test("buildInitPlan inlines password safely for CREATE/ALTER ROLE grammar", async () => { + const plan = await init.buildInitPlan({ + database: "mydb", + monitoringUser: DEFAULT_MONITORING_USER, + monitoringPassword: "pa'ss", + includeOptionalPermissions: false, + }); + const step = plan.steps.find((s) => s.name === "01.role"); + assert.ok(step); + assert.match(step.sql, /password 'pa''ss'/); + assert.equal(step.params, undefined); +}); + +test("buildInitPlan includes optional steps when enabled", async () => { + const plan = await init.buildInitPlan({ + database: "mydb", + monitoringUser: DEFAULT_MONITORING_USER, + monitoringPassword: "pw", + includeOptionalPermissions: true, + }); + assert.ok(plan.steps.some((s) => s.optional)); +}); + +test("resolveAdminConnection accepts positional URI", () => { + const r = init.resolveAdminConnection({ conn: "postgresql://u:p@h:5432/d" }); + assert.ok(r.clientConfig.connectionString); + assert.doesNotMatch(r.display, /:p@/); +}); + +test("resolveAdminConnection accepts positional conninfo", () => { + const r = init.resolveAdminConnection({ conn: "dbname=mydb host=localhost user=alice" }); + 
assert.equal(r.clientConfig.database, "mydb"); + assert.equal(r.clientConfig.host, "localhost"); + assert.equal(r.clientConfig.user, "alice"); +}); + +test("resolveAdminConnection rejects invalid psql-like port", () => { + assert.throws( + () => init.resolveAdminConnection({ host: "localhost", port: "abc", username: "u", dbname: "d" }), + /Invalid port value/ + ); +}); + +test("resolveAdminConnection rejects when only PGPASSWORD is provided (no connection details)", () => { + assert.throws(() => init.resolveAdminConnection({ envPassword: "pw" }), /Connection is required/); +}); + +test("resolveAdminConnection error message includes examples", () => { + assert.throws(() => init.resolveAdminConnection({}), /Examples:/); +}); + +test("cli: init with missing connection prints init help/options", () => { + const r = runCli(["init"]); + assert.notEqual(r.status, 0); + // We should show options, not just the error message. + assert.match(r.stderr, /--print-sql/); + assert.match(r.stderr, /--monitoring-user/); +}); + +test("resolveMonitoringPassword auto-generates a strong, URL-safe password by default", async () => { + const r = await init.resolveMonitoringPassword({ monitoringUser: DEFAULT_MONITORING_USER }); + assert.equal(r.generated, true); + assert.ok(typeof r.password === "string" && r.password.length >= 30); + assert.match(r.password, /^[A-Za-z0-9_-]+$/); +}); + +test("applyInitPlan preserves Postgres error fields on step failures", async () => { + const plan = { + monitoringUser: DEFAULT_MONITORING_USER, + database: "mydb", + steps: [{ name: "01.role", sql: "select 1" }], + }; + + const pgErr = Object.assign(new Error("permission denied to create role"), { + code: "42501", + detail: "some detail", + hint: "some hint", + schema: "pg_catalog", + table: "pg_roles", + constraint: "some_constraint", + routine: "aclcheck_error", + }); + + const calls = []; + const client = { + query: async (sql) => { + calls.push(sql); + if (sql === "begin;") return { rowCount: 1 }; + if (sql === "rollback;") return { rowCount: 1 }; + if (sql === "select 1") throw pgErr; + throw new Error(`unexpected sql: ${sql}`); + }, + }; + + await assert.rejects( + () => init.applyInitPlan({ client, plan }), + (e) => { + assert.ok(e instanceof Error); + assert.match(e.message, /Failed at step "01\.role":/); + assert.equal(e.code, "42501"); + assert.equal(e.detail, "some detail"); + assert.equal(e.hint, "some hint"); + assert.equal(e.schema, "pg_catalog"); + assert.equal(e.table, "pg_roles"); + assert.equal(e.constraint, "some_constraint"); + assert.equal(e.routine, "aclcheck_error"); + return true; + } + ); + + assert.deepEqual(calls, ["begin;", "select 1", "rollback;"]); +}); + +test("verifyInitSetup runs inside a repeatable read snapshot and rolls back", async () => { + const calls = []; + const client = { + query: async (sql, params) => { + calls.push(String(sql)); + + if (String(sql).toLowerCase().startsWith("begin isolation level repeatable read")) { + return { rowCount: 1, rows: [] }; + } + if (String(sql).toLowerCase() === "rollback;") { + return { rowCount: 1, rows: [] }; + } + if (String(sql).includes("select rolconfig")) { + return { rowCount: 1, rows: [{ rolconfig: ['search_path="$user", public, pg_catalog'] }] }; + } + if (String(sql).includes("from pg_catalog.pg_roles")) { + return { rowCount: 1, rows: [] }; + } + if (String(sql).includes("has_database_privilege")) { + return { rowCount: 1, rows: [{ ok: true }] }; + } + if (String(sql).includes("pg_has_role")) { + return { rowCount: 1, rows: [{ ok: true }] }; + 
} + if (String(sql).includes("has_table_privilege") && String(sql).includes("pg_catalog.pg_index")) { + return { rowCount: 1, rows: [{ ok: true }] }; + } + if (String(sql).includes("to_regclass('public.pg_statistic')")) { + return { rowCount: 1, rows: [{ ok: true }] }; + } + if (String(sql).includes("has_table_privilege") && String(sql).includes("public.pg_statistic")) { + return { rowCount: 1, rows: [{ ok: true }] }; + } + if (String(sql).includes("has_schema_privilege")) { + return { rowCount: 1, rows: [{ ok: true }] }; + } + + throw new Error(`unexpected sql: ${sql} params=${JSON.stringify(params)}`); + }, + }; + + const r = await init.verifyInitSetup({ + client, + database: "mydb", + monitoringUser: DEFAULT_MONITORING_USER, + includeOptionalPermissions: false, + }); + assert.equal(r.ok, true); + assert.equal(r.missingRequired.length, 0); + + assert.ok(calls.length > 2); + assert.match(calls[0].toLowerCase(), /^begin isolation level repeatable read/); + assert.equal(calls[calls.length - 1].toLowerCase(), "rollback;"); +}); + +test("redactPasswordsInSql redacts password literals with embedded quotes", async () => { + const plan = await init.buildInitPlan({ + database: "mydb", + monitoringUser: DEFAULT_MONITORING_USER, + monitoringPassword: "pa'ss", + includeOptionalPermissions: false, + }); + const step = plan.steps.find((s) => s.name === "01.role"); + assert.ok(step); + const redacted = init.redactPasswordsInSql(step.sql); + assert.match(redacted, /password ''/i); +}); + +test("cli: init --print-sql works without connection (offline mode)", () => { + const r = runCli(["init", "--print-sql", "-d", "mydb", "--password", "monpw"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + assert.match(r.stdout, /SQL plan \(offline; not connected\)/); + assert.match(r.stdout, new RegExp(`grant connect on database "mydb" to "${DEFAULT_MONITORING_USER}"`, "i")); +}); + +test("pgai wrapper forwards to postgresai CLI", () => { + const r = runPgai(["--help"]); + assert.equal(r.status, 0, r.stderr || r.stdout); + assert.match(r.stdout, /postgresai|PostgresAI/i); +}); + + diff --git a/config/grafana/dashboards/Dashboard_10_Index health.json b/config/grafana/dashboards/Dashboard_10_Index health.json index b5bf58f..f2f0566 100644 --- a/config/grafana/dashboards/Dashboard_10_Index health.json +++ b/config/grafana/dashboards/Dashboard_10_Index health.json @@ -166,7 +166,7 @@ "root_selector": "", "source": "url", "type": "csv", - "url": "http://flask-pgss-api:5000/btree_bloat/csv", + "url": "http://flask-pgss-api:8000/btree_bloat/csv", "url_options": { "data": "", "method": "GET", @@ -691,13 +691,13 @@ "text": "default", "value": "default" }, - "definition": "label_values(pgwatch_settings_configured,cluster)", + "definition": "label_values(pgwatch_db_size_size_b,cluster)", "label": "Cluster name", "name": "cluster_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured,cluster)", + "query": "label_values(pgwatch_db_size_size_b,cluster)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -709,13 +709,13 @@ "text": "postgres_ai", "value": "postgres_ai" }, - "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "label": "Node name", "name": "node_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "query": 
"label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -727,13 +727,13 @@ "text": "workloaddb", "value": "workloaddb" }, - "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", "name": "db_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/config/grafana/dashboards/Dashboard_12_SLRU.json b/config/grafana/dashboards/Dashboard_12_SLRU.json index d546619..3f97a99 100644 --- a/config/grafana/dashboards/Dashboard_12_SLRU.json +++ b/config/grafana/dashboards/Dashboard_12_SLRU.json @@ -756,14 +756,14 @@ "text": "default", "value": "default" }, - "definition": "label_values(pgwatch_settings_configured,cluster)", + "definition": "label_values(pgwatch_db_size_size_b,cluster)", "description": "", "label": "Cluster", "name": "cluster_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured,cluster)", + "query": "label_values(pgwatch_db_size_size_b,cluster)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -775,13 +775,13 @@ "text": "postgres_ai", "value": "postgres_ai" }, - "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "label": "Node", "name": "node_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -793,13 +793,13 @@ "text": "workloaddb", "value": "workloaddb" }, - "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", "name": "db_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -814,7 +814,7 @@ }, "timepicker": {}, "timezone": "browser", - "title": "X - SLRU cache stats", + "title": "12. 
SLRU cache stats", "uid": "slru_stats", "version": 2 } \ No newline at end of file diff --git a/config/grafana/dashboards/Dashboard_13_Lock_waits.json b/config/grafana/dashboards/Dashboard_13_Lock_waits.json new file mode 100644 index 0000000..6831826 --- /dev/null +++ b/config/grafana/dashboards/Dashboard_13_Lock_waits.json @@ -0,0 +1,1114 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "panels": [], + "title": "Blocking overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Number of active lock conflicts (blockerβ†’blocked pairs).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 1, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "count(pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\"}) or vector(0)", + "legendFormat": "Lock conflicts", + "range": true, + "refId": "A" + } + ], + "title": "Lock conflicts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "How long sessions have been waiting for locks.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", 
+ "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 5000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "avg(pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\"})", + "legendFormat": "Average wait time", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "max(pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\"})", + "legendFormat": "Max wait time", + "range": true, + "refId": "B" + } + ], + "title": "Wait duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Distribution of blocking events by lock type.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 1, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "count by (blocked_locktype) (pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\"})", + "legendFormat": "{{blocked_locktype}}", + "range": true, + "refId": "A" + } + ], + "title": "By lock type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "How long blocking transactions have been running.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5000 + }, + { + "color": "red", + "value": 30000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "avg(pgwatch_lock_waits_blocker_tx_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\"})", + "legendFormat": "Avg blocker age", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "max(pgwatch_lock_waits_blocker_tx_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\"})", + "legendFormat": "Max blocker age", + "range": true, + "refId": "B" + } + ], + "title": "Blocker age", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Which tables have the most lock contention.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 1, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "count by (blocked_table) (pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\", blocked_table!=\"\"})", + "legendFormat": "{{blocked_table}}", + "range": true, + "refId": "A" + } + ], + "title": "By table", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 11, + "panels": [], + "title": "Blocking tree", + "type": "row" + }, + { + "datasource": { 
+ "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Current blocking relationships with process details.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 5000 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "blocked_pid" + }, + "properties": [ + { + "id": "custom.width", + "value": 90 + }, + { + "id": "displayName", + "value": "Blocked PID" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_pid" + }, + "properties": [ + { + "id": "custom.width", + "value": 90 + }, + { + "id": "displayName", + "value": "Blocker PID" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.width", + "value": 160 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + }, + { + "id": "unit", + "value": "ms" + }, + { + "id": "displayName", + "value": "Blocked ms" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_user" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + }, + { + "id": "displayName", + "value": "Blocked User" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_user" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + }, + { + "id": "displayName", + "value": "Blocker User" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_appname" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + }, + { + "id": "displayName", + "value": "Blocked App" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_appname" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + }, + { + "id": "displayName", + "value": "Blocker App" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_locktype" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + }, + { + "id": "displayName", + "value": "Blocked Lock Type" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_locktype" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + }, + { + "id": "displayName", + "value": "Blocker Lock Type" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_mode" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + }, + { + "id": "displayName", + "value": "Blocked Mode" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_mode" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + }, + { + "id": "displayName", + "value": "Blocker Mode" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_table" + }, + "properties": [ + { + "id": "custom.width", + "value": 130 + }, + { + "id": "displayName", + "value": "Blocked Table" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_table" + }, + "properties": [ + { + "id": "custom.width", + "value": 130 + }, + { + "id": "displayName", + "value": "Blocker Table" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "datname" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + }, + { + "id": "displayName", + "value": "Database" 
+ } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_query_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 180 + }, + { + "id": "displayName", + "value": "Blocked Query ID" + }, + { + "id": "links", + "value": [ + { + "title": "View query analysis", + "url": "/d/db52944d-b025-4e18-b70b-89c0af3e7e41/03-single-queryid-analysis?var-cluster_name=${cluster_name}&var-node_name=${node_name}&var-db_name=${db_name}&var-query_id=${__value.raw}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_query_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 180 + }, + { + "id": "displayName", + "value": "Blocker Query ID" + }, + { + "id": "links", + "value": [ + { + "title": "View query analysis", + "url": "/d/db52944d-b025-4e18-b70b-89c0af3e7e41/03-single-queryid-analysis?var-cluster_name=${cluster_name}&var-node_name=${node_name}&var-db_name=${db_name}&var-query_id=${__value.raw}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 16, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 4, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Blocked ms" + } + ] + }, + "pluginVersion": "10.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "exemplar": false, + "expr": "pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Blocking tree", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "__name__": true, + "cluster": true, + "dbname": true, + "env": true, + "instance": true, + "job": true, + "node_name": true, + "real_dbname": true, + "sink_type": true, + "sys_id": true + }, + "indexByName": { + "Time": 0, + "blocked_pid": 1, + "blocker_pid": 2, + "Value": 3, + "blocked_user": 4, + "blocker_user": 5, + "blocked_appname": 6, + "blocker_appname": 7, + "blocked_locktype": 8, + "blocker_locktype": 9, + "blocked_mode": 10, + "blocker_mode": 11, + "blocked_table": 12, + "blocker_table": 13, + "blocked_query_id": 14, + "blocker_query_id": 15, + "datname": 16 + }, + "renameByName": {} + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "definition": "label_values(pgwatch_db_size_size_b,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_db_size_size_b,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "node_name", + "options": [], + "query": { + "qryType": 1, + "query": 
"label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "hide": 0, + "includeAll": true, + "allValue": ".*", + "multi": false, + "name": "db_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m" + ] + }, + "timezone": "", + "title": "13. Lock contention", + "uid": "lock-contention", + "version": 1, + "weekStart": "" +} diff --git a/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json b/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json index 0fcf134..034e768 100644 --- a/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json +++ b/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json @@ -3100,13 +3100,13 @@ "text": "default", "value": "default" }, - "definition": "label_values(pgwatch_settings_configured,cluster)", + "definition": "label_values(pgwatch_db_size_size_b,cluster)", "label": "Cluster name", "name": "cluster_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured,cluster)", + "query": "label_values(pgwatch_db_size_size_b,cluster)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -3118,13 +3118,13 @@ "text": "postgres_ai", "value": "postgres_ai" }, - "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "label": "Node name", "name": "node_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -3136,13 +3136,13 @@ "text": "workloaddb", "value": "workloaddb" }, - "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", "name": "db_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json 
b/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json index 812f189..cb1a041 100644 --- a/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json +++ b/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json @@ -364,7 +364,7 @@ "root_selector": "", "source": "url", "type": "csv", - "url": "http://flask-pgss-api:5000/pgss_metrics/csv", + "url": "http://flask-pgss-api:8000/pgss_metrics/csv", "url_options": { "data": "", "method": "GET", @@ -506,7 +506,8 @@ "value": 80 } ] - } + }, + "unit": "calls/s" }, "overrides": [ { @@ -560,14 +561,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by calls (pg_stat_statements)", + "title": "Top $top_n statements by calls per second (pg_stat_statements)", "type": "timeseries" }, { @@ -679,14 +680,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_exec_time_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_exec_time_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by execution time (pg_stat_statements)", + "title": "Top $top_n statements by execution time per second (pg_stat_statements)", "type": "timeseries" }, { @@ -800,7 +801,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_exec_time_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_exec_time_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -919,14 +920,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_plan_time_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_plan_time_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by planning time (pg_stat_statements)", + "title": "Top $top_n statements by planning time per second (pg_stat_statements)", "type": "timeseries" }, { @@ -1040,7 +1041,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_plan_time_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', 
datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_plan_time_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -1106,7 +1107,8 @@ "value": 80 } ] - } + }, + "unit": "rows/s" }, "overrides": [ { @@ -1160,14 +1162,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_rows{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_rows{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by rows (pg_stat_statements)", + "title": "Top $top_n statements by rows per second (pg_stat_statements)", "type": "timeseries" }, { @@ -1280,7 +1282,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_rows{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_rows{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -1399,14 +1401,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_hit_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_hit_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by shared_blks_hit (in bytes) (pg_stat_statements)", + "title": "Top $top_n statements by shared_blks_hit per second (in bytes) (pg_stat_statements)", "type": "timeseries" }, { @@ -1520,7 +1522,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_hit_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_hit_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -1639,14 +1641,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_read_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, 
irate(pgwatch_pg_stat_statements_shared_bytes_read_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by shared_blks_read (in bytes) (pg_stat_statements)", + "title": "Top $top_n statements by shared_blks_read per second (in bytes) (pg_stat_statements)", "type": "timeseries" }, { @@ -1760,7 +1762,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_read_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_read_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -1879,14 +1881,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_written_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_written_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by shared_blks_written (in bytes) (pg_stat_statements)", + "title": "Top $top_n statements by shared_blks_written per second (in bytes) (pg_stat_statements)", "type": "timeseries" }, { @@ -2000,7 +2002,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_written_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_written_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -2119,14 +2121,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_dirtied_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_dirtied_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by shared_blks_written (in bytes) (pg_stat_statements)", + "title": "Top $top_n statements by shared_blks_dirtied per second (in bytes) (pg_stat_statements)", "type": "timeseries" }, { @@ -2240,7 +2242,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_dirtied_total{cluster='$cluster_name', node_name='$node_name', 
datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_shared_bytes_dirtied_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -2359,14 +2361,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_wal_bytes{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_wal_bytes{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by WAL bytes (pg_stat_statements)", + "title": "Top $top_n statements by WAL bytes per second (pg_stat_statements)", "type": "timeseries" }, { @@ -2478,7 +2480,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_wal_bytes{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_wal_bytes{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -2597,14 +2599,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_wal_fpi{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_wal_fpi{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by WAL fpi (in bytes) (pg_stat_statements)", + "title": "Top $top_n statements by WAL fpi per second (in bytes) (pg_stat_statements)", "type": "timeseries" }, { @@ -2715,7 +2717,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_wal_fpi{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_wal_fpi{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -2834,14 +2836,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_temp_bytes_read{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, 
irate(pgwatch_pg_stat_statements_temp_bytes_read{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by temp bytes read (pg_stat_statements)", + "title": "Top $top_n statements by temp bytes read per second (pg_stat_statements)", "type": "timeseries" }, { @@ -2953,7 +2955,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_temp_bytes_read{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_temp_bytes_read{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -3072,14 +3074,14 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_temp_bytes_written{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_temp_bytes_written{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, "refId": "A" } ], - "title": "Top $top_n statements by temp bytes written (pg_stat_statements)", + "title": "Top $top_n statements by temp bytes written per second (pg_stat_statements)", "type": "timeseries" }, { @@ -3191,7 +3193,7 @@ "targets": [ { "editorMode": "code", - "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_temp_bytes_written{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))", + "expr": "topk($top_n, irate(pgwatch_pg_stat_statements_temp_bytes_written{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval])/irate(pgwatch_pg_stat_statements_calls{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}[$__rate_interval]))", "interval": "20", "legendFormat": "{{queryid}}", "range": true, @@ -3263,13 +3265,13 @@ "text": "local", "value": "local" }, - "definition": "label_values(pgwatch_settings_configured,cluster)", + "definition": "label_values(pgwatch_db_size_size_b,cluster)", "label": "Cluster name", "name": "cluster_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured,cluster)", + "query": "label_values(pgwatch_db_size_size_b,cluster)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -3281,13 +3283,13 @@ "text": "node-01", "value": "node-01" }, - "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "label": "Node name", "name": "node_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "refId": 
"PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -3295,17 +3297,20 @@ "type": "query" }, { + "allowCustomValue": false, "current": { - "text": "target_database", - "value": "target_database" + "text": "All", + "value": ["$__all"] }, - "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "includeAll": true, "label": "DB name", + "multi": true, "name": "db_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json b/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json index 4dc2fab..1f6836b 100644 --- a/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json +++ b/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json @@ -120,9 +120,9 @@ "axisPlacement": "auto", "barAlignment": 0, "axisSoftMin": 0, - "barWidthFactor": 0.5, + "barWidthFactor": 1, "drawStyle": "bars", - "fillOpacity": 40, + "fillOpacity": 100, "gradientMode": "none", "hideFrom": { "legend": false, @@ -168,7 +168,7 @@ { "matcher": { "id": "byRegexp", - "options": "Timeout" + "options": "Timeout.*" }, "properties": [ { @@ -198,7 +198,7 @@ { "matcher": { "id": "byRegexp", - "options": "Lock" + "options": "Lock.*" }, "properties": [ { @@ -213,7 +213,7 @@ { "matcher": { "id": "byRegexp", - "options": "LWLock" + "options": "LWLock.*" }, "properties": [ { @@ -228,7 +228,7 @@ { "matcher": { "id": "byRegexp", - "options": "IO" + "options": "IO.*" }, "properties": [ { @@ -243,7 +243,7 @@ { "matcher": { "id": "byRegexp", - "options": "Client" + "options": "Client.*" }, "properties": [ { @@ -254,37 +254,6 @@ } } ] - }, - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "CPU*", - "Lock", - "LWLock", - "IO", - "Timeout", - "BufferPin", - "Extension", - "IPC" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] } ] }, @@ -316,9 +285,9 @@ "targets": [ { "editorMode": "code", - "expr": "sum by (wait_event_type) (pgwatch_wait_events_total{query_id=\"$query_id\"})", + "expr": "sum by (wait_event_type, wait_event) (pgwatch_wait_events_total{query_id=\"$query_id\"})", "hide": false, - "legendFormat": "__auto", + "legendFormat": "{{wait_event_type}}:{{wait_event}}", "range": true, "refId": "A" }, @@ -2181,13 +2150,13 @@ "text": "local", "value": "local" }, - "definition": "label_values(pgwatch_settings_configured,cluster)", + "definition": "label_values(pgwatch_db_size_size_b,cluster)", "label": "Cluster name", "name": "cluster_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured,cluster)", + "query": "label_values(pgwatch_db_size_size_b,cluster)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2199,13 +2168,13 @@ "text": "node-01", "value": 
"node-01" }, - "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "label": "Node name", "name": "node_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2217,13 +2186,13 @@ "text": "postgres", "value": "postgres" }, - "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", "name": "db_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json b/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json index 39caa03..269549c 100644 --- a/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json +++ b/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json @@ -278,7 +278,7 @@ "targets": [ { "editorMode": "code", - "expr": "sort_by_label(sum by (wait_event_type) (pgwatch_wait_events_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}), 'wait_event_type')", + "expr": "sort_by_label(sum by (wait_event_type) (pgwatch_wait_events_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\"}), 'wait_event_type')", "hide": false, "legendFormat": "__auto", "range": true, @@ -584,7 +584,7 @@ "targets": [ { "editorMode": "code", - "expr": "sort_by_label(sum by (wait_event_type, wait_event) (pgwatch_wait_events_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\", wait_event_type=~\"$wait_event_type\"}), 'wait_event_type')", + "expr": "sort_by_label(sum by (wait_event_type, wait_event) (pgwatch_wait_events_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\", wait_event_type=~\"$wait_event_type\"}), 'wait_event_type')", "hide": false, "legendFormat": "{{wait_event_type}} - {{wait_event}}", "range": true, @@ -882,7 +882,7 @@ "targets": [ { "editorMode": "code", - "expr": "sort_by_label(sum by (wait_event_type, wait_event, query_id) (pgwatch_wait_events_total{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\", wait_event_type=~\"$wait_event_type\", wait_event=~\"$wait_event\"}), 'wait_event_type', 'query_id')", + "expr": "sort_by_label(sum by (wait_event_type, wait_event, query_id) (pgwatch_wait_events_total{cluster='$cluster_name', node_name='$node_name', datname=~\"$db_name\", wait_event_type=~\"$wait_event_type\", wait_event=~\"$wait_event\"}), 'wait_event_type', 'query_id')", "hide": false, "legendFormat": "{{wait_event_type}} - {{wait_event}} - {{query_id}}", "range": true, @@ -975,13 +975,13 @@ "text": "default", "value": "default" }, - "definition": 
"label_values(pgwatch_settings_configured,cluster)", + "definition": "label_values(pgwatch_db_size_size_b,cluster)", "label": "Cluster name", "name": "cluster_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured,cluster)", + "query": "label_values(pgwatch_db_size_size_b,cluster)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -993,13 +993,13 @@ "text": "postgres_ai", "value": "postgres_ai" }, - "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "label": "Node name", "name": "node_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1007,17 +1007,22 @@ "type": "query" }, { + "allowCustomValue": false, "current": { - "text": "workloaddb", - "value": "workloaddb" + "text": "All", + "value": [ + "$__all" + ] }, - "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "includeAll": true, "label": "DB name", + "multi": true, "name": "db_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1034,5 +1039,5 @@ "timezone": "browser", "title": "04. 
Wait event analysis (Active Session History)", "uid": "a222b233-acef-4bac-a451-1591023e4d4f", - "version": 10 -} \ No newline at end of file + "version": 11 +} diff --git a/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json b/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json index 70e1226..e5005a0 100644 --- a/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json +++ b/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json @@ -165,13 +165,13 @@ "text": "default", "value": "default" }, - "definition": "label_values(pgwatch_settings_configured,cluster)", + "definition": "label_values(pgwatch_db_size_size_b,cluster)", "label": "Cluster name", "name": "cluster_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured,cluster)", + "query": "label_values(pgwatch_db_size_size_b,cluster)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -183,13 +183,13 @@ "text": "prod-db", "value": "prod-db" }, - "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "label": "Node name", "name": "node_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -201,13 +201,13 @@ "text": "postgres", "value": "postgres" }, - "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", "name": "db_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/config/grafana/dashboards/Dashboard_8_Table_Stats.json b/config/grafana/dashboards/Dashboard_8_Table_Stats.json index 6c2ef7e..5663097 100644 --- a/config/grafana/dashboards/Dashboard_8_Table_Stats.json +++ b/config/grafana/dashboards/Dashboard_8_Table_Stats.json @@ -414,7 +414,7 @@ "root_selector": "", "source": "url", "type": "csv", - "url": "http://flask-pgss-api:5000/table_info/csv", + "url": "http://flask-pgss-api:8000/table_info/csv", "url_options": { "data": "", "method": "GET", @@ -3250,13 +3250,13 @@ "text": "default", "value": "default" }, - "definition": "label_values(pgwatch_settings_configured,cluster)", + "definition": "label_values(pgwatch_db_size_size_b,cluster)", "label": "Cluster name", "name": "cluster_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured,cluster)", + "query": "label_values(pgwatch_db_size_size_b,cluster)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -3268,13 +3268,13 @@ "text": "postgres_ai", "value": "postgres_ai" }, - "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "definition": 
"label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "label": "Node name", "name": "node_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -3286,13 +3286,13 @@ "text": "workloaddb", "value": "workloaddb" }, - "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", "name": "db_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/config/grafana/dashboards/Dashboard_9_Single_table_analysis.json b/config/grafana/dashboards/Dashboard_9_Single_table_analysis.json index a7364ec..1e1b8e3 100644 --- a/config/grafana/dashboards/Dashboard_9_Single_table_analysis.json +++ b/config/grafana/dashboards/Dashboard_9_Single_table_analysis.json @@ -1951,13 +1951,13 @@ "text": "default", "value": "default" }, - "definition": "label_values(pgwatch_settings_configured,cluster)", + "definition": "label_values(pgwatch_db_size_size_b,cluster)", "label": "Cluster name", "name": "cluster_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured,cluster)", + "query": "label_values(pgwatch_db_size_size_b,cluster)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1969,13 +1969,13 @@ "text": "postgres_ai", "value": "postgres_ai" }, - "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "label": "Node name", "name": "node_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1987,13 +1987,13 @@ "text": "workloaddb", "value": "workloaddb" }, - "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", "name": "db_name", "options": [], "query": { "qryType": 1, - "query": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/config/grafana/provisioning/grafana.ini 
b/config/grafana/provisioning/grafana.ini index 1438d05..ecbb7cc 100644 --- a/config/grafana/provisioning/grafana.ini +++ b/config/grafana/provisioning/grafana.ini @@ -1,2 +1,12 @@ [users] home_page = /d/f90500a0-a12e-4081-a2f0-07ed96f27915/1-postgres-node-performance-overview-high-level/ + +[auth] +# When OAuth is enabled, optionally disable the basic login form +disable_login_form = false + +[auth.generic_oauth] +# OAuth is disabled by default; enable via GF_AUTH_GENERIC_OAUTH_ENABLED env var +enabled = false +name = PostgresAI +allow_sign_up = true diff --git a/config/pgwatch-postgres/metrics.yml b/config/pgwatch-postgres/metrics.yml index a9569ca..cecda73 100644 --- a/config/pgwatch-postgres/metrics.yml +++ b/config/pgwatch-postgres/metrics.yml @@ -8,9 +8,10 @@ metrics: queryid, query from pg_stat_statements - where queryid is not null + where + queryid is not null + and dbid = (select oid from pg_database where datname = current_database()) order by total_exec_time desc - limit 1000; gauges: - '*' @@ -148,7 +149,8 @@ metrics: index_id, schema_name, table_name, - index_name + index_name, + index_size_bytes from ( select redundant_index_id as index_id, @@ -195,5 +197,5 @@ presets: full: description: "Full metrics for PostgreSQL storage" metrics: - pgss_queryid_queries: 300 + pgss_queryid_queries: 30 index_definitions: 3600 \ No newline at end of file diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 9cc0ebb..13d5116 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -127,7 +127,6 @@ metrics: blk_read_time, blk_write_time, extract(epoch from (now() - pg_postmaster_start_time()))::int8 as postmaster_uptime_s, - extract(epoch from (now() - pg_backup_start_time()))::int8 as backup_duration_s, case when pg_is_in_recovery() then 1 else 0 end as in_recovery_int, system_identifier::text as tag_sys_id, (select count(*) from pg_index i @@ -282,7 +281,7 @@ metrics: It returns the lock mode and the count of locks for each mode. This metric helps administrators monitor lock contention and performance. sqls: 13: |- - WITH q_locks as ( + with q_locks as ( /* pgwatch_generated */ select * from @@ -389,7 +388,7 @@ metrics: sqls: 11: |- with recursive - q_root_part as ( + q_root_part as ( /* pgwatch_generated */ select c.oid, c.relkind, n.nspname root_schema, @@ -524,8 +523,8 @@ metrics: ) x order by table_size_b desc nulls last limit 300 16: |- - with recursive /* pgwatch_generated */ - q_root_part as ( + with recursive + q_root_part as ( /* pgwatch_generated */ select c.oid, c.relkind, n.nspname root_schema, @@ -712,24 +711,24 @@ metrics: providing insights into potential bottlenecks and resource contention issues. 
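Note (not part of the patch): the wait-events metric described above, whose queries follow, is essentially a sample of pg_stat_activity grouped by wait event, with NULL treated as "on CPU". A minimal interactive sketch of the same idea:

    -- Active sessions grouped by wait event type/event; NULL means the backend
    -- is on CPU, which the metric labels 'CPU*'.
    select coalesce(wait_event_type, 'CPU*') as wait_event_type,
           coalesce(wait_event, 'CPU*')      as wait_event,
           count(*)                          as sessions
    from pg_stat_activity
    where state = 'active'
    group by 1, 2
    order by sessions desc;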
sqls: 11: |- - select datname as tag_datname, coalesce (wait_event, 'CPU*') as tag_wait_event, coalesce(wait_event_type, 'CPU*') as tag_wait_event_type, count(*) as total + select datname as tag_datname, coalesce (wait_event, 'CPU*') as tag_wait_event, coalesce(wait_event_type, 'CPU*') as tag_wait_event_type, count(*) as total /* pgwatch_generated */ from pg_stat_activity where state = 'active' group by tag_datname, tag_wait_event_type, tag_wait_event union select 'server_process' as tag_datname, coalesce (wait_event, 'CPU*') as tag_wait_event, coalesce(wait_event_type, 'CPU*') as tag_wait_event_type, count(*) as total from pg_stat_activity - where state = 'active' and datname IS null + where state = 'active' and datname is null group by tag_datname, tag_wait_event_type, tag_wait_event 14: |- - select datname as tag_datname, query_id::text as tag_query_id, coalesce (wait_event, 'CPU*') as tag_wait_event, coalesce(wait_event_type, 'CPU*') as tag_wait_event_type, count(*) as total + select datname as tag_datname, query_id::text as tag_query_id, coalesce (wait_event, 'CPU*') as tag_wait_event, coalesce(wait_event_type, 'CPU*') as tag_wait_event_type, count(*) as total /* pgwatch_generated */ from pg_stat_activity where state = 'active' group by tag_datname, tag_query_id, tag_wait_event_type, tag_wait_event union select 'server_process' as tag_datname, null as tag_query_id, coalesce (wait_event, 'CPU*') as tag_wait_event, coalesce(wait_event_type, 'CPU*') as tag_wait_event_type, count(*) as total from pg_stat_activity - where state = 'active' and datname IS null + where state = 'active' and datname is null group by tag_datname, tag_query_id, tag_wait_event_type, tag_wait_event gauges: - total @@ -742,11 +741,11 @@ metrics: pg_database.datname as tag_datname, tmp2.tag_application_name, tmp.tag_state, - COALESCE(count,0) as count, - COALESCE(max_tx_duration,0) as max_tx_duration + coalesce(count,0) as count, + coalesce(max_tx_duration,0) as max_tx_duration from ( - VALUES ('active'), + values ('active'), ('idle'), ('idle in transaction'), ('idle in transaction (aborted)'), @@ -760,7 +759,7 @@ metrics: application_name as tag_application_name, state as tag_state, count(*) as count, - MAX(extract(epoch from now() - xact_start))::float as max_tx_duration + max(extract(epoch from now() - xact_start))::float as max_tx_duration from pg_stat_activity group by datname, tag_application_name, tag_state ) as tmp2 @@ -774,8 +773,8 @@ metrics: pg_archiver: sqls: 11: | - WITH - current_wal_file as ( + with + current_wal_file as ( /* pgwatch_generated */ select case when not pg_is_in_recovery() then pg_walfile_name(pg_current_wal_insert_lsn()) else null end pg_walfile_name ), current_wal as ( @@ -802,7 +801,7 @@ metrics: pg_blocked: sqls: 11: |- - select + select /* pgwatch_generated */ (extract(epoch from now()) * 1e9)::int8 as epoch_ns, current_database() as tag_datname, count(*) as queries, @@ -811,16 +810,83 @@ metrics: else relation::regclass::text end as tag_table from pg_catalog.pg_locks blocked - where not blocked.granted + where + not blocked.granted + and database = (select oid from pg_database where datname = current_database()) group by locktype, relation limit 5000 gauges: - queries statement_timeout_seconds: 15 + lock_waits: + description: > + Retrieves detailed information about lock waits, including blocked and blocking processes with their queries, users, and application names. 
+ It returns blocked and blocker process IDs, lock modes and types, affected tables, queries, and wait/transaction durations. + This metric helps administrators identify and diagnose lock contention issues in detail. + sqls: + 14: |- + with sa_snapshot as ( /* pgwatch_generated */ + select * + from pg_stat_activity + where + datname = current_database() + and pid <> pg_backend_pid() + and state in ('active', 'idle in transaction', 'idle in transaction (aborted)') + ), + pid_tables as ( + select distinct on (pid) pid, relation::regclass::text as table_name + from pg_catalog.pg_locks + where relation is not null + and locktype in ('tuple', 'relation') + and relation::regclass::text not like '%_pkey' + and relation::regclass::text not like '%_idx' + order by pid, locktype + ) + select + blocked.pid as blocked_pid, + current_database() as tag_datname, + blocked_stm.usename::text as tag_blocked_user, + blocked_stm.application_name::text as tag_blocked_appname, + blocked.mode as blocked_mode, + blocked.locktype as blocked_locktype, + coalesce(blocked.relation::regclass::text, blocked_tbl.table_name, '') as tag_blocked_table, + blocked_stm.query_id::text as tag_blocked_query_id, + (extract(epoch from (clock_timestamp() - blocked_stm.state_change)) * 1000)::bigint as blocked_ms, + blocker.pid as blocker_pid, + blocker_stm.usename::text as tag_blocker_user, + blocker_stm.application_name::text as tag_blocker_appname, + blocker.mode as blocker_mode, + blocker.locktype as blocker_locktype, + coalesce(blocker.relation::regclass::text, blocker_tbl.table_name, '') as tag_blocker_table, + blocker_stm.query_id::text as tag_blocker_query_id, + (extract(epoch from (clock_timestamp() - blocker_stm.xact_start)) * 1000)::bigint as blocker_tx_ms + from pg_catalog.pg_locks as blocked + join sa_snapshot as blocked_stm on blocked_stm.pid = blocked.pid + join pg_catalog.pg_locks as blocker on + blocked.pid <> blocker.pid + and blocker.granted + and ( + (blocked.database = blocker.database) + or (blocked.database is null and blocker.database is null) + ) + and ( + blocked.relation = blocker.relation + or blocked.transactionid = blocker.transactionid + ) + join sa_snapshot as blocker_stm on blocker_stm.pid = blocker.pid + left join pid_tables as blocked_tbl on blocked_tbl.pid = blocked.pid + left join pid_tables as blocker_tbl on blocker_tbl.pid = blocker.pid + where not blocked.granted + order by blocked_ms desc + limit 10000 + gauges: + - blocked_ms + - blocker_tx_ms + statement_timeout_seconds: 15 pg_database_wraparound: sqls: 11: | - select + select /* pgwatch_generated */ (extract(epoch from now()) * 1e9)::int8 as epoch_ns, datname as tag_datname, age(d.datfrozenxid) as age_datfrozenxid, @@ -836,19 +902,23 @@ metrics: pg_long_running_transactions: sqls: 11: | - select (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + select (extract(epoch from now()) * 1e9)::int8 as epoch_ns, /* pgwatch_generated */ current_database() as tag_datname, - COUNT(*) as transactions, - COALESCE(MAX(extract(epoch from (clock_timestamp() - xact_start)))::int8, 0) as age_in_seconds + count(*) as transactions, + coalesce(max(extract(epoch from (clock_timestamp() - xact_start)))::int8, 0) as age_in_seconds from pg_catalog.pg_stat_activity - where state is distinct from 'idle' and (now() - xact_start) > '1 minutes'::interval and query not like 'autovacuum:%' + where + state is distinct from 'idle' + and datname = current_database() + and (clock_timestamp() - xact_start) > '1 minutes'::interval + and query not like 'autovacuum:%' 
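As a lighter-weight companion to the lock_waits metric introduced above (not part of the patch), blocked/blocker pairs can also be spotted interactively with the built-in pg_blocking_pids() function (available since Postgres 9.6); a minimal sketch:

    -- Each blocked backend with the PIDs currently blocking it.
    select pid,
           pg_blocking_pids(pid)  as blocked_by,
           state,
           wait_event_type,
           wait_event,
           now() - xact_start     as tx_age,
           left(query, 80)        as query
    from pg_stat_activity
    where cardinality(pg_blocking_pids(pid)) > 0
    order by tx_age desc;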
gauges: - '*' statement_timeout_seconds: 15 pg_stat_replication: sqls: 11: | - select (extract(epoch from now()) * 1e9)::int8 as epoch_ns, + select (extract(epoch from now()) * 1e9)::int8 as epoch_ns, /* pgwatch_generated */ current_database() as tag_datname, application_name as tag_application_name, (pg_current_wal_lsn() - '0/0') % (2^52)::bigint as current_wal_lsn, @@ -869,7 +939,7 @@ metrics: pg_stat_statements: sqls: 11: | - WITH aggregated_statements as ( + with aggregated_statements as ( /* pgwatch_generated */ select pg_database.datname, pg_stat_statements.queryid, @@ -890,8 +960,8 @@ metrics: sum(current_setting('block_size')::int * pg_stat_statements.temp_blks_read) as temp_bytes_read, sum(current_setting('block_size')::int * pg_stat_statements.temp_blks_written) as temp_bytes_written from pg_stat_statements - join pg_database - on pg_database.oid = pg_stat_statements.dbid + join pg_database on pg_database.oid = pg_stat_statements.dbid + where pg_database.datname = current_database() group by pg_database.datname, pg_stat_statements.queryid ) select @@ -915,7 +985,7 @@ metrics: temp_bytes_written::int8 as temp_bytes_written from aggregated_statements 17: | - with aggregated_statements as ( + with aggregated_statements as ( /* pgwatch_generated */ select pg_database.datname, pg_stat_statements.queryid, @@ -936,8 +1006,8 @@ metrics: sum(current_setting('block_size')::int * pg_stat_statements.temp_blks_read) as temp_bytes_read, sum(current_setting('block_size')::int * pg_stat_statements.temp_blks_written) as temp_bytes_written from pg_stat_statements - join pg_database - on pg_database.oid = pg_stat_statements.dbid + join pg_database on pg_database.oid = pg_stat_statements.dbid + where pg_database.datname = current_database() group by pg_database.datname, pg_stat_statements.queryid ) select @@ -985,7 +1055,7 @@ metrics: as tags for easy filtering and querying. 
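The pg_stat_statements queries above now join pg_database on dbid and filter to current_database(); the view is cluster-wide, so without the filter every monitored database would re-export every other database's statements. A minimal sketch of the same filter (not part of the patch; column names as of pg_stat_statements 1.8 / Postgres 13+):

    -- Only statements executed in the database this session is connected to.
    select d.datname,
           s.queryid,
           s.calls,
           s.total_exec_time
    from pg_stat_statements s
    join pg_database d on d.oid = s.dbid
    where d.datname = current_database()
    order by s.total_exec_time desc
    limit 10;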
sqls: 11: | - select + select /* pgwatch_generated */ current_database() as tag_datname, n.nspname as tag_schemaname, c.relname as tag_relname, @@ -1009,7 +1079,7 @@ metrics: pg_stat_all_indexes: sqls: 11: | - select schemaname as tag_schemaname, + select schemaname as tag_schemaname, /* pgwatch_generated */ relname as tag_relname, indexrelname as tag_indexrelname, idx_scan, @@ -1026,7 +1096,7 @@ metrics: pg_stat_all_tables: sqls: 11: | - select + select /* pgwatch_generated */ current_database() as tag_datname, schemaname as tag_schemaname, relname as tag_relname, @@ -1040,8 +1110,8 @@ metrics: n_tup_hot_upd, n_live_tup, n_dead_tup, - GREATEST(last_autovacuum, last_vacuum, '1970-01-01Z') as last_vacuum, - GREATEST(last_autoanalyze, last_analyze, '1970-01-01Z') as last_analyze, + greatest(last_autovacuum, last_vacuum, '1970-01-01Z') as last_vacuum, + greatest(last_autoanalyze, last_analyze, '1970-01-01Z') as last_analyze, (vacuum_count + autovacuum_count) as vacuum_count, (analyze_count + autoanalyze_count) as analyze_count from @@ -1067,7 +1137,7 @@ metrics: pg_stat_wal_receiver: sqls: 11: | - select current_database() as tag_datname, + select current_database() as tag_datname, /* pgwatch_generated */ case status when 'stopped' then 0 when 'starting' then 1 when 'streaming' then 2 when 'waiting' then 3 when 'restarting' then 4 when 'stopping' then 5 else -1 end as tag_status, (receive_start_lsn- '0/0') % (2^52)::bigint as receive_start_lsn, receive_start_tli, @@ -1127,16 +1197,19 @@ metrics: pg_stuck_idle_in_transaction: sqls: 11: | - select current_database() as tag_datname, COUNT(*) as queries + select current_database() as tag_datname, count(*) as queries /* pgwatch_generated */ from pg_catalog.pg_stat_activity - where state = 'idle in transaction' and (now() - query_start) > '10 minutes'::interval + where + state = 'idle in transaction' + and datname = current_database() + and (clock_timestamp() - state_change) > '1 minutes'::interval gauges: - queries statement_timeout_seconds: 15 pg_total_relation_size: sqls: 11: | - select current_database() as tag_datname, + select current_database() as tag_datname, /* pgwatch_generated */ relnamespace::regnamespace as tag_schemaname, relname as tag_relname, pg_total_relation_size(oid) bytes @@ -1150,7 +1223,7 @@ metrics: pg_txid: sqls: 11: | - select + select /* pgwatch_generated */ current_database() as tag_datname, case when pg_is_in_recovery() then 'NaN'::float else txid_current() % (2^52)::bigint end as current, case when pg_is_in_recovery() then 'NaN'::float else txid_snapshot_xmin(txid_current_snapshot()) % (2^52)::bigint end as xmin, @@ -1163,7 +1236,7 @@ metrics: pg_xlog_position: sqls: 11: | - select current_database() as tag_datname, + select current_database() as tag_datname, /* pgwatch_generated */ case when pg_is_in_recovery() then (pg_last_wal_replay_lsn() - '0/0') % (2^52)::bigint @@ -1179,7 +1252,7 @@ metrics: This metric helps administrators identify indexes that may need maintenance like VACUUM FULL or index reorganization. 
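The pg_stuck_idle_in_transaction query above now filters on the current database and measures idle time from state_change using clock_timestamp(). A roughly equivalent interactive check (not part of the patch):

    -- Sessions idle in a transaction for more than a minute in this database.
    select pid,
           usename,
           application_name,
           clock_timestamp() - state_change as idle_for
    from pg_stat_activity
    where state = 'idle in transaction'
      and datname = current_database()
      and clock_timestamp() - state_change > interval '1 minute'
    order by idle_for desc;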
sqls: 11: | - select current_database() as tag_datname, nspname as tag_schemaname, tblname as tag_tblname, idxname as tag_idxname, (bs*(relpages)/(1024*1024))::float as real_size_mib, + select current_database() as tag_datname, nspname as tag_schemaname, tblname as tag_tblname, idxname as tag_idxname, (bs*(relpages)/(1024*1024))::float as real_size_mib, /* pgwatch_generated */ (bs*(relpages-est_pages))::float as extra_size, 100 * (relpages-est_pages)::float / relpages as extra_pct, fillfactor, @@ -1217,7 +1290,7 @@ metrics: select n.nspname, i.tblname, i.idxname, i.reltuples, i.relpages, i.idxoid, i.fillfactor, current_setting('block_size')::numeric as bs, case -- MAXALIGN: 4 on 32bits, 8 on 64bits (and mingw32 ?) - when version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' then 8 + when version() ~ 'mingw32' or version() ~ '64-bit|x86_64|ppc64|ia64|amd64' then 8 else 4 end as maxalign, /* per page header, fixed size: 20 for 7.X, 24 for others */ @@ -1235,7 +1308,7 @@ metrics: from ( select ct.relname as tblname, ct.relnamespace, ic.idxname, ic.attpos, ic.indkey, ic.indkey[ic.attpos], ic.reltuples, ic.relpages, ic.tbloid, ic.idxoid, ic.fillfactor, coalesce(a1.attnum, a2.attnum) as attnum, coalesce(a1.attname, a2.attname) as attname, coalesce(a1.atttypid, a2.atttypid) as atttypid, - case when a1.attnum IS null + case when a1.attnum is null then ic.idxname else ct.relname end as attrelname @@ -1275,8 +1348,8 @@ metrics: ) as rows_data_stats ) as rows_hdr_pdg_stats ) as relation_stats - order by real_size_mib desc - limit 5000 + order by is_na = 0 desc, bloat_pct desc + limit 1000 gauges: - real_size_mib - extra_size @@ -1294,7 +1367,7 @@ metrics: This metric helps administrators identify tables that may need maintenance like VACUUM FULL or table reorganization. 
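The new ordering in the index-bloat estimate above (also applied to the table-bloat estimate below), order by is_na = 0 desc, bloat_pct desc, relies on boolean sorting: rows with a trustworthy estimate (is_na = 0) come first, then the most bloated ones. A self-contained illustration (not part of the patch):

    -- Boolean sort: true > false, so "is_na = 0 desc" puts reliable rows first,
    -- then bloat_pct desc orders within each group.
    select *
    from (values (0, 80.0), (1, 95.0), (0, 10.0)) as t(is_na, bloat_pct)
    order by is_na = 0 desc, bloat_pct desc;
    -- rows come back as (0, 80.0), (0, 10.0), (1, 95.0)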
sqls: 11: | - select current_database() as tag_datname, schemaname as tag_schemaname, tblname as tag_tblname, (bs*tblpages)/(1024*1024)::float as real_size_mib, + select current_database() as tag_datname, schemaname as tag_schemaname, tblname as tag_tblname, (bs*tblpages)/(1024*1024)::float as real_size_mib, /* pgwatch_generated */ (tblpages-est_tblpages)*bs as extra_size, case when tblpages > 0 and tblpages - est_tblpages > 0 then 100 * (tblpages - est_tblpages)/tblpages::float @@ -1331,13 +1404,13 @@ metrics: array_to_string(tbl.reloptions, ' ') from 'fillfactor=([0-9]+)')::smallint, 100) as fillfactor, current_setting('block_size')::numeric as bs, - case when version()~'mingw32' OR version()~'64-bit|x86_64|ppc64|ia64|amd64' then 8 else 4 end as ma, + case when version()~'mingw32' or version()~'64-bit|x86_64|ppc64|ia64|amd64' then 8 else 4 end as ma, 24 as page_hdr, - 23 + case when MAX(coalesce(s.null_frac,0)) > 0 then ( 7 + count(s.attname) ) / 8 else 0::int end + 23 + case when max(coalesce(s.null_frac,0)) > 0 then ( 7 + count(s.attname) ) / 8 else 0::int end + case when bool_or(att.attname = 'oid' and att.attnum < 0) then 4 else 0 end as tpl_hdr_size, sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 0) ) as tpl_data_size, (bool_or(att.atttypid = 'pg_catalog.name'::regtype) - OR sum(case when att.attnum > 0 then 1 else 0 end) <> count(s.attname))::int as is_na + or sum(case when att.attnum > 0 then 1 else 0 end) <> count(s.attname))::int as is_na from pg_attribute as att join pg_class as tbl on att.attrelid = tbl.oid join pg_namespace as ns on ns.oid = tbl.relnamespace @@ -1353,8 +1426,8 @@ metrics: ) as s3 -- where not is_na -- and tblpages*((pst).free_percent + (pst).dead_tuple_percent)::float4/100 >= 1 - order by real_size_mib desc - limit 5000 + order by is_na = 0 desc, bloat_pct desc + limit 1000 gauges: - real_size_mib - extra_size @@ -1372,7 +1445,7 @@ metrics: It helps administrators understand detailed storage breakdown for each table component. sqls: 11: |- - with table_sizes as ( + with table_sizes as ( /* pgwatch_generated */ select (extract(epoch from now()) * 1e9)::int8 as epoch_ns, current_database() as tag_datname, @@ -1402,7 +1475,7 @@ metrics: ) order by pg_total_relation_size(c.oid) desc ) - select /* pgwatch_generated */ + select epoch_ns, tag_datname, tag_schema, @@ -1420,6 +1493,7 @@ metrics: (toast_main_size_b + toast_fsm_size_b + toast_vm_size_b + toast_indexes_size_b) as total_toast_size_b from table_sizes where total_relation_size_b > 0 + limit 1000; gauges: - table_main_size_b - table_fsm_size_b @@ -1442,7 +1516,7 @@ metrics: This metric helps administrators identify and fix invalid indexes to improve database performance. sqls: 11: | - with fk_indexes as ( + with fk_indexes as ( /* pgwatch_generated */ select schemaname as tag_schema_name, (indexrelid::regclass)::text as tag_index_name, @@ -1496,7 +1570,8 @@ metrics: (extract(epoch from now()) * 1e9)::int8 as epoch_ns, current_database() as tag_datname, num_data.* - from num_data; + from num_data + limit 1000; gauges: - '*' statement_timeout_seconds: 15 @@ -1507,7 +1582,7 @@ metrics: and foreign key constraints. Uses the exact logic from tmp.sql with JSON aggregation and proper thresholds. 
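The invalid-index metric above is built around indexes left invalid (for example by a failed CREATE INDEX CONCURRENTLY); the full metric layers foreign-key coverage and sizing on top. A stripped-down interactive check of the core condition (not part of the patch):

    -- Indexes currently marked invalid in pg_index.
    select n.nspname               as schema_name,
           c.relname               as index_name,
           pg_relation_size(c.oid) as index_size_bytes
    from pg_index i
    join pg_class c on c.oid = i.indexrelid
    join pg_namespace n on n.oid = c.relnamespace
    where not i.indisvalid
    order by index_size_bytes desc;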
sqls: 11: | - with fk_indexes as ( + with fk_indexes as ( /* pgwatch_generated */ select n.nspname as schema_name, ci.relname as index_name, @@ -1539,10 +1614,10 @@ metrics: ), redundant_indexes as ( select i2.indexrelid as index_id, - tnsp.nspname AS schema_name, - trel.relname AS table_name, + tnsp.nspname as schema_name, + trel.relname as table_name, pg_relation_size(trel.oid) as table_size_bytes, - irel.relname AS index_name, + irel.relname as index_name, am1.amname as access_method, (i1.indexrelid::regclass)::text as reason, i1.indexrelid as reason_index_id, @@ -1553,7 +1628,7 @@ metrics: s.idx_scan as index_usage, quote_ident(tnsp.nspname) as formated_schema_name, coalesce(nullif(quote_ident(tnsp.nspname), 'public') || '.', '') || quote_ident(irel.relname) as formated_index_name, - quote_ident(trel.relname) AS formated_table_name, + quote_ident(trel.relname) as formated_table_name, coalesce(nullif(quote_ident(tnsp.nspname), 'public') || '.', '') || quote_ident(trel.relname) as formated_relation_name, i2.opclasses from ( @@ -1650,7 +1725,8 @@ metrics: supports_fk order by index_size_bytes desc ) - select * from redundant_indexes_grouped; + select * from redundant_indexes_grouped + limit 1000; gauges: - '*' statement_timeout_seconds: 15 @@ -1661,7 +1737,7 @@ metrics: This metric helps administrators identify and fix unused indexes to improve database performance. sqls: 11: | - with fk_indexes as ( + with fk_indexes as ( /* pgwatch_generated */ select n.nspname as schema_name, ci.relname as index_name, @@ -1704,7 +1780,7 @@ metrics: join pg_class ci on ci.oid = i.indexrelid and ci.relkind = 'i' join pg_class cr on cr.oid = i.indrelid and cr.relkind = 'r' join pg_namespace n on n.oid = ci.relnamespace - join pg_am a ON ci.relam = a.oid + join pg_am a on ci.relam = a.oid left join pg_stat_all_indexes as si on si.indexrelid = i.indexrelid where i.indisunique = false @@ -1741,9 +1817,9 @@ metrics: select 'Never Used Indexes' as tag_reason, index_id, - schema_name AS tag_schema_name, - table_name AS tag_table_name, - index_name AS tag_index_name, + schema_name as tag_schema_name, + table_name as tag_table_name, + index_name as tag_index_name, idx_scan, all_scans, index_scan_pct, @@ -1753,13 +1829,14 @@ metrics: table_size_bytes, relpages, idx_is_btree, - opclasses AS tag_opclasses, + opclasses as tag_opclasses, supports_fk from index_ratios where idx_scan = 0 and idx_is_btree - order by index_size_bytes desc; + order by index_size_bytes desc + limit 1000; gauges: - '*' statement_timeout_seconds: 15 @@ -1770,7 +1847,7 @@ metrics: This metric helps administrators identify and fix rarely used indexes to improve database performance. 
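The "Never Used Indexes" metric above reduces to non-unique indexes with zero recorded scans since the last stats reset (the full query additionally restricts to btree and checks foreign-key support). A minimal interactive equivalent (not part of the patch):

    -- Non-unique indexes that have never been scanned, largest first.
    select s.schemaname,
           s.relname                        as table_name,
           s.indexrelname                   as index_name,
           pg_relation_size(s.indexrelid)   as index_size_bytes
    from pg_stat_all_indexes s
    join pg_index i on i.indexrelid = s.indexrelid
    where s.idx_scan = 0
      and not i.indisunique
    order by index_size_bytes desc;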
sqls: 11: | - with fk_indexes as ( + with fk_indexes as ( /* pgwatch_generated */ select n.nspname as schema_name, ci.relname as index_name, @@ -1816,7 +1893,7 @@ metrics: join pg_class ci on ci.oid = i.indexrelid and ci.relkind = 'i' join pg_class cr on cr.oid = i.indrelid and cr.relkind = 'r' join pg_namespace n on n.oid = ci.relnamespace - join pg_am a ON ci.relam = a.oid + join pg_am a on ci.relam = a.oid left join pg_stat_all_indexes as si on si.indexrelid = i.indexrelid where i.indisunique = false @@ -1853,9 +1930,9 @@ metrics: select tag_reason, index_id, - schema_name AS tag_schema_name, - table_name AS tag_table_name, - index_name AS tag_index_name, + schema_name as tag_schema_name, + table_name as tag_table_name, + index_name as tag_index_name, idx_scan, all_scans, index_scan_pct, @@ -1865,7 +1942,7 @@ metrics: table_size_bytes, relpages, idx_is_btree, - opclasses AS tag_opclasses, + opclasses as tag_opclasses, supports_fk, grp from ( @@ -1903,7 +1980,8 @@ metrics: and not idx_is_btree and index_size_bytes > 100000000 ) t - order by grp, index_size_bytes desc; + order by grp, index_size_bytes desc + limit 1000; gauges: - '*' statement_timeout_seconds: 15 @@ -1916,7 +1994,7 @@ metrics: not per-index or per-table. sqls: 11: | - select + select /* pgwatch_generated */ datname as tag_database_name, extract(epoch from stats_reset)::int as stats_reset_epoch, extract(epoch from now() - stats_reset)::int as seconds_since_reset @@ -1935,7 +2013,7 @@ metrics: sqls: 11: | -- postgresql wal archiving lag monitor - with wal_info as ( + with wal_info as ( /* pgwatch_generated */ select last_archived_wal, last_archived_time, @@ -1950,7 +2028,7 @@ metrics: from pg_stat_archiver where last_archived_wal is not null ) - select + select pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')::bigint as current_lsn_numeric, pg_wal_lsn_diff((log_id_hex || '/' || lpad(to_hex((segment_dec + 1) * wal_segment_size_bytes), 8, '0'))::pg_lsn, '0/0')::bigint as archived_wal_finish_lsn_numeric, @@ -1992,12 +2070,12 @@ metrics: pg_vacuum_progress: sqls: 11: | - select - current_database() AS tag_datname, - N.nspname as tag_schema_name, + select /* pgwatch_generated */ + current_database() as tag_datname, + n.nspname as tag_schema_name, c.relname as tag_table_name, - N.nspname || '.' || c.relname as tag_relname, - s.relid AS tag_relid, + n.nspname || '.' 
|| c.relname as tag_relname, + s.relid as tag_relid, case when A.query ~ '^autovacuum.*(to prevent wraparound)' then 'aggressive_autovacuum' when A.query ~ '^autovacuum' then @@ -2012,22 +2090,22 @@ metrics: 1 when S.phase ~ 'scanning heap' then 2 - WHEN S.phase ~ 'vacuuming indexes' THEN + when s.phase ~ 'vacuuming indexes' then 3 - WHEN S.phase ~ 'vacuuming heap' THEN + when s.phase ~ 'vacuuming heap' then 4 - WHEN S.phase ~ 'cleaning up indexes' THEN + when s.phase ~ 'cleaning up indexes' then 5 - WHEN S.phase ~ 'truncating heap' THEN + when s.phase ~ 'truncating heap' then 6 - WHEN S.phase ~ 'final cleanup' THEN + when s.phase ~ 'final cleanup' then 7 end tag_phase, S.heap_blks_total::float, S.heap_blks_scanned::float, S.heap_blks_vacuumed::float, S.index_vacuum_count::float, - (S.max_dead_tuples / 1024 / 1024)::float as max_dead_tuples_mb, + (S.max_dead_tuples::float / 1024 / 1024) as max_dead_tuples_mb, S.num_dead_tuples::float as num_dead_item_ids from pg_stat_progress_vacuum as S @@ -2035,13 +2113,14 @@ metrics: join pg_stat_activity as A on (S.pid = A.pid) join pg_class C on (C.oid = S.relid) join pg_namespace N on (N.oid = C.relnamespace) + where D.datname = current_database() 17: | - select - current_database() AS tag_datname, - N.nspname as tag_schema_name, + select /* pgwatch_generated */ + current_database() as tag_datname, + n.nspname as tag_schema_name, c.relname as tag_table_name, - N.nspname || '.' || c.relname as tag_relname, - s.relid AS tag_relid, + n.nspname || '.' || c.relname as tag_relname, + s.relid as tag_relid, case when A.query ~ '^autovacuum.*(to prevent wraparound)' then 'aggressive_autovacuum' when A.query ~ '^autovacuum' then @@ -2056,15 +2135,15 @@ metrics: 1 when S.phase ~ 'scanning heap' then 2 - WHEN S.phase ~ 'vacuuming indexes' THEN + when s.phase ~ 'vacuuming indexes' then 3 - WHEN S.phase ~ 'vacuuming heap' THEN + when s.phase ~ 'vacuuming heap' then 4 - WHEN S.phase ~ 'cleaning up indexes' THEN + when s.phase ~ 'cleaning up indexes' then 5 - WHEN S.phase ~ 'truncating heap' THEN + when s.phase ~ 'truncating heap' then 6 - WHEN S.phase ~ 'final cleanup' THEN + when s.phase ~ 'final cleanup' then 7 end tag_phase, S.heap_blks_total::float, @@ -2079,13 +2158,14 @@ metrics: join pg_stat_activity as A on (S.pid = A.pid) join pg_class C on (C.oid = S.relid) join pg_namespace N on (N.oid = C.relnamespace) + where D.datname = current_database() gauges: - '*' statement_timeout_seconds: 15 pg_index_pilot: sqls: 11: | - select + select /* pgwatch_generated */ (extract(epoch from now()) * 1e9)::int8 as epoch_ns, datname as tag_datname, schemaname as tag_schemaname, @@ -2107,7 +2187,7 @@ metrics: pg_index_pilot_config: sqls: 12: | - select + select /* pgwatch_generated */ coalesce(datname, '*') as tag_datname, coalesce(schemaname, '*') as tag_schemaname, coalesce(relname, '*') as tag_relname, @@ -2202,7 +2282,7 @@ metrics: multixact_size: sqls: 11: | - with env as ( + with env as ( /* pgwatch_generated */ select exists ( select @@ -2210,12 +2290,43 @@ metrics: join pg_namespace n on n.oid = p.pronamespace where p.proname = 'pg_ls_multixactdir' and n.nspname = 'rds_tools' ) as has_rds_fn, + exists ( + select + from pg_proc p + join pg_namespace n on n.oid = p.pronamespace + where p.proname = 'aurora_stat_file' and n.nspname = 'pg_catalog' + ) as has_aurora_fn, exists (select from pg_proc where proname = 'pg_ls_dir') as has_pg_ls_dir_func, exists (select from pg_proc where proname = 'pg_stat_file') as has_pg_stat_file_func ), can_local as ( select 
(has_pg_ls_dir_func and has_pg_stat_file_func) as ok from env ), + -- Use query_to_xml to safely execute Aurora-specific multixact query. + -- Aurora uses aurora_stat_file() function instead of rds_tools.pg_ls_multixactdir(). + aurora_probe_xml as ( + select query_to_xml($q$ + with files as ( + select filename, allocated_bytes, used_bytes + from aurora_stat_file() + where filename like 'pg_multixact/%' + ), + members as ( + select sum(used_bytes)::bigint as sz from files where filename like 'pg_multixact/members%' + ), + offsets as ( + select sum(used_bytes)::bigint as sz from files where filename like 'pg_multixact/offsets%' + ), + has_rows as ( + select exists(select 1 from files) as any_rows + ) + select + case when (select any_rows from has_rows) then coalesce((select sz from members), 0) end as members_bytes, + case when (select any_rows from has_rows) then coalesce((select sz from offsets), 0) end as offsets_bytes, + case when (select any_rows from has_rows) then 0 else 1 end as status_code + $q$, true, true, '') as x + where (select has_aurora_fn from env) + ), -- Use query_to_xml to safely execute RDS-specific multixact directory listing query. -- The XML wrapper allows the query to fail gracefully if rds_tools.pg_ls_multixactdir() -- is unavailable or returns errors, preventing the entire metric from failing. @@ -2239,7 +2350,7 @@ metrics: case when (select any_rows from has_rows) then coalesce((select sz from offsets), 0) end as offsets_bytes, case when (select any_rows from has_rows) then 0 else 1 end as status_code $q$, true, true, '') as x - where (select has_rds_fn from env) + where (select has_rds_fn from env) and not (select has_aurora_fn from env) ), -- Use query_to_xml to safely execute standard Postgres multixact directory listing query. 
-- The XML wrapper allows the query to fail gracefully if pg_stat_file() or pg_ls_dir() @@ -2269,9 +2380,11 @@ metrics: case when (select has_any from flags) then coalesce((select sz from offsets), 0) end as offsets_bytes, case when (select has_any from flags) then 0 else 1 end as status_code $q$, true, true, '') as x - where not (select has_rds_fn from env) and (select ok from can_local) + where not (select has_rds_fn from env) and not (select has_aurora_fn from env) and (select ok from can_local) ), picked as ( + select * from aurora_probe_xml + union all select * from rds_probe_xml union all select * from local_probe_xml @@ -2324,6 +2437,7 @@ presets: pg_statio_all_tables: 30 pg_statio_all_indexes: 30 pg_total_relation_size: 30 + lock_waits: 30 pg_blocked: 30 pg_long_running_transactions: 30 pg_stuck_idle_in_transaction: 30 diff --git a/docker-compose.yml b/docker-compose.yml index 5b69dda..1f19b24 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -119,6 +119,19 @@ services: GF_SECURITY_ADMIN_USER: monitor GF_SECURITY_ADMIN_PASSWORD: ${GF_SECURITY_ADMIN_PASSWORD:-demo} GF_INSTALL_PLUGINS: yesoreyeram-infinity-datasource + # OAuth configuration (disabled by default, enabled via Ansible) + GF_AUTH_GENERIC_OAUTH_ENABLED: ${GRAFANA_OAUTH_ENABLED:-false} + GF_AUTH_GENERIC_OAUTH_NAME: ${GRAFANA_OAUTH_NAME:-PostgresAI} + GF_AUTH_GENERIC_OAUTH_ALLOW_SIGN_UP: ${GRAFANA_OAUTH_ALLOW_SIGN_UP:-true} + GF_AUTH_GENERIC_OAUTH_CLIENT_ID: ${GRAFANA_OAUTH_CLIENT_ID:-} + GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET: ${GRAFANA_OAUTH_CLIENT_SECRET:-} + GF_AUTH_GENERIC_OAUTH_SCOPES: ${GRAFANA_OAUTH_SCOPES:-openid email profile} + GF_AUTH_GENERIC_OAUTH_AUTH_URL: ${GRAFANA_OAUTH_AUTH_URL:-} + GF_AUTH_GENERIC_OAUTH_TOKEN_URL: ${GRAFANA_OAUTH_TOKEN_URL:-} + GF_AUTH_GENERIC_OAUTH_API_URL: ${GRAFANA_OAUTH_API_URL:-} + # Optional: disable login form when OAuth is primary auth + GF_AUTH_DISABLE_LOGIN_FORM: ${GRAFANA_DISABLE_LOGIN_FORM:-false} + GF_SERVER_ROOT_URL: ${GF_SERVER_ROOT_URL:-} ports: - "${GRAFANA_BIND_HOST:-}3000:3000" volumes: @@ -130,10 +143,8 @@ services: - sink-postgres - sink-prometheus restart: unless-stopped - flask-backend: - build: - context: ./flask-backend - dockerfile: Dockerfile + monitoring_flask_backend: + image: postgresai/monitoring_flask_backend:latest container_name: flask-pgss-api environment: - FLASK_ENV=production @@ -143,7 +154,7 @@ services: restart: unless-stopped # PostgreSQL Reports Generator - Runs reports after 1 hour postgres-reports: - image: python:3.11-slim + image: postgresai/reporter:1.0.2 container_name: postgres-reports working_dir: /app volumes: @@ -166,16 +177,13 @@ services: echo 'Starting PostgreSQL reports generation...' && while true; do echo 'Extracting cluster and node name from instances.yml...' && - CLUSTER=$$(python3 -c \"import yaml; data=yaml.safe_load(open('instances.yml')); print(data[0]['custom_tags']['cluster'])\") && - NODE_NAME=$$(python3 -c \"import yaml; data=yaml.safe_load(open('instances.yml')); print(data[0]['custom_tags']['node_name'])\") && - echo \"Using cluster: $$CLUSTER, node: $$NODE_NAME\" && echo 'Generating PostgreSQL reports...' 
&& if [ -f /app/.pgwatch-config ] && grep -q '^api_key=' /app/.pgwatch-config; then API_KEY=$$(grep '^api_key=' /app/.pgwatch-config | cut -d'=' -f2-) && - python postgres_reports.py --prometheus-url http://sink-prometheus:9090 --cluster \"$$CLUSTER\" --node-name \"$$NODE_NAME\" --output /app/all_reports_$$(date +%Y%m%d_%H%M%S).json --token $$API_KEY --project postgres-ai-monitoring + python postgres_reports.py --prometheus-url http://sink-prometheus:9090 --output /app/all_reports_$$(date +%Y%m%d_%H%M%S).json --token $$API_KEY --project postgres-ai-monitoring else echo 'No API key configured, generating reports without upload...' && - python postgres_reports.py --prometheus-url http://sink-prometheus:9090 --cluster \"$$CLUSTER\" --node-name \"$$NODE_NAME\" --output /app/all_reports_$$(date +%Y%m%d_%H%M%S).json --no-upload + python postgres_reports.py --prometheus-url http://sink-prometheus:9090 --output /app/all_reports_$$(date +%Y%m%d_%H%M%S).json --no-upload fi && echo 'Reports generated. Sleeping for 24 hours...' && sleep 86400 @@ -196,18 +204,18 @@ services: - /var/lib/docker/:/var/lib/docker:ro - /dev/disk/:/dev/disk:ro command: - - '--housekeeping_interval=30s' - - '--docker_only=true' - - '--disable_metrics=percpu,sched,tcp,udp,hugetlb,referenced_memory,cpu_topology,resctrl' - - '--store_container_labels=false' + - "--housekeeping_interval=30s" + - "--docker_only=true" + - "--disable_metrics=percpu,sched,tcp,udp,hugetlb,referenced_memory,cpu_topology,resctrl" + - "--store_container_labels=false" # Node Exporter - System metrics node-exporter: image: prom/node-exporter:v1.8.2 container_name: node-exporter command: - - '--path.rootfs=/host' - - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + - "--path.rootfs=/host" + - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)" volumes: - /:/host:ro,rslave restart: unless-stopped diff --git a/flask-backend/Dockerfile b/monitoring_flask_backend/Dockerfile similarity index 86% rename from flask-backend/Dockerfile rename to monitoring_flask_backend/Dockerfile index ceb6e61..9ba8d26 100644 --- a/flask-backend/Dockerfile +++ b/monitoring_flask_backend/Dockerfile @@ -16,11 +16,11 @@ RUN pip install --no-cache-dir -r requirements.txt COPY app.py . 
# Expose port -EXPOSE 5000 +EXPOSE 8000 # Set environment variables ENV FLASK_APP=app.py ENV FLASK_ENV=production # Run the application -CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "120", "app:app"] \ No newline at end of file +CMD ["gunicorn", "--bind", "0.0.0.0:8000", "--workers", "4", "--timeout", "120", "app:app"] \ No newline at end of file diff --git a/flask-backend/app.py b/monitoring_flask_backend/app.py similarity index 99% rename from flask-backend/app.py rename to monitoring_flask_backend/app.py index 5e193bd..ba45d9b 100644 --- a/flask-backend/app.py +++ b/monitoring_flask_backend/app.py @@ -4,6 +4,7 @@ import io from datetime import datetime, timezone, timedelta import logging +import os # Configure logging logging.basicConfig(level=logging.INFO) @@ -11,8 +12,8 @@ app = Flask(__name__) -# Prometheus connection -PROMETHEUS_URL = "http://sink-prometheus:9090" +# Prometheus connection - use environment variable with fallback +PROMETHEUS_URL = os.environ.get('PROMETHEUS_URL', 'http://localhost:8428') # Metric name mapping for cleaner CSV output METRIC_NAME_MAPPING = { diff --git a/flask-backend/requirements.txt b/monitoring_flask_backend/requirements.txt similarity index 100% rename from flask-backend/requirements.txt rename to monitoring_flask_backend/requirements.txt diff --git a/pgai/bin/pgai.js b/pgai/bin/pgai.js new file mode 100644 index 0000000..707387b --- /dev/null +++ b/pgai/bin/pgai.js @@ -0,0 +1,48 @@ +#!/usr/bin/env node +"use strict"; + +const { spawn } = require("node:child_process"); +const path = require("node:path"); +const fs = require("node:fs"); + +function die(msg) { + process.stderr.write(`${msg}\n`); + process.exit(1); +} + +let target; +try { + target = require.resolve("postgresai/dist/bin/postgres-ai.js"); +} catch (e) { + // Dev-friendly fallback when running from the monorepo checkout (postgresai lives under ../cli). + const fallback = path.resolve(__dirname, "..", "..", "cli", "dist", "bin", "postgres-ai.js"); + if (fs.existsSync(fallback)) { + target = fallback; + } else { + die( + [ + "pgai: failed to locate postgresai package.", + "", + "This wrapper expects postgresai to be installed as a dependency.", + ].join("\n") + ); + } +} + +const child = spawn(process.execPath, [target, ...process.argv.slice(2)], { + stdio: "inherit", +}); + +child.on("exit", (code, signal) => { + if (signal) { + process.kill(process.pid, signal); + return; + } + process.exit(code ?? 0); +}); + +child.on("error", (err) => { + die(`pgai: failed to run postgresai: ${err instanceof Error ? 
err.message : String(err)}`); +}); + + diff --git a/pgai/package.json b/pgai/package.json new file mode 100644 index 0000000..40f2330 --- /dev/null +++ b/pgai/package.json @@ -0,0 +1,27 @@ +{ + "name": "pgai", + "version": "0.0.0-dev.0", + "description": "Thin wrapper for postgresai CLI (provides `npx pgai ...`)", + "license": "Apache-2.0", + "private": false, + "repository": { + "type": "git", + "url": "git+https://gitlab.com/postgres-ai/postgres_ai.git" + }, + "homepage": "https://gitlab.com/postgres-ai/postgres_ai", + "bugs": { + "url": "https://gitlab.com/postgres-ai/postgres_ai/-/issues" + }, + "bin": { + "pgai": "./bin/pgai.js" + }, + "type": "commonjs", + "engines": { + "node": ">=18" + }, + "dependencies": { + "postgresai": "0.0.0-dev.0" + } +} + + diff --git a/postgres_ai b/postgres_ai index 53a8941..f0bbbab 100755 --- a/postgres_ai +++ b/postgres_ai @@ -1104,7 +1104,7 @@ start_services() { log_info "Starting Postgres AI monitoring services (production mode)..." log_info "Target demo database not included - add your own PostgreSQL instances to monitor" # Start all services except target-db, including host stats monitoring - $compose_cmd -f "$COMPOSE_FILE" up -d sources-generator sink-postgres sink-prometheus pgwatch-postgres pgwatch-prometheus grafana flask-backend postgres-reports cadvisor node-exporter postgres-exporter + $compose_cmd -f "$COMPOSE_FILE" up -d sources-generator sink-postgres sink-prometheus pgwatch-postgres pgwatch-prometheus grafana monitoring_flask_backend postgres-reports cadvisor node-exporter postgres-exporter fi log_success "Services started!" diff --git a/postgres_ai_helm/.helmignore b/postgres_ai_helm/.helmignore new file mode 100644 index 0000000..517ac5f --- /dev/null +++ b/postgres_ai_helm/.helmignore @@ -0,0 +1,20 @@ +# Patterns to ignore when building packages +.DS_Store +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +*.swp +*.bak +*.tmp +*.orig +*~ +.project +.idea/ +*.tmproj +.vscode/ +*.code-workspace +*.md \ No newline at end of file diff --git a/postgres_ai_helm/Chart.lock b/postgres_ai_helm/Chart.lock new file mode 100644 index 0000000..acc19b0 --- /dev/null +++ b/postgres_ai_helm/Chart.lock @@ -0,0 +1,6 @@ +dependencies: +- name: grafana + repository: https://grafana.github.io/helm-charts + version: 10.1.4 +digest: sha256:0071bd9cc7eb403a2a58b7b0cd92eedbecafda51588ebd6a18c45dbb12416f79 +generated: "2025-11-13T12:20:55.238834+02:00" diff --git a/postgres_ai_helm/Chart.yaml b/postgres_ai_helm/Chart.yaml new file mode 100644 index 0000000..c3a6be7 --- /dev/null +++ b/postgres_ai_helm/Chart.yaml @@ -0,0 +1,25 @@ +apiVersion: v2 +appVersion: 1.0.0 +dependencies: +- condition: grafana.enabled + name: grafana + repository: https://grafana.github.io/helm-charts + version: 10.1.4 +description: PostgresAI monitoring stack with PGWatch, VictoriaMetrics, and Grafana + for Kubernetes +home: https://postgres.ai +keywords: +- postgresql +- monitoring +- pgwatch +- victoriametrics +- grafana +- observability +maintainers: +- name: PostgresAI + url: https://postgres.ai +name: postgres-ai-monitoring +sources: +- https://github.com/PostgresAI/double-pgwatch-poc +type: application +version: 0.12 diff --git a/postgres_ai_helm/INSTALLATION_GUIDE.md b/postgres_ai_helm/INSTALLATION_GUIDE.md new file mode 100644 index 0000000..a84924c --- /dev/null +++ b/postgres_ai_helm/INSTALLATION_GUIDE.md @@ -0,0 +1,178 @@ +# Postgres AI monitoring - Helm chart installation guide + +## Installation + +### 1. Download Helm chart + +### 2. 
Create namespace + +```bash +kubectl create namespace postgres-ai-mon +``` + +### 3. Create custom-values.yaml + +```yaml +existingSecret: + name: postgres-ai-monitoring-secrets + +global: + clusterName: my-cluster + nodeName: my-node + customTags: + env: production + +monitoredDatabases: + - name: my-db + host: db-host.example.com + port: 5432 + database: postgres + user: postgres_ai_mon + passwordSecretKey: my-db-password + presetMetrics: full + isEnabled: true + group: production + +grafana: + enabled: true + admin: + existingSecret: postgres-ai-monitoring-secrets + userKey: grafana-admin-user + passwordKey: grafana-admin-password + service: + type: ClusterIP + +ingress: + enabled: true + className: nginx + hosts: + grafana: monitoring.example.com + +storage: + postgresSize: 100Gi + victoriaMetricsSize: 200Gi + storageClassName: standard +``` + +**Customize**: `clusterName`, `monitoredDatabases`, `ingress.hosts`, and `storageClassName`. + +### 4. Create secret + +```bash +kubectl create secret generic postgres-ai-monitoring-secrets \ + --namespace postgres-ai-mon \ + --from-literal=postgres-password='SINK_POSTGRES_PASSWORD' \ + --from-literal=grafana-admin-user='monitor' \ + --from-literal=grafana-admin-password='GRAFANA_PASSWORD' \ + --from-literal=pgai-api-key='POSTGRES_AI_API_KEY' \ + --from-literal=db-password-my-db-password='DB_PASSWORD' +``` + +**Notes:** + +- `SINK_POSTGRES_PASSWORD` should be generated by you and will be used to connect to the internal database for storing metrics +- `GRAFANA_PASSWORD` should be generated by you and will be used to access grafana +- `POSTGRES_AI_API_KEY` should be obtained from the PostgresAI platform and will be used to connect to the PostgresAI platform +- Add `--from-literal` for each database that you want to monitor + - Key must match `passwordSecretKey` in custom-values.yaml + - Key name must be `db-password-<passwordSecretKey>` and the value must be the password for the monitoring user in that database + +### 5. Install Helm chart + +```bash +helm install postgres-ai-monitoring ./postgres-ai-monitoring-0.12.tgz \ + --namespace postgres-ai-mon \ + --values custom-values.yaml +``` + +### 6. Verify installation + +```bash +kubectl get pods -n postgres-ai-mon +``` + +## Access grafana + +**Port Forward** (quick access): + +```bash +kubectl port-forward -n postgres-ai-mon svc/postgres-ai-monitoring-grafana 3000:80 +``` + +Open: `http://localhost:3000` + +**Ingress**: Access via the configured domain (e.g., `http://monitoring.example.com`) + +**Login**: Username and password from the secret (`grafana-admin-user` / `grafana-admin-password`) + +## Common tasks + +### Update configuration + +```bash +helm upgrade postgres-ai-monitoring ./postgres-ai-monitoring-0.12.tgz \ + --namespace postgres-ai-mon \ + --values custom-values.yaml +``` + +### Add database + +1. Add entry to `monitoredDatabases` in custom-values.yaml + +2. Add password to secret: + +```bash +kubectl create secret generic postgres-ai-monitoring-secrets \ + --namespace postgres-ai-mon \ + --from-literal=new-db-password='password' \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +3. Run `helm upgrade` + +### Check logs + +```bash +kubectl logs -n postgres-ai-mon <pod-name> +``` + +## Uninstall + +### 1. Uninstall Helm release + +```bash +helm uninstall postgres-ai-monitoring --namespace postgres-ai-mon +``` + +This removes all resources created by the Helm chart, but preserves PersistentVolumeClaims and secrets. + +### 2.
Delete PersistentVolumeClaims (optional) + +**Warning**: This will permanently delete all stored metrics and Grafana data. + +```bash +kubectl delete pvc -n postgres-ai-mon --all +``` + +Or delete specific PVCs: + +```bash +kubectl delete pvc -n postgres-ai-mon data-postgres-ai-monitoring-sink-postgres-0 +kubectl delete pvc -n postgres-ai-mon data-postgres-ai-monitoring-victoriametrics-0 +``` + +### 3. Delete secrets (optional) + +```bash +kubectl delete secret -n postgres-ai-mon postgres-ai-monitoring-secrets +``` + +### 4. Delete namespace (optional) + +**Warning**: This will delete all resources in the namespace, including any data stored in PersistentVolumes. + +```bash +kubectl delete namespace postgres-ai-mon +``` + +**Note**: Before deleting the namespace, ensure no other applications are using it. diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_10_Index health.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_10_Index health.json new file mode 120000 index 0000000..104db39 --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_10_Index health.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_10_Index health.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_11_Single_index_analysis.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_11_Single_index_analysis.json new file mode 120000 index 0000000..f579c23 --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_11_Single_index_analysis.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_11_Single_index_analysis.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_12_SLRU.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_12_SLRU.json new file mode 120000 index 0000000..5225984 --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_12_SLRU.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_12_SLRU.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_13_Lock_waits.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_13_Lock_waits.json new file mode 120000 index 0000000..f01f2bd --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_13_Lock_waits.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_13_Lock_waits.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json new file mode 120000 index 0000000..fb4537c --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_1_Node_performance_overview.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json new file mode 120000 index 0000000..49964f7 --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json 
b/postgres_ai_helm/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json new file mode 120000 index 0000000..f823f9a --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_3_Single_query_analysis.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json new file mode 120000 index 0000000..125919a --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_5_Backup_stats.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_5_Backup_stats.json new file mode 120000 index 0000000..b315232 --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_5_Backup_stats.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_5_Backup_stats.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_6_Replication_and_HA.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_6_Replication_and_HA.json new file mode 120000 index 0000000..79a448f --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_6_Replication_and_HA.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_6_Replication_and_HA.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json new file mode 120000 index 0000000..6d79eca --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_8_Table_Stats.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_8_Table_Stats.json new file mode 120000 index 0000000..f7ffca3 --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_8_Table_Stats.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_8_Table_Stats.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_9_Single_table_analysis.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_9_Single_table_analysis.json new file mode 120000 index 0000000..8d378b6 --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_9_Single_table_analysis.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_9_Single_table_analysis.json \ No newline at end of file diff --git a/postgres_ai_helm/config/grafana/dashboards/Self_Monitoring_Dashboard.json b/postgres_ai_helm/config/grafana/dashboards/Self_Monitoring_Dashboard.json new file mode 120000 index 0000000..fdb978a --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Self_Monitoring_Dashboard.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Self_Monitoring_Dashboard.json \ No newline at end of file diff --git a/postgres_ai_helm/config/metrics-postgres.yml b/postgres_ai_helm/config/metrics-postgres.yml new file mode 120000 index 0000000..351f016 --- /dev/null +++ 
b/postgres_ai_helm/config/metrics-postgres.yml @@ -0,0 +1 @@ +../../config/pgwatch-postgres/metrics.yml \ No newline at end of file diff --git a/postgres_ai_helm/config/metrics-prometheus.yml b/postgres_ai_helm/config/metrics-prometheus.yml new file mode 120000 index 0000000..7a44cad --- /dev/null +++ b/postgres_ai_helm/config/metrics-prometheus.yml @@ -0,0 +1 @@ +../../config/pgwatch-prometheus/metrics.yml \ No newline at end of file diff --git a/postgres_ai_helm/templates/_helpers.tpl b/postgres_ai_helm/templates/_helpers.tpl new file mode 100644 index 0000000..c1b3b97 --- /dev/null +++ b/postgres_ai_helm/templates/_helpers.tpl @@ -0,0 +1,193 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "postgres-ai-monitoring.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +Truncate to 45 chars to leave room for component suffixes (e.g. -victoriametrics) +*/}} +{{- define "postgres-ai-monitoring.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 45 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 45 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 45 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "postgres-ai-monitoring.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "postgres-ai-monitoring.labels" -}} +helm.sh/chart: {{ include "postgres-ai-monitoring.chart" . }} +{{ include "postgres-ai-monitoring.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- with .Values.commonLabels }} +{{ toYaml . }} +{{- end }} +{{- end }} + +{{/* +Build connection string from database configuration +Password is retrieved from secrets using passwordSecretKey +*/}} +{{- define "postgres-ai-monitoring.dbConnStr" -}} +{{- $db := .db -}} +{{- $root := .root -}} +{{- if $db.connStr }} +{{- $db.connStr }} +{{- else }} +{{- $host := $db.host | default "localhost" }} +{{- $port := $db.port | default 5432 }} +{{- $database := $db.database | default "postgres" }} +{{- $user := $db.user | default "postgres" }} +{{- $passwordKey := printf "db-password-%s" $db.passwordSecretKey }} +postgresql://{{ $user }}:$(DB_PASSWORD)@{{ $host }}:{{ $port }}/{{ $database }} +{{- end }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "postgres-ai-monitoring.selectorLabels" -}} +app.kubernetes.io/name: {{ include "postgres-ai-monitoring.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "postgres-ai-monitoring.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "postgres-ai-monitoring.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Namespace +*/}} +{{- define "postgres-ai-monitoring.namespace" -}} +{{- default .Release.Namespace .Values.namespaceOverride }} +{{- end }} + +{{/* +Determine effective cluster name with fallbacks. 
+*/}} +{{- define "postgres-ai-monitoring.effectiveClusterName" -}} +{{- if .Values.reporter.clusterName }} +{{- .Values.reporter.clusterName }} +{{- else if .Values.global.clusterName }} +{{- .Values.global.clusterName }} +{{- else }} +k8s-cluster +{{- end }} +{{- end }} + +{{/* +Determine effective node name with fallbacks. +*/}} +{{- define "postgres-ai-monitoring.effectiveNodeName" -}} +{{- if .Values.reporter.nodeName }} +{{- .Values.reporter.nodeName }} +{{- else if .Values.global.nodeName }} +{{- .Values.global.nodeName }} +{{- else }} +{{- "" }} +{{- end }} +{{- end }} + +{{/* +Get cluster name for a specific database with fallbacks. +*/}} +{{- define "postgres-ai-monitoring.databaseClusterName" -}} +{{- $db := .db -}} +{{- $root := .root -}} +{{- if $db.clusterName }} +{{- $db.clusterName }} +{{- else if $root.Values.reporter.clusterName }} +{{- $root.Values.reporter.clusterName }} +{{- else if $root.Values.global.clusterName }} +{{- $root.Values.global.clusterName }} +{{- else }} +{{- "k8s-cluster" }} +{{- end }} +{{- end }} + +{{/* +Get node name for a specific database with fallbacks. +*/}} +{{- define "postgres-ai-monitoring.databaseNodeName" -}} +{{- $db := .db -}} +{{- $root := .root -}} +{{- if $db.nodeName }} +{{- $db.nodeName }} +{{- else if $root.Values.reporter.nodeName }} +{{- $root.Values.reporter.nodeName }} +{{- else if $root.Values.global.nodeName }} +{{- $root.Values.global.nodeName }} +{{- else }} +{{- "" }} +{{- end }} +{{- end }} + +{{/* +Get unique cluster/node combinations from monitoredDatabases. +Returns a list of dicts with cluster and nodeName keys. +*/}} +{{- define "postgres-ai-monitoring.uniqueClusterNodeCombinations" -}} +{{- $root := . -}} +{{- $combinations := list -}} +{{- $seen := dict -}} +{{- range $db := .Values.monitoredDatabases }} + {{- $clusterName := include "postgres-ai-monitoring.databaseClusterName" (dict "db" $db "root" $root) | trim -}} + {{- $nodeName := include "postgres-ai-monitoring.databaseNodeName" (dict "db" $db "root" $root) | trim -}} + {{- $key := printf "%s|%s" $clusterName $nodeName -}} + {{- if not (hasKey $seen $key) }} + {{- $_ := set $seen $key true -}} + {{- $combinations = append $combinations (dict "cluster" $clusterName "nodeName" $nodeName) -}} + {{- end }} +{{- end }} +{{- if eq (len $combinations) 0 }} + {{- $clusterName := include "postgres-ai-monitoring.effectiveClusterName" $root | trim -}} + {{- $nodeName := include "postgres-ai-monitoring.effectiveNodeName" $root | trim -}} + {{- $combinations = append $combinations (dict "cluster" $clusterName "nodeName" $nodeName) -}} +{{- end }} +{{- $combinations | toJson }} +{{- end }} + +{{/* +Get the secret name to use. +Returns existingSecret.name if set, otherwise returns the default secret name. +*/}} +{{- define "postgres-ai-monitoring.secretName" -}} +{{- $existingSecretName := "" }} +{{- if .Values.existingSecret }} + {{- $existingSecretName = .Values.existingSecret.name | default "" }} +{{- end }} +{{- if and $existingSecretName (ne $existingSecretName "") }} +{{- $existingSecretName }} +{{- else }} +{{- printf "%s-secrets" (include "postgres-ai-monitoring.fullname" .) 
}} +{{- end }} +{{- end }} + + diff --git a/postgres_ai_helm/templates/cadvisor-daemonset.yaml b/postgres_ai_helm/templates/cadvisor-daemonset.yaml new file mode 100644 index 0000000..8b96b42 --- /dev/null +++ b/postgres_ai_helm/templates/cadvisor-daemonset.yaml @@ -0,0 +1,82 @@ +{{- if .Values.cadvisor.enabled }} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-cadvisor + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: cadvisor +spec: + selector: + matchLabels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: cadvisor + template: + metadata: + labels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: cadvisor + spec: + hostNetwork: true + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: cadvisor + image: {{ .Values.cadvisor.image }} + imagePullPolicy: IfNotPresent + args: + - --housekeeping_interval=30s + - --docker_only=false + - --disable_metrics=percpu,sched,tcp,udp,hugetlb,referenced_memory,cpu_topology,resctrl + - --store_container_labels=false + ports: + - name: http + containerPort: 8080 + protocol: TCP + volumeMounts: + - name: rootfs + mountPath: /rootfs + readOnly: true + - name: var-run + mountPath: /var/run + readOnly: true + - name: sys + mountPath: /sys + readOnly: true + - name: docker + mountPath: /var/lib/docker + readOnly: true + - name: disk + mountPath: /dev/disk + readOnly: true + {{- with .Values.cadvisor.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + securityContext: + privileged: true + volumes: + - name: rootfs + hostPath: + path: / + - name: var-run + hostPath: + path: /var/run + - name: sys + hostPath: + path: /sys + - name: docker + hostPath: + path: /var/lib/docker + - name: disk + hostPath: + path: /dev/disk + tolerations: + - effect: NoSchedule + operator: Exists +{{- end }} + + diff --git a/postgres_ai_helm/templates/cadvisor-service.yaml b/postgres_ai_helm/templates/cadvisor-service.yaml new file mode 100644 index 0000000..b2da9b7 --- /dev/null +++ b/postgres_ai_helm/templates/cadvisor-service.yaml @@ -0,0 +1,23 @@ +{{- if .Values.cadvisor.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-cadvisor + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: cadvisor +spec: + type: ClusterIP + clusterIP: None + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + selector: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: cadvisor +{{- end }} + + diff --git a/postgres_ai_helm/templates/flask-deployment.yaml b/postgres_ai_helm/templates/flask-deployment.yaml new file mode 100644 index 0000000..30f2f89 --- /dev/null +++ b/postgres_ai_helm/templates/flask-deployment.yaml @@ -0,0 +1,65 @@ +{{- if .Values.flask.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-flask + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . 
| nindent 4 }} + app.kubernetes.io/component: flask-api +spec: + replicas: {{ .Values.flask.replicas | default 2 }} + selector: + matchLabels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: flask-api + template: + metadata: + labels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: flask-api + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + initContainers: + - name: wait-for-victoriametrics + image: busybox:1.36 + command: ['sh', '-c', 'until nc -z {{ include "postgres-ai-monitoring.fullname" . }}-victoriametrics {{ .Values.victoriaMetrics.service.port }}; do echo waiting for victoriametrics; sleep 2; done'] + containers: + - name: flask + image: {{ .Values.flask.image }} + imagePullPolicy: {{ .Values.flask.imagePullPolicy | default "IfNotPresent" }} + env: + - name: FLASK_ENV + value: "production" + - name: PROMETHEUS_URL + value: "http://{{ include "postgres-ai-monitoring.fullname" . }}-victoriametrics:{{ .Values.victoriaMetrics.service.port }}" + {{- range $key, $value := .Values.flask.env }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + ports: + - name: http + containerPort: {{ .Values.flask.containerPort | default 8000 }} + protocol: TCP + {{- with .Values.flask.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + livenessProbe: + httpGet: + path: {{ .Values.flask.healthPath | default "/health" }} + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: {{ .Values.flask.healthPath | default "/health" }} + port: http + initialDelaySeconds: 5 + periodSeconds: 5 +{{- end }} + + diff --git a/postgres_ai_helm/templates/flask-service.yaml b/postgres_ai_helm/templates/flask-service.yaml new file mode 100644 index 0000000..225b6b5 --- /dev/null +++ b/postgres_ai_helm/templates/flask-service.yaml @@ -0,0 +1,22 @@ +{{- if .Values.flask.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-flask + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: flask-api +spec: + type: {{ .Values.flask.service.type | default "ClusterIP" }} + ports: + - port: {{ .Values.flask.service.port | default 8000 }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "postgres-ai-monitoring.selectorLabels" . 
| nindent 4 }} + app.kubernetes.io/component: flask-api +{{- end }} + + diff --git a/postgres_ai_helm/templates/grafana-dashboards.yaml b/postgres_ai_helm/templates/grafana-dashboards.yaml new file mode 100644 index 0000000..d04ba25 --- /dev/null +++ b/postgres_ai_helm/templates/grafana-dashboards.yaml @@ -0,0 +1,19 @@ +{{- if .Values.grafana.enabled }} +{{- $files := .Files }} +{{- range $path, $_ := .Files.Glob "config/grafana/dashboards/*.json" }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "postgres-ai-monitoring.fullname" $ }}-dashboard-{{ regexReplaceAll "[^a-z0-9-]" (lower (base $path | trimSuffix ".json")) "-" }} + namespace: {{ include "postgres-ai-monitoring.namespace" $ }} + labels: + {{- include "postgres-ai-monitoring.labels" $ | nindent 4 }} + app.kubernetes.io/component: grafana + grafana_dashboard: "1" +data: + {{ base $path | replace " " "-" }}: |- +{{ $files.Get $path | indent 4 }} +{{- end }} +{{- end }} + diff --git a/postgres_ai_helm/templates/grafana-datasources.yaml b/postgres_ai_helm/templates/grafana-datasources.yaml new file mode 100644 index 0000000..26f5272 --- /dev/null +++ b/postgres_ai_helm/templates/grafana-datasources.yaml @@ -0,0 +1,48 @@ +{{- if .Values.grafana.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }}-grafana-datasources + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: grafana + grafana_datasource: "1" +data: + datasources.yaml: | + apiVersion: 1 + datasources: + - name: VictoriaMetrics + type: prometheus + access: proxy + uid: P7A0D6631BB10B34F + url: http://{{ include "postgres-ai-monitoring.fullname" . }}-victoriametrics:{{ .Values.victoriaMetrics.service.port }} + isDefault: true + editable: true + - name: SinkPostgres + type: postgres + access: proxy + uid: P031DD592934B2F1F + url: {{ include "postgres-ai-monitoring.fullname" . }}-sink-postgres:5432 + user: {{ .Values.sinkPostgres.user }} + editable: true + jsonData: + database: {{ .Values.sinkPostgres.database }} + sslmode: disable + postgresVersion: 1500 + secureJsonData: + password: {{ .Values.secrets.postgres.password }} + {{- if .Values.flask.enabled }} + - name: Flask API + type: yesoreyeram-infinity-datasource + access: proxy + uid: aerffb0z8rjlsc + url: http://{{ include "postgres-ai-monitoring.fullname" . }}-flask:{{ .Values.flask.service.port }} + isDefault: false + editable: true + jsonData: + tlsSkipVerify: true + {{- end }} +{{- end }} + + diff --git a/postgres_ai_helm/templates/ingress.yaml b/postgres_ai_helm/templates/ingress.yaml new file mode 100644 index 0000000..43cbcdd --- /dev/null +++ b/postgres_ai_helm/templates/ingress.yaml @@ -0,0 +1,44 @@ +{{- if .Values.ingress.enabled }} +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-grafana + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: grafana + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- if .Values.ingress.hosts.grafana }} + - host: {{ .Values.ingress.hosts.grafana | quote }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: {{ .Release.Name }}-grafana + port: + number: 80 + {{- end }} +{{- end }} + + diff --git a/postgres_ai_helm/templates/node-exporter-daemonset.yaml b/postgres_ai_helm/templates/node-exporter-daemonset.yaml new file mode 100644 index 0000000..15ca695 --- /dev/null +++ b/postgres_ai_helm/templates/node-exporter-daemonset.yaml @@ -0,0 +1,56 @@ +{{- if .Values.nodeExporter.enabled }} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-node-exporter + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: node-exporter +spec: + selector: + matchLabels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: node-exporter + template: + metadata: + labels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: node-exporter + spec: + hostNetwork: true + hostPID: true + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: node-exporter + image: {{ .Values.nodeExporter.image }} + imagePullPolicy: IfNotPresent + args: + - --path.rootfs=/host + - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/) + ports: + - name: metrics + containerPort: 9100 + protocol: TCP + volumeMounts: + - name: root + mountPath: /host + readOnly: true + mountPropagation: HostToContainer + {{- with .Values.nodeExporter.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + - name: root + hostPath: + path: / + tolerations: + - effect: NoSchedule + operator: Exists +{{- end }} + + diff --git a/postgres_ai_helm/templates/node-exporter-service.yaml b/postgres_ai_helm/templates/node-exporter-service.yaml new file mode 100644 index 0000000..f10d17f --- /dev/null +++ b/postgres_ai_helm/templates/node-exporter-service.yaml @@ -0,0 +1,23 @@ +{{- if .Values.nodeExporter.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-node-exporter + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: node-exporter +spec: + type: ClusterIP + clusterIP: None + ports: + - port: 9100 + targetPort: metrics + protocol: TCP + name: metrics + selector: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: node-exporter +{{- end }} + + diff --git a/postgres_ai_helm/templates/pgwatch-config.yaml b/postgres_ai_helm/templates/pgwatch-config.yaml new file mode 100644 index 0000000..bab8ae5 --- /dev/null +++ b/postgres_ai_helm/templates/pgwatch-config.yaml @@ -0,0 +1,125 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-pgwatch-config + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} +data: + sources-postgres.yml: | + {{- $root := . 
-}} + {{- range $db := .Values.monitoredDatabases }} + - name: {{ $db.name }} + {{- if $db.connStr }} + conn_str: {{ $db.connStr }} + {{- else }} + # Password will be injected by init container from secrets + conn_str: postgresql://{{ $db.user }}:PASSWORD_PLACEHOLDER_{{ $db.passwordSecretKey }}@{{ $db.host }}:{{ $db.port | default 5432 }}/{{ $db.database }} + {{- end }} + preset_metrics: {{ $db.presetMetrics | default "full" }} + {{- if $db.customMetrics }} + custom_metrics: + {{- toYaml $db.customMetrics | nindent 8 }} + {{- end }} + is_enabled: {{ $db.isEnabled | default true }} + group: {{ $db.group | default "default" }} + {{- $tags := dict -}} + {{- with $root.Values.global.customTags }} + {{- range $key, $value := . }} + {{- $_ := set $tags $key $value }} + {{- end }} + {{- end }} + {{- with $root.Values.global.clusterName }} + {{- if ne . "" }} + {{- $_ := set $tags "cluster" . }} + {{- end }} + {{- end }} + {{- with $root.Values.global.nodeName }} + {{- if ne . "" }} + {{- $_ := set $tags "node_name" . }} + {{- end }} + {{- end }} + {{- with $db.clusterName }} + {{- if ne . "" }} + {{- $_ := set $tags "cluster" . }} + {{- end }} + {{- end }} + {{- with $db.nodeName }} + {{- if ne . "" }} + {{- $_ := set $tags "node_name" . }} + {{- end }} + {{- end }} + {{- with $db.customTags }} + {{- range $key, $value := . }} + {{- $_ := set $tags $key $value }} + {{- end }} + {{- end }} + custom_tags: + {{- if gt (len $tags) 0 }} +{{ toYaml $tags | indent 8 }} + {{- end }} + sink_type: postgresql + {{- end }} + + sources-prometheus.yml: | + {{- $root := . -}} + {{- range $db := .Values.monitoredDatabases }} + - name: {{ $db.name }} + {{- if $db.connStr }} + conn_str: {{ $db.connStr }} + {{- else }} + # Password will be injected by init container from secrets + conn_str: postgresql://{{ $db.user }}:PASSWORD_PLACEHOLDER_{{ $db.passwordSecretKey }}@{{ $db.host }}:{{ $db.port | default 5432 }}/{{ $db.database }} + {{- end }} + preset_metrics: {{ $db.presetMetrics | default "full" }} + {{- if $db.customMetrics }} + custom_metrics: + {{- toYaml $db.customMetrics | nindent 8 }} + {{- end }} + is_enabled: {{ $db.isEnabled | default true }} + group: {{ $db.group | default "default" }} + {{- $tags := dict -}} + {{- with $root.Values.global.customTags }} + {{- range $key, $value := . }} + {{- $_ := set $tags $key $value }} + {{- end }} + {{- end }} + {{- with $root.Values.global.clusterName }} + {{- if ne . "" }} + {{- $_ := set $tags "cluster" . }} + {{- end }} + {{- end }} + {{- with $root.Values.global.nodeName }} + {{- if ne . "" }} + {{- $_ := set $tags "node_name" . }} + {{- end }} + {{- end }} + {{- with $db.clusterName }} + {{- if ne . "" }} + {{- $_ := set $tags "cluster" . }} + {{- end }} + {{- end }} + {{- with $db.nodeName }} + {{- if ne . "" }} + {{- $_ := set $tags "node_name" . }} + {{- end }} + {{- end }} + {{- with $db.customTags }} + {{- range $key, $value := . 
}} + {{- $_ := set $tags $key $value }} + {{- end }} + {{- end }} + custom_tags: + {{- if gt (len $tags) 0 }} +{{ toYaml $tags | indent 8 }} + {{- end }} + sink_type: prometheus + {{- end }} + + metrics-postgres.yml: | +{{ .Files.Get "config/metrics-postgres.yml" | indent 4 }} + + metrics-prometheus.yml: | +{{ .Files.Get "config/metrics-prometheus.yml" | indent 4 }} + + diff --git a/postgres_ai_helm/templates/pgwatch-postgres-deployment.yaml b/postgres_ai_helm/templates/pgwatch-postgres-deployment.yaml new file mode 100644 index 0000000..3183113 --- /dev/null +++ b/postgres_ai_helm/templates/pgwatch-postgres-deployment.yaml @@ -0,0 +1,109 @@ +{{- if .Values.pgwatchPostgres.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-pgwatch-postgres + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: pgwatch-postgres +spec: + replicas: 1 + selector: + matchLabels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: pgwatch-postgres + template: + metadata: + annotations: + checksum/config: {{ include (print $.Template.BasePath "/pgwatch-config.yaml") . | sha256sum }} + labels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: pgwatch-postgres + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + initContainers: + - name: wait-for-sink + image: busybox:1.36 + command: ['sh', '-c', 'until nc -z {{ include "postgres-ai-monitoring.fullname" . }}-sink-postgres 5432; do echo waiting for postgres; sleep 2; done'] + - name: inject-passwords + image: busybox:1.36 + command: + - sh + - -c + - | + set -eu + # Copy sources file from configmap + cp /config/sources-postgres.yml /output/sources-postgres.yml + {{- range $db := .Values.monitoredDatabases }} + {{- if not $db.connStr }} + # Replace password placeholder for {{ $db.name }} + if [ -n "${DB_PASSWORD_{{ $db.passwordSecretKey | upper | replace "-" "_" }}:-}" ]; then + sed -i "s|PASSWORD_PLACEHOLDER_{{ $db.passwordSecretKey }}|${DB_PASSWORD_{{ $db.passwordSecretKey | upper | replace "-" "_" }}}|g" /output/sources-postgres.yml + fi + {{- end }} + {{- end }} + volumeMounts: + - name: config-template + mountPath: /config + - name: sources-config + mountPath: /output + env: + {{- range $db := .Values.monitoredDatabases }} + {{- if not $db.connStr }} + - name: DB_PASSWORD_{{ $db.passwordSecretKey | upper | replace "-" "_" }} + valueFrom: + secretKeyRef: + name: {{ include "postgres-ai-monitoring.secretName" $ }} + key: db-password-{{ $db.passwordSecretKey }} + {{- end }} + {{- end }} + containers: + - name: pgwatch + image: {{ .Values.pgwatchPostgres.image }} + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + exec /pgwatch/pgwatch \ + --sources=/etc/pgwatch/sources.yml \ + --metrics=/etc/pgwatch/metrics.yml \ + --sink=postgresql://{{ .Values.sinkPostgres.user }}:${POSTGRES_PASSWORD}@{{ include "postgres-ai-monitoring.fullname" . }}-sink-postgres:5432/{{ .Values.sinkPostgres.database }}?sslmode=disable \ + --web-addr=:8080 + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "postgres-ai-monitoring.secretName" . 
}} + key: postgres-password + ports: + - name: http + containerPort: 8080 + protocol: TCP + volumeMounts: + - name: sources-config + mountPath: /etc/pgwatch/sources.yml + subPath: sources-postgres.yml + - name: metrics-config + mountPath: /etc/pgwatch/metrics.yml + subPath: metrics-postgres.yml + {{- with .Values.pgwatchPostgres.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + - name: config-template + configMap: + name: {{ include "postgres-ai-monitoring.fullname" . }}-pgwatch-config + - name: sources-config + emptyDir: {} + - name: metrics-config + configMap: + name: {{ include "postgres-ai-monitoring.fullname" . }}-pgwatch-config +{{- end }} + + diff --git a/postgres_ai_helm/templates/pgwatch-postgres-service.yaml b/postgres_ai_helm/templates/pgwatch-postgres-service.yaml new file mode 100644 index 0000000..cfb5d2b --- /dev/null +++ b/postgres_ai_helm/templates/pgwatch-postgres-service.yaml @@ -0,0 +1,22 @@ +{{- if .Values.pgwatchPostgres.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-pgwatch-postgres + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: pgwatch-postgres +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + selector: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: pgwatch-postgres +{{- end }} + + diff --git a/postgres_ai_helm/templates/pgwatch-prometheus-deployment.yaml b/postgres_ai_helm/templates/pgwatch-prometheus-deployment.yaml new file mode 100644 index 0000000..ea92e08 --- /dev/null +++ b/postgres_ai_helm/templates/pgwatch-prometheus-deployment.yaml @@ -0,0 +1,100 @@ +{{- if .Values.pgwatchPrometheus.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-pgwatch-prometheus + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: pgwatch-prometheus +spec: + replicas: 1 + selector: + matchLabels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: pgwatch-prometheus + template: + metadata: + annotations: + checksum/config: {{ include (print $.Template.BasePath "/pgwatch-config.yaml") . | sha256sum }} + labels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: pgwatch-prometheus + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . 
| nindent 8 }} + {{- end }} + initContainers: + - name: inject-passwords + image: busybox:1.36 + command: + - sh + - -c + - | + set -eu + # Copy sources file from configmap + cp /config/sources-prometheus.yml /output/sources-prometheus.yml + {{- range $db := .Values.monitoredDatabases }} + {{- if not $db.connStr }} + # Replace password placeholder for {{ $db.name }} + if [ -n "${DB_PASSWORD_{{ $db.passwordSecretKey | upper | replace "-" "_" }}:-}" ]; then + sed -i "s|PASSWORD_PLACEHOLDER_{{ $db.passwordSecretKey }}|${DB_PASSWORD_{{ $db.passwordSecretKey | upper | replace "-" "_" }}}|g" /output/sources-prometheus.yml + fi + {{- end }} + {{- end }} + volumeMounts: + - name: config-template + mountPath: /config + - name: sources-config + mountPath: /output + env: + {{- range $db := .Values.monitoredDatabases }} + {{- if not $db.connStr }} + - name: DB_PASSWORD_{{ $db.passwordSecretKey | upper | replace "-" "_" }} + valueFrom: + secretKeyRef: + name: {{ include "postgres-ai-monitoring.secretName" $ }} + key: db-password-{{ $db.passwordSecretKey }} + {{- end }} + {{- end }} + containers: + - name: pgwatch + image: {{ .Values.pgwatchPrometheus.image }} + imagePullPolicy: IfNotPresent + command: + - /pgwatch/pgwatch + - --sources=/etc/pgwatch/sources.yml + - --metrics=/etc/pgwatch/metrics.yml + - --sink=prometheus://0.0.0.0:9091/pgwatch + - --web-addr=:8089 + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: metrics + containerPort: 9091 + protocol: TCP + volumeMounts: + - name: sources-config + mountPath: /etc/pgwatch/sources.yml + subPath: sources-prometheus.yml + - name: metrics-config + mountPath: /etc/pgwatch/metrics.yml + subPath: metrics-prometheus.yml + {{- with .Values.pgwatchPrometheus.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + - name: config-template + configMap: + name: {{ include "postgres-ai-monitoring.fullname" . }}-pgwatch-config + - name: sources-config + emptyDir: {} + - name: metrics-config + configMap: + name: {{ include "postgres-ai-monitoring.fullname" . }}-pgwatch-config +{{- end }} + + diff --git a/postgres_ai_helm/templates/pgwatch-prometheus-service.yaml b/postgres_ai_helm/templates/pgwatch-prometheus-service.yaml new file mode 100644 index 0000000..0c63955 --- /dev/null +++ b/postgres_ai_helm/templates/pgwatch-prometheus-service.yaml @@ -0,0 +1,26 @@ +{{- if .Values.pgwatchPrometheus.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-pgwatch-prometheus + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: pgwatch-prometheus +spec: + type: ClusterIP + ports: + - port: 8089 + targetPort: http + protocol: TCP + name: http + - port: 9091 + targetPort: metrics + protocol: TCP + name: metrics + selector: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: pgwatch-prometheus +{{- end }} + + diff --git a/postgres_ai_helm/templates/postgres-exporter-deployment.yaml b/postgres_ai_helm/templates/postgres-exporter-deployment.yaml new file mode 100644 index 0000000..300c15d --- /dev/null +++ b/postgres_ai_helm/templates/postgres-exporter-deployment.yaml @@ -0,0 +1,68 @@ +{{- if .Values.postgresExporter.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-postgres-exporter + namespace: {{ include "postgres-ai-monitoring.namespace" . 
}} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: postgres-exporter +spec: + replicas: 1 + selector: + matchLabels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: postgres-exporter + template: + metadata: + labels: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: postgres-exporter + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + initContainers: + - name: wait-for-sink + image: busybox:1.36 + command: ['sh', '-c', 'until nc -z {{ include "postgres-ai-monitoring.fullname" . }}-sink-postgres 5432; do echo waiting for postgres; sleep 2; done'] + containers: + - name: postgres-exporter + image: {{ .Values.postgresExporter.image }} + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + export DATA_SOURCE_NAME="postgresql://{{ .Values.sinkPostgres.user }}:${POSTGRES_PASSWORD}@{{ include "postgres-ai-monitoring.fullname" . }}-sink-postgres:5432/{{ .Values.sinkPostgres.database }}?sslmode=disable" + exec /bin/postgres_exporter + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "postgres-ai-monitoring.secretName" . }} + key: postgres-password + ports: + - name: metrics + containerPort: 9187 + protocol: TCP + {{- with .Values.postgresExporter.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + livenessProbe: + httpGet: + path: / + port: metrics + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: / + port: metrics + initialDelaySeconds: 5 + periodSeconds: 5 +{{- end }} + + diff --git a/postgres_ai_helm/templates/postgres-exporter-service.yaml b/postgres_ai_helm/templates/postgres-exporter-service.yaml new file mode 100644 index 0000000..d6afad2 --- /dev/null +++ b/postgres_ai_helm/templates/postgres-exporter-service.yaml @@ -0,0 +1,22 @@ +{{- if .Values.postgresExporter.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-postgres-exporter + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: postgres-exporter +spec: + type: ClusterIP + ports: + - port: 9187 + targetPort: metrics + protocol: TCP + name: metrics + selector: + {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: postgres-exporter +{{- end }} + + diff --git a/postgres_ai_helm/templates/prometheus-configmap.yaml b/postgres_ai_helm/templates/prometheus-configmap.yaml new file mode 100644 index 0000000..ba0b21a --- /dev/null +++ b/postgres_ai_helm/templates/prometheus-configmap.yaml @@ -0,0 +1,49 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-prometheus-config + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} +data: + prometheus.yml: | + global: + scrape_interval: {{ .Values.victoriaMetrics.scrapeInterval | default "15s" }} + evaluation_interval: 15s + + scrape_configs: + - job_name: 'pgwatch-prometheus' + static_configs: + - targets: ['{{ include "postgres-ai-monitoring.fullname" . }}-pgwatch-prometheus:9091'] + metrics_path: '/metrics' + + - job_name: 'postgres-exporter' + static_configs: + - targets: ['{{ include "postgres-ai-monitoring.fullname" . 
}}-postgres-exporter:9187'] + + - job_name: 'node-exporter' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - {{ include "postgres-ai-monitoring.namespace" . }} + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component] + action: keep + regex: node-exporter + + - job_name: 'cadvisor' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - {{ include "postgres-ai-monitoring.namespace" . }} + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component] + action: keep + regex: cadvisor + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + replacement: $1:8080 + + diff --git a/postgres_ai_helm/templates/rbac.yaml b/postgres_ai_helm/templates/rbac.yaml new file mode 100644 index 0000000..e5ebc27 --- /dev/null +++ b/postgres_ai_helm/templates/rbac.yaml @@ -0,0 +1,41 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-discovery + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: + - configmaps + verbs: ["get"] + - nonResourceURLs: + - /metrics + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-discovery + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "postgres-ai-monitoring.fullname" . }}-discovery +subjects: + - kind: ServiceAccount + name: {{ include "postgres-ai-monitoring.serviceAccountName" . }} + namespace: {{ include "postgres-ai-monitoring.namespace" . }} +{{- end }} + + diff --git a/postgres_ai_helm/templates/reporter-cronjob.yaml b/postgres_ai_helm/templates/reporter-cronjob.yaml new file mode 100644 index 0000000..da3557f --- /dev/null +++ b/postgres_ai_helm/templates/reporter-cronjob.yaml @@ -0,0 +1,172 @@ +{{- if .Values.reporter.enabled }} +{{- $root := . 
}} +{{- $seenCombinations := dict }} +{{- $combinations := list }} +{{- range $db := .Values.monitoredDatabases }} + {{- $clusterName := "" }} + {{- $nodeName := "" }} + {{- if $db.clusterName }} + {{- $clusterName = $db.clusterName }} + {{- else if $root.Values.reporter.clusterName }} + {{- $clusterName = $root.Values.reporter.clusterName }} + {{- else if $root.Values.global.clusterName }} + {{- $clusterName = $root.Values.global.clusterName }} + {{- else }} + {{- $clusterName = "k8s-cluster" }} + {{- end }} + {{- if $db.nodeName }} + {{- $nodeName = $db.nodeName }} + {{- else if $root.Values.reporter.nodeName }} + {{- $nodeName = $root.Values.reporter.nodeName }} + {{- else if $root.Values.global.nodeName }} + {{- $nodeName = $root.Values.global.nodeName }} + {{- end }} + {{- $key := printf "%s|%s" $clusterName $nodeName }} + {{- if not (hasKey $seenCombinations $key) }} + {{- $_ := set $seenCombinations $key true }} + {{- $combinations = append $combinations (dict "cluster" $clusterName "nodeName" $nodeName) }} + {{- end }} +{{- end }} +{{- if and (eq (len $combinations) 0) (or $root.Values.reporter.clusterName $root.Values.global.clusterName) }} + {{- $clusterName := "" }} + {{- $nodeName := "" }} + {{- if $root.Values.reporter.clusterName }} + {{- $clusterName = $root.Values.reporter.clusterName }} + {{- else if $root.Values.global.clusterName }} + {{- $clusterName = $root.Values.global.clusterName }} + {{- else }} + {{- $clusterName = "k8s-cluster" }} + {{- end }} + {{- if $root.Values.reporter.nodeName }} + {{- $nodeName = $root.Values.reporter.nodeName }} + {{- else if $root.Values.global.nodeName }} + {{- $nodeName = $root.Values.global.nodeName }} + {{- end }} + {{- $combinations = append $combinations (dict "cluster" $clusterName "nodeName" $nodeName) }} +{{- end }} +{{- range $idx, $combo := $combinations }} +{{- $clusterName := $combo.cluster }} +{{- $nodeName := $combo.nodeName }} +{{- $suffix := "" }} +{{- if gt (len $combinations) 1 }} + {{- $safeCluster := $clusterName | replace "_" "-" | replace "." "-" | lower | trunc 20 }} + {{- $safeNode := $nodeName | replace "_" "-" | replace "." "-" | lower | trunc 10 }} + {{- if $safeNode }} + {{- $suffix = printf "-%s-%s" $safeCluster $safeNode }} + {{- else }} + {{- $suffix = printf "-%s" $safeCluster }} + {{- end }} +{{- end }} +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ include "postgres-ai-monitoring.fullname" $root }}-reporter{{ $suffix }} + namespace: {{ include "postgres-ai-monitoring.namespace" $root }} + labels: + {{- include "postgres-ai-monitoring.labels" $root | nindent 4 }} + app.kubernetes.io/component: reporter + {{- if gt (len $combinations) 1 }} + postgres.ai/cluster: {{ $clusterName | quote }} + {{- if $nodeName }} + postgres.ai/node: {{ $nodeName | quote }} + {{- end }} + {{- end }} +spec: + schedule: {{ $root.Values.reporter.schedule | quote }} + successfulJobsHistoryLimit: {{ $root.Values.reporter.successfulJobsHistoryLimit | default 3 }} + failedJobsHistoryLimit: {{ $root.Values.reporter.failedJobsHistoryLimit | default 3 }} + jobTemplate: + spec: + template: + metadata: + labels: + {{- include "postgres-ai-monitoring.selectorLabels" $root | nindent 12 }} + app.kubernetes.io/component: reporter + {{- if gt (len $combinations) 1 }} + postgres.ai/cluster: {{ $clusterName | quote }} + {{- if $nodeName }} + postgres.ai/node: {{ $nodeName | quote }} + {{- end }} + {{- end }} + spec: + {{- with $root.Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . 
| nindent 12 }} + {{- end }} + restartPolicy: OnFailure + initContainers: + - name: wait-for-victoriametrics + image: busybox:1.36 + command: ['sh', '-c', 'until nc -z {{ include "postgres-ai-monitoring.fullname" $root }}-victoriametrics {{ $root.Values.victoriaMetrics.service.port }}; do echo waiting for victoriametrics; sleep 2; done'] + containers: + - name: reporter + image: {{ $root.Values.reporter.image }} + imagePullPolicy: {{ $root.Values.reporter.imagePullPolicy | default "IfNotPresent" }} + command: + - /bin/sh + - -c + - | + set -eu + if [ -n "${API_KEY:-}" ]; then + exec python postgres_reports.py \ + --prometheus-url "$PROMETHEUS_URL" \ + --postgres-sink-url "postgresql://{{ $root.Values.sinkPostgres.user }}:${POSTGRES_PASSWORD}@{{ include "postgres-ai-monitoring.fullname" $root }}-sink-postgres:5432/{{ $root.Values.sinkPostgres.database }}" \ + --cluster "$CLUSTER" \ + --node-name "$NODE_NAME" \ + --output /app/reports/report.json \ + --api-url "{{ $root.Values.reporter.apiUrl | default "https://postgres.ai/api/general" }}" \ + --project "{{ $root.Values.reporter.project | default "postgres-ai-monitoring" }}" \ + --token "$API_KEY" + else + exec python postgres_reports.py \ + --prometheus-url "$PROMETHEUS_URL" \ + --postgres-sink-url "postgresql://{{ $root.Values.sinkPostgres.user }}:${POSTGRES_PASSWORD}@{{ include "postgres-ai-monitoring.fullname" $root }}-sink-postgres:5432/{{ $root.Values.sinkPostgres.database }}" \ + --cluster "$CLUSTER" \ + --node-name "$NODE_NAME" \ + --output /app/reports/report.json \ + --no-upload + fi + env: + - name: PROMETHEUS_URL + value: "http://{{ include "postgres-ai-monitoring.fullname" $root }}-victoriametrics:{{ $root.Values.victoriaMetrics.service.port }}" + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "postgres-ai-monitoring.secretName" $root }} + key: postgres-password + - name: CLUSTER + value: {{ $clusterName | default "k8s-cluster" | quote }} + - name: NODE_NAME +{{- if $nodeName }} + value: {{ $nodeName | quote }} +{{- else }} + valueFrom: + fieldRef: + fieldPath: spec.nodeName +{{- end }} + {{- if $root.Values.secrets.pgwatchConfig }} + - name: API_KEY + valueFrom: + secretKeyRef: + name: {{ include "postgres-ai-monitoring.secretName" $root }} + key: pgai-api-key + optional: true + {{- end }} + + {{- range $key, $value := $root.Values.reporter.env }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- with $root.Values.reporter.resources }} + resources: + {{- toYaml . | nindent 16 }} + {{- end }} + volumeMounts: + - name: reports + mountPath: /app/reports + volumes: + - name: reports + emptyDir: {} +{{- end }} +{{- end }} diff --git a/postgres_ai_helm/templates/secret.yaml b/postgres_ai_helm/templates/secret.yaml new file mode 100644 index 0000000..cb6ab34 --- /dev/null +++ b/postgres_ai_helm/templates/secret.yaml @@ -0,0 +1,28 @@ +{{- $createFromValues := .Values.secrets.createFromValues | default false }} +{{- $existingSecretName := "" }} +{{- if .Values.existingSecret }} + {{- $existingSecretName = .Values.existingSecret.name | default "" }} +{{- end }} +{{- if and $createFromValues (or (not $existingSecretName) (eq $existingSecretName "")) }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "postgres-ai-monitoring.secretName" . }} + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . 
| nindent 4 }} +type: Opaque +stringData: + postgres-password: {{ .Values.secrets.postgres.password | quote }} + grafana-admin-user: {{ .Values.secrets.grafana.adminUser | quote }} + grafana-admin-password: {{ .Values.secrets.grafana.adminPassword | quote }} + {{- if .Values.secrets.pgwatchConfig }} + pgai-api-key: {{ .Values.secrets.pgwatchConfig.apiKey | default "" | quote }} + {{- end }} + {{- if .Values.secrets.monitoredDatabases }} + {{- range $key, $value := .Values.secrets.monitoredDatabases }} + db-password-{{ $key }}: {{ $value | quote }} + {{- end }} + {{- end }} +{{- end }} + diff --git a/postgres_ai_helm/templates/serviceaccount.yaml b/postgres_ai_helm/templates/serviceaccount.yaml new file mode 100644 index 0000000..e2b06a9 --- /dev/null +++ b/postgres_ai_helm/templates/serviceaccount.yaml @@ -0,0 +1,15 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "postgres-ai-monitoring.serviceAccountName" . }} + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} + + diff --git a/postgres_ai_helm/templates/sink-postgres-configmap.yaml b/postgres_ai_helm/templates/sink-postgres-configmap.yaml new file mode 100644 index 0000000..77982a6 --- /dev/null +++ b/postgres_ai_helm/templates/sink-postgres-configmap.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "postgres-ai-monitoring.fullname" . }}-sink-postgres-init + namespace: {{ include "postgres-ai-monitoring.namespace" . }} + labels: + {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} + app.kubernetes.io/component: sink-postgres +data: + 00-configure-pg-hba.sh: | + #!/bin/bash + set -e + # Configure pg_hba.conf to allow passwordless connections from within cluster + cat > "$PGDATA/pg_hba.conf" <-postgres-ai-monitoring-secrets (or -secrets) + # Set admin.existingSecret manually if using custom release name or fullnameOverride + admin: + existingSecret: "" # Set to: -postgres-ai-monitoring-secrets (or configure manually) + userKey: grafana-admin-user + passwordKey: grafana-admin-password + plugins: + - yesoreyeram-infinity-datasource + + + persistence: + enabled: true + size: 5Gi + storageClassName: "" + + service: + type: ClusterIP + port: 80 + + ingress: + enabled: false + + sidecar: + datasources: + enabled: true + label: grafana_datasource + dashboards: + enabled: true + label: grafana_dashboard + + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: default + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /tmp/dashboards \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..d56cd31 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,8 @@ +[pytest] +addopts = -ra --import-mode=importlib +pythonpath = . +testpaths = tests +markers = + unit: Marks fast unit tests that mock external services. + integration: Marks tests that talk to real services like PostgreSQL. + requires_postgres: Alias for tests needing a live Postgres instance. 
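Note: the pytest.ini above registers the unit / integration / requires_postgres markers but does not itself gate them; that is typically done in a conftest.py hook. A minimal sketch of such a hook, assuming an opt-in --run-integration flag (both the flag and the file shown here are illustrative and not part of this diff):

# conftest.py -- illustrative sketch only, not the repository's actual file
import pytest

def pytest_addoption(parser):
    # Hypothetical opt-in flag for tests marked "integration" or "requires_postgres".
    parser.addoption(
        "--run-integration",
        action="store_true",
        default=False,
        help="run tests that talk to real services such as PostgreSQL",
    )

def pytest_collection_modifyitems(config, items):
    if config.getoption("--run-integration"):
        return  # opt-in given: run everything, including integration tests
    skip_marker = pytest.mark.skip(reason="needs --run-integration")
    for item in items:
        if "integration" in item.keywords or "requires_postgres" in item.keywords:
            item.add_marker(skip_marker)

With a hook like this, plain `python -m pytest` runs only the fast unit tests, while `python -m pytest --run-integration` also runs the Postgres-backed ones.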
diff --git a/reporter/Dockerfile b/reporter/Dockerfile new file mode 100644 index 0000000..f9fa751 --- /dev/null +++ b/reporter/Dockerfile @@ -0,0 +1,30 @@ +FROM python:3.11-slim + +ARG VERSION +RUN test -n "${VERSION}" || (echo "VERSION build arg is required" && exit 1) + +LABEL org.opencontainers.image.title="PostgresAI Reporter" +LABEL org.opencontainers.image.description="Automated Postgres health check and monitoring reports" +LABEL org.opencontainers.image.vendor="PostgresAI" +LABEL org.opencontainers.image.source="https://github.com/PostgresAI/postgres-ai-monitoring" +LABEL org.opencontainers.image.version="${VERSION}" + +# Set working directory +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy reporter script +COPY postgres_reports.py . + +# Make script executable +RUN chmod +x postgres_reports.py + +# Create reports directory +RUN mkdir -p /app/reports + +# Default command +CMD ["python", "postgres_reports.py"] + diff --git a/reporter/__init__.py b/reporter/__init__.py new file mode 100644 index 0000000..9e176ed --- /dev/null +++ b/reporter/__init__.py @@ -0,0 +1 @@ +"""Reporter package exposing report generation utilities.""" diff --git a/reporter/postgres_reports.py b/reporter/postgres_reports.py index 50d500e..c003146 100644 --- a/reporter/postgres_reports.py +++ b/reporter/postgres_reports.py @@ -6,10 +6,12 @@ by querying Prometheus metrics using PromQL queries. """ +__version__ = "1.0.2" + import requests import json import time -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from typing import Dict, List, Any, Optional import argparse import sys @@ -19,8 +21,12 @@ class PostgresReportGenerator: + # Default databases to always exclude + DEFAULT_EXCLUDED_DATABASES = {'template0', 'template1', 'rdsadmin', 'azure_maintenance', 'cloudsqladmin'} + def __init__(self, prometheus_url: str = "http://sink-prometheus:9090", - postgres_sink_url: str = "postgresql://pgwatch@sink-postgres:5432/measurements"): + postgres_sink_url: str = "postgresql://pgwatch@sink-postgres:5432/measurements", + excluded_databases: Optional[List[str]] = None): """ Initialize the PostgreSQL report generator. @@ -28,11 +34,16 @@ def __init__(self, prometheus_url: str = "http://sink-prometheus:9090", prometheus_url: URL of the Prometheus instance (default: http://sink-prometheus:9090) postgres_sink_url: Connection string for the Postgres sink database (default: postgresql://pgwatch@sink-postgres:5432/measurements) + excluded_databases: Additional databases to exclude from reports """ self.prometheus_url = prometheus_url self.base_url = f"{prometheus_url}/api/v1" self.postgres_sink_url = postgres_sink_url self.pg_conn = None + # Combine default exclusions with user-provided exclusions + self.excluded_databases = self.DEFAULT_EXCLUDED_DATABASES.copy() + if excluded_databases: + self.excluded_databases.update(excluded_databases) def test_connection(self) -> bool: """Test connection to Prometheus.""" @@ -144,6 +155,62 @@ def query_instant(self, query: str) -> Dict[str, Any]: print(f"Query error: {e}") return {} + def _get_postgres_version_info(self, cluster: str, node_name: str) -> Dict[str, str]: + """ + Fetch and parse Postgres version information from pgwatch settings metrics. + + Notes: + - This helper is intentionally defensive: it validates the returned setting_name label + (tests may stub query responses broadly by metric name substring). 
+ - Uses a single query with a regex on setting_name to reduce roundtrips. + """ + query = ( + f'last_over_time(pgwatch_settings_configured{{' + f'cluster="{cluster}", node_name="{node_name}", ' + f'setting_name=~"server_version|server_version_num"}}[3h])' + ) + + result = self.query_instant(query) + version_str = None + version_num = None + + if result.get("status") == "success": + if result.get("data", {}).get("result"): + for item in result["data"]["result"]: + metric = item.get("metric", {}) or {} + setting_name = metric.get("setting_name", "") + setting_value = metric.get("setting_value", "") + if setting_name == "server_version" and setting_value: + version_str = setting_value + elif setting_name == "server_version_num" and setting_value: + version_num = setting_value + else: + print(f"Warning: No version data found (cluster={cluster}, node_name={node_name})") + else: + print(f"Warning: Version query failed (cluster={cluster}, node_name={node_name}): status={result.get('status')}") + + server_version = version_str or "Unknown" + version_info: Dict[str, str] = { + "version": server_version, + "server_version_num": version_num or "Unknown", + "server_major_ver": "Unknown", + "server_minor_ver": "Unknown", + } + + if server_version != "Unknown": + # Handle both formats: + # - "15.3" + # - "15.3 (Ubuntu 15.3-1.pgdg20.04+1)" + version_parts = server_version.split()[0].split(".") + if len(version_parts) >= 1 and version_parts[0]: + version_info["server_major_ver"] = version_parts[0] + if len(version_parts) >= 2: + version_info["server_minor_ver"] = ".".join(version_parts[1:]) + else: + version_info["server_minor_ver"] = "0" + + return version_info + def generate_a002_version_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: """ Generate A002 Version Information report. 
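For reference, a small standalone sketch of the version-string handling added in _get_postgres_version_info above; the sample inputs are illustrative values matching the two formats noted in its comments, not data from a real cluster:

# Illustrative sketch mirroring the major/minor split in _get_postgres_version_info.
def parse_server_version(server_version: str) -> dict:
    # Take the leading token ("15.3" from "15.3 (Ubuntu 15.3-1.pgdg20.04+1)"),
    # then split it on dots into major and minor components.
    parts = server_version.split()[0].split(".")
    major = parts[0] if parts and parts[0] else "Unknown"
    minor = ".".join(parts[1:]) if len(parts) >= 2 else "0"
    return {"server_major_ver": major, "server_minor_ver": minor}

# parse_server_version("15.3")                             -> {"server_major_ver": "15", "server_minor_ver": "3"}
# parse_server_version("15.3 (Ubuntu 15.3-1.pgdg20.04+1)") -> {"server_major_ver": "15", "server_minor_ver": "3"}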
@@ -155,35 +222,8 @@ def generate_a002_version_report(self, cluster: str = "local", node_name: str = Returns: Dictionary containing version information """ - print("Generating A002 Version Information report...") - settings_query = f'pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}' - # Query PostgreSQL version information - - version_queries = { - 'server_version': f'pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}", setting_name="server_version"}}', - 'server_version_num': f'pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}", setting_name="server_version_num"}}', - 'max_connections': f'pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}", setting_name="max_connections"}}', - 'shared_buffers': f'pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}", setting_name="shared_buffers"}}', - 'effective_cache_size': f'pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}", setting_name="effective_cache_size"}}', - } - - version_data = {} - for metric_name, query in version_queries.items(): - result = self.query_instant(query) - if result.get('status') == 'success' and result.get('data', {}).get('result'): - latest_value = result['data']['result'][0]['metric'].get('setting_value', None) - version_data[metric_name] = latest_value - - # Format the version data - version_info = { - "version": version_data.get('server_version', 'Unknown'), - "server_version_num": version_data.get('server_version_num', 'Unknown'), - "server_major_ver": version_data.get('server_version', '').split('.')[0] if version_data.get( - 'server_version') else 'Unknown', - "server_minor_ver": version_data.get('server_version', '').split('.', 1)[1] if version_data.get( - 'server_version') and '.' 
in version_data.get('server_version', '') else 'Unknown' - } - + print(f"Generating A002 Version Information report for cluster='{cluster}', node_name='{node_name}'...") + version_info = self._get_postgres_version_info(cluster, node_name) return self.format_report_data("A002", {"version": version_info}, node_name) def generate_a003_settings_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: @@ -199,17 +239,21 @@ def generate_a003_settings_report(self, cluster: str = "local", node_name: str = """ print("Generating A003 PostgreSQL Settings report...") - # Query all PostgreSQL settings using the pgwatch_settings_setting metric + # Query all PostgreSQL settings using the pgwatch_settings_configured metric with last_over_time # This metric has labels for each setting name - settings_query = f'pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}' + settings_query = f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}[3h])' result = self.query_instant(settings_query) settings_data = {} if result.get('status') == 'success' and result.get('data', {}).get('result'): for item in result['data']['result']: # Extract setting name from labels - setting_name = item['metric'].get('setting_name', 'unknown') + setting_name = item['metric'].get('setting_name', '') setting_value = item['metric'].get('setting_value', '') + + # Skip if we don't have a setting name + if not setting_name: + continue # Get additional metadata from labels category = item['metric'].get('category', 'Other') @@ -225,8 +269,12 @@ def generate_a003_settings_report(self, cluster: str = "local", node_name: str = "vartype": vartype, "pretty_value": self.format_setting_value(setting_name, setting_value, unit) } + else: + print(f"Warning: A003 - No settings data returned for cluster={cluster}, node_name={node_name}") + print(f"Query result status: {result.get('status')}") + print(f"Query result data: {result.get('data', {})}") - return self.format_report_data("A003", settings_data, node_name) + return self.format_report_data("A003", settings_data, node_name, postgres_version=self._get_postgres_version_info(cluster, node_name)) def generate_a004_cluster_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: """ @@ -243,16 +291,16 @@ def generate_a004_cluster_report(self, cluster: str = "local", node_name: str = # Query cluster information cluster_queries = { - 'active_connections': f'sum(pgwatch_pg_stat_activity_count{{cluster="{cluster}", node_name="{node_name}", state="active"}})', - 'idle_connections': f'sum(pgwatch_pg_stat_activity_count{{cluster="{cluster}", node_name="{node_name}", state="idle"}})', - 'total_connections': f'sum(pgwatch_pg_stat_activity_count{{cluster="{cluster}", node_name="{node_name}"}})', - 'database_size': f'sum(pgwatch_pg_database_size_bytes{{cluster="{cluster}", node_name="{node_name}"}})', - 'cache_hit_ratio': f'sum(pgwatch_db_stats_blks_hit{{cluster="{cluster}", node_name="{node_name}"}}) / (sum(pgwatch_db_stats_blks_hit{{cluster="{cluster}", node_name="{node_name}"}}) + sum(pgwatch_db_stats_blks_read{{cluster="{cluster}", node_name="{node_name}"}})) * 100', + 'active_connections': f'sum(last_over_time(pgwatch_pg_stat_activity_count{{cluster="{cluster}", node_name="{node_name}", state="active"}}[3h]))', + 'idle_connections': f'sum(last_over_time(pgwatch_pg_stat_activity_count{{cluster="{cluster}", node_name="{node_name}", state="idle"}}[3h]))', + 'total_connections': 
f'sum(last_over_time(pgwatch_pg_stat_activity_count{{cluster="{cluster}", node_name="{node_name}"}}[3h]))', + 'database_sizes': f'sum(last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h]))', + 'cache_hit_ratio': f'sum(last_over_time(pgwatch_db_stats_blks_hit{{cluster="{cluster}", node_name="{node_name}"}}[3h])) / clamp_min(sum(last_over_time(pgwatch_db_stats_blks_hit{{cluster="{cluster}", node_name="{node_name}"}}[3h])) + sum(last_over_time(pgwatch_db_stats_blks_read{{cluster="{cluster}", node_name="{node_name}"}}[3h])), 1) * 100', 'transactions_per_sec': f'sum(rate(pgwatch_db_stats_xact_commit{{cluster="{cluster}", node_name="{node_name}"}}[5m])) + sum(rate(pgwatch_db_stats_xact_rollback{{cluster="{cluster}", node_name="{node_name}"}}[5m]))', 'checkpoints_per_sec': f'sum(rate(pgwatch_pg_stat_bgwriter_checkpoints_timed{{cluster="{cluster}", node_name="{node_name}"}}[5m])) + sum(rate(pgwatch_pg_stat_bgwriter_checkpoints_req{{cluster="{cluster}", node_name="{node_name}"}}[5m]))', - 'deadlocks': f'sum(pgwatch_db_stats_deadlocks{{cluster="{cluster}", node_name="{node_name}"}})', - 'temp_files': f'sum(pgwatch_db_stats_temp_files{{cluster="{cluster}", node_name="{node_name}"}})', - 'temp_bytes': f'sum(pgwatch_db_stats_temp_bytes{{cluster="{cluster}", node_name="{node_name}"}})', + 'deadlocks': f'sum(last_over_time(pgwatch_db_stats_deadlocks{{cluster="{cluster}", node_name="{node_name}"}}[3h]))', + 'temp_files': f'sum(last_over_time(pgwatch_db_stats_temp_files{{cluster="{cluster}", node_name="{node_name}"}}[3h]))', + 'temp_bytes': f'sum(last_over_time(pgwatch_db_stats_temp_bytes{{cluster="{cluster}", node_name="{node_name}"}}[3h]))', } cluster_data = {} @@ -269,7 +317,7 @@ def generate_a004_cluster_report(self, cluster: str = "local", node_name: str = } # Get database sizes - db_sizes_query = f'pgwatch_pg_database_size_bytes{{cluster="{cluster}", node_name="{node_name}"}}' + db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' db_sizes_result = self.query_instant(db_sizes_query) database_sizes = {} @@ -279,10 +327,15 @@ def generate_a004_cluster_report(self, cluster: str = "local", node_name: str = size_bytes = float(result['value'][1]) database_sizes[db_name] = size_bytes - return self.format_report_data("A004", { - "general_info": cluster_data, - "database_sizes": database_sizes - }, node_name) + return self.format_report_data( + "A004", + { + "general_info": cluster_data, + "database_sizes": database_sizes, + }, + node_name, + postgres_version=self._get_postgres_version_info(cluster, node_name), + ) def generate_a007_altered_settings_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ str, Any]: @@ -298,22 +351,24 @@ def generate_a007_altered_settings_report(self, cluster: str = "local", node_nam """ print("Generating A007 Altered Settings report...") - # Query settings by source using the pgwatch_settings_setting metric - settings_by_source_query = f'pgwatch_settings_is_default{{cluster="{cluster}", node_name="{node_name}"}} < 1' + # Query settings by source using the pgwatch_settings_is_default metric with last_over_time + # This returns settings where is_default = 0 (i.e., non-default/altered settings) + settings_by_source_query = f'last_over_time(pgwatch_settings_is_default{{cluster="{cluster}", node_name="{node_name}"}}[3h]) < 1' result = self.query_instant(settings_by_source_query) - settings_count = {} - changes = [] - + altered_settings = {} if result.get('status') == 
'success' and result.get('data', {}).get('result'): - # Group settings by source - altered_settings = {} for item in result['data']['result']: - # Extract source from labels - setting_name = item['metric'].get('setting_name', 'unknown') - value = item['metric'].get('setting_value', 'unknown') + # Extract setting information from labels + setting_name = item['metric'].get('setting_name', '') + value = item['metric'].get('setting_value', '') unit = item['metric'].get('unit', '') - category = item['metric'].get('category', 'unknown') + category = item['metric'].get('category', 'Other') + + # Skip if we don't have a setting name + if not setting_name: + continue + pretty_value = self.format_setting_value(setting_name, value, unit) altered_settings[setting_name] = { "value": value, @@ -321,8 +376,11 @@ def generate_a007_altered_settings_report(self, cluster: str = "local", node_nam "category": category, "pretty_value": pretty_value } + else: + print(f"Warning: A007 - No altered settings data returned for cluster={cluster}, node_name={node_name}") + print(f"Query result status: {result.get('status')}") - return self.format_report_data("A007", altered_settings, node_name) + return self.format_report_data("A007", altered_settings, node_name, postgres_version=self._get_postgres_version_info(cluster, node_name)) def generate_h001_invalid_indexes_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ str, Any]: @@ -341,10 +399,21 @@ def generate_h001_invalid_indexes_report(self, cluster: str = "local", node_name # Get all databases databases = self.get_all_databases(cluster, node_name) + # Get database sizes + db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' + db_sizes_result = self.query_instant(db_sizes_query) + database_sizes = {} + + if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): + for result in db_sizes_result['data']['result']: + db_name = result['metric'].get('datname', 'unknown') + size_bytes = float(result['value'][1]) + database_sizes[db_name] = size_bytes + invalid_indexes_by_db = {} for db_name in databases: # Query invalid indexes for each database - invalid_indexes_query = f'pgwatch_pg_invalid_indexes{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}' + invalid_indexes_query = f'last_over_time(pgwatch_pg_invalid_indexes{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])' result = self.query_instant(invalid_indexes_query) invalid_indexes = [] @@ -375,14 +444,22 @@ def generate_h001_invalid_indexes_report(self, cluster: str = "local", node_name invalid_indexes.append(invalid_index) total_size += index_size_bytes + db_size_bytes = database_sizes.get(db_name, 0) invalid_indexes_by_db[db_name] = { "invalid_indexes": invalid_indexes, "total_count": len(invalid_indexes), "total_size_bytes": total_size, - "total_size_pretty": self.format_bytes(total_size) + "total_size_pretty": self.format_bytes(total_size), + "database_size_bytes": db_size_bytes, + "database_size_pretty": self.format_bytes(db_size_bytes) } - return self.format_report_data("H001", invalid_indexes_by_db, node_name) + return self.format_report_data( + "H001", + invalid_indexes_by_db, + node_name, + postgres_version=self._get_postgres_version_info(cluster, node_name), + ) def generate_h002_unused_indexes_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: """ @@ -400,8 +477,19 @@ def generate_h002_unused_indexes_report(self, 
cluster: str = "local", node_name: # Get all databases databases = self.get_all_databases(cluster, node_name) + # Get database sizes + db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' + db_sizes_result = self.query_instant(db_sizes_query) + database_sizes = {} + + if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): + for result in db_sizes_result['data']['result']: + db_name = result['metric'].get('datname', 'unknown') + size_bytes = float(result['value'][1]) + database_sizes[db_name] = size_bytes + # Query postmaster uptime to get startup time - postmaster_uptime_query = f'last_over_time(pgwatch_db_stats_postmaster_uptime_s{{cluster="{cluster}", node_name="{node_name}"}}[10h])' + postmaster_uptime_query = f'last_over_time(pgwatch_db_stats_postmaster_uptime_s{{cluster="{cluster}", node_name="{node_name}"}}[3h])' postmaster_uptime_result = self.query_instant(postmaster_uptime_query) postmaster_startup_time = None @@ -417,7 +505,7 @@ def generate_h002_unused_indexes_report(self, cluster: str = "local", node_name: # Get index definitions from Postgres sink database for this specific database index_definitions = self.get_index_definitions_from_sink(db_name) # Query stats_reset timestamp for this database - stats_reset_query = f'last_over_time(pgwatch_stats_reset_stats_reset_epoch{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}"}}[10h])' + stats_reset_query = f'last_over_time(pgwatch_stats_reset_stats_reset_epoch{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}"}}[3h])' stats_reset_result = self.query_instant(stats_reset_query) stats_reset_epoch = None @@ -431,7 +519,7 @@ def generate_h002_unused_indexes_report(self, cluster: str = "local", node_name: days_since_reset = (datetime.now() - datetime.fromtimestamp(stats_reset_epoch)).days # Query unused indexes for each database using last_over_time to get most recent value - unused_indexes_query = f'last_over_time(pgwatch_unused_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}"}}[10h])' + unused_indexes_query = f'last_over_time(pgwatch_unused_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}"}}[3h])' unused_result = self.query_instant(unused_indexes_query) unused_indexes = [] @@ -446,7 +534,7 @@ def generate_h002_unused_indexes_report(self, cluster: str = "local", node_name: index_size_bytes = float(item['value'][1]) if item.get('value') else 0 # Query other related metrics for this index - idx_scan_query = f'last_over_time(pgwatch_unused_indexes_idx_scan{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[10h])' + idx_scan_query = f'last_over_time(pgwatch_unused_indexes_idx_scan{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[3h])' idx_scan_result = self.query_instant(idx_scan_query) idx_scan = float(idx_scan_result['data']['result'][0]['value'][1]) if idx_scan_result.get('data', {}).get( @@ -476,11 +564,14 @@ def generate_h002_unused_indexes_report(self, cluster: str = "local", node_name: total_unused_size = sum(idx['index_size_bytes'] for idx in unused_indexes) + db_size_bytes = database_sizes.get(db_name, 0) unused_indexes_by_db[db_name] = { "unused_indexes": unused_indexes, "total_count": len(unused_indexes), 
"total_size_bytes": total_unused_size, "total_size_pretty": self.format_bytes(total_unused_size), + "database_size_bytes": db_size_bytes, + "database_size_pretty": self.format_bytes(db_size_bytes), "stats_reset": { "stats_reset_epoch": stats_reset_epoch, "stats_reset_time": stats_reset_time, @@ -490,7 +581,12 @@ def generate_h002_unused_indexes_report(self, cluster: str = "local", node_name: } } - return self.format_report_data("H002", unused_indexes_by_db, node_name) + return self.format_report_data( + "H002", + unused_indexes_by_db, + node_name, + postgres_version=self._get_postgres_version_info(cluster, node_name), + ) def generate_h004_redundant_indexes_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ str, Any]: @@ -509,10 +605,23 @@ def generate_h004_redundant_indexes_report(self, cluster: str = "local", node_na # Get all databases databases = self.get_all_databases(cluster, node_name) + # Get database sizes + db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' + db_sizes_result = self.query_instant(db_sizes_query) + database_sizes = {} + + if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): + for result in db_sizes_result['data']['result']: + db_name = result['metric'].get('datname', 'unknown') + size_bytes = float(result['value'][1]) + database_sizes[db_name] = size_bytes + redundant_indexes_by_db = {} for db_name in databases: + # Fetch index definitions from the sink for this database (used to aid remediation) + index_definitions = self.get_index_definitions_from_sink(db_name) # Query redundant indexes for each database using last_over_time to get most recent value - redundant_indexes_query = f'last_over_time(pgwatch_redundant_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}"}}[10h])' + redundant_indexes_query = f'last_over_time(pgwatch_redundant_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}"}}[3h])' result = self.query_instant(redundant_indexes_query) redundant_indexes = [] @@ -531,18 +640,18 @@ def generate_h004_redundant_indexes_report(self, cluster: str = "local", node_na index_size_bytes = float(item['value'][1]) if item.get('value') else 0 # Query other related metrics for this index - table_size_query = f'last_over_time(pgwatch_redundant_indexes_table_size_bytes{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[10h])' + table_size_query = f'last_over_time(pgwatch_redundant_indexes_table_size_bytes{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[3h])' table_size_result = self.query_instant(table_size_query) table_size_bytes = float( table_size_result['data']['result'][0]['value'][1]) if table_size_result.get('data', {}).get( 'result') else 0 - index_usage_query = f'last_over_time(pgwatch_redundant_indexes_index_usage{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[10h])' + index_usage_query = f'last_over_time(pgwatch_redundant_indexes_index_usage{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[3h])' index_usage_result = self.query_instant(index_usage_query) 
index_usage = float(index_usage_result['data']['result'][0]['value'][1]) if index_usage_result.get( 'data', {}).get('result') else 0 - supports_fk_query = f'last_over_time(pgwatch_redundant_indexes_supports_fk{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[10h])' + supports_fk_query = f'last_over_time(pgwatch_redundant_indexes_supports_fk{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[3h])' supports_fk_result = self.query_instant(supports_fk_query) supports_fk = bool( int(supports_fk_result['data']['result'][0]['value'][1])) if supports_fk_result.get('data', @@ -560,6 +669,7 @@ def generate_h004_redundant_indexes_report(self, cluster: str = "local", node_na "table_size_bytes": table_size_bytes, "index_usage": index_usage, "supports_fk": supports_fk, + "index_definition": index_definitions.get(index_name, 'Definition not available'), "index_size_pretty": self.format_bytes(index_size_bytes), "table_size_pretty": self.format_bytes(table_size_bytes) } @@ -570,14 +680,22 @@ def generate_h004_redundant_indexes_report(self, cluster: str = "local", node_na # Sort by index size descending redundant_indexes.sort(key=lambda x: x['index_size_bytes'], reverse=True) + db_size_bytes = database_sizes.get(db_name, 0) redundant_indexes_by_db[db_name] = { "redundant_indexes": redundant_indexes, "total_count": len(redundant_indexes), "total_size_bytes": total_size, - "total_size_pretty": self.format_bytes(total_size) + "total_size_pretty": self.format_bytes(total_size), + "database_size_bytes": db_size_bytes, + "database_size_pretty": self.format_bytes(db_size_bytes) } - return self.format_report_data("H004", redundant_indexes_by_db, node_name) + return self.format_report_data( + "H004", + redundant_indexes_by_db, + node_name, + postgres_version=self._get_postgres_version_info(cluster, node_name), + ) def generate_d004_pgstat_settings_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ str, Any]: @@ -608,17 +726,21 @@ def generate_d004_pgstat_settings_report(self, cluster: str = "local", node_name 'track_wal_io_timing' ] - # Query all PostgreSQL settings for pg_stat_statements and related - settings_query = f'pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}' + # Query all PostgreSQL settings for pg_stat_statements and related using last_over_time + settings_query = f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}[3h])' result = self.query_instant(settings_query) pgstat_data = {} if result.get('status') == 'success' and result.get('data', {}).get('result'): for item in result['data']['result']: - setting_name = item['metric'].get('setting_name', 'unknown') + setting_name = item['metric'].get('setting_name', '') + + # Skip if no setting name + if not setting_name: + continue # Filter for pg_stat_statements and related settings - if any(pgstat_setting in setting_name for pgstat_setting in pgstat_settings): + if setting_name in pgstat_settings: setting_value = item['metric'].get('setting_value', '') category = item['metric'].get('category', 'Statistics') unit = item['metric'].get('unit', '') @@ -633,6 +755,8 @@ def generate_d004_pgstat_settings_report(self, cluster: str = "local", node_name "vartype": vartype, "pretty_value": self.format_setting_value(setting_name, setting_value, unit) } + else: + print(f"Warning: 
D004 - No settings data returned for cluster={cluster}, node_name={node_name}") # Check if pg_stat_kcache extension is available and working by querying its metrics kcache_status = self._check_pg_stat_kcache_status(cluster, node_name) @@ -640,11 +764,16 @@ def generate_d004_pgstat_settings_report(self, cluster: str = "local", node_name # Check if pg_stat_statements is available and working by querying its metrics pgss_status = self._check_pg_stat_statements_status(cluster, node_name) - return self.format_report_data("D004", { - "settings": pgstat_data, - "pg_stat_statements_status": pgss_status, - "pg_stat_kcache_status": kcache_status - }, node_name) + return self.format_report_data( + "D004", + { + "settings": pgstat_data, + "pg_stat_statements_status": pgss_status, + "pg_stat_kcache_status": kcache_status, + }, + node_name, + postgres_version=self._get_postgres_version_info(cluster, node_name), + ) def _check_pg_stat_kcache_status(self, cluster: str, node_name: str) -> Dict[str, Any]: """ @@ -658,9 +787,9 @@ def _check_pg_stat_kcache_status(self, cluster: str, node_name: str) -> Dict[str Dictionary containing pg_stat_kcache status information """ kcache_queries = { - 'exec_user_time': f'pgwatch_pg_stat_kcache_exec_user_time{{cluster="{cluster}", node_name="{node_name}"}}', - 'exec_system_time': f'pgwatch_pg_stat_kcache_exec_system_time{{cluster="{cluster}", node_name="{node_name}"}}', - 'exec_total_time': f'pgwatch_pg_stat_kcache_exec_total_time{{cluster="{cluster}", node_name="{node_name}"}}' + 'exec_user_time': f'last_over_time(pgwatch_pg_stat_kcache_exec_user_time{{cluster="{cluster}", node_name="{node_name}"}}[3h])', + 'exec_system_time': f'last_over_time(pgwatch_pg_stat_kcache_exec_system_time{{cluster="{cluster}", node_name="{node_name}"}}[3h])', + 'exec_total_time': f'last_over_time(pgwatch_pg_stat_kcache_exec_total_time{{cluster="{cluster}", node_name="{node_name}"}}[3h])' } kcache_status = { @@ -713,7 +842,7 @@ def _check_pg_stat_statements_status(self, cluster: str, node_name: str) -> Dict Returns: Dictionary containing pg_stat_statements status information """ - pgss_query = f'pgwatch_pg_stat_statements_calls{{cluster="{cluster}", node_name="{node_name}"}}' + pgss_query = f'last_over_time(pgwatch_pg_stat_statements_calls{{cluster="{cluster}", node_name="{node_name}"}}[3h])' result = self.query_instant(pgss_query) pgss_status = { @@ -772,6 +901,7 @@ def generate_f001_autovacuum_settings_report(self, cluster: str = "local", node_ 'autovacuum_naptime', 'autovacuum_vacuum_cost_delay', 'autovacuum_vacuum_cost_limit', + 'autovacuum_vacuum_insert_scale_factor', 'autovacuum_vacuum_scale_factor', 'autovacuum_vacuum_threshold', 'autovacuum_work_mem', @@ -786,8 +916,8 @@ def generate_f001_autovacuum_settings_report(self, cluster: str = "local", node_ 'vacuum_multixact_freeze_table_age' ] - # Query all PostgreSQL settings for autovacuum - settings_query = f'pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}' + # Query all PostgreSQL settings for autovacuum using last_over_time + settings_query = f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}[3h])' result = self.query_instant(settings_query) autovacuum_data = {} @@ -812,7 +942,7 @@ def generate_f001_autovacuum_settings_report(self, cluster: str = "local", node_ "pretty_value": self.format_setting_value(setting_name, setting_value, unit) } - return self.format_report_data("F001", autovacuum_data, node_name) + return self.format_report_data("F001", autovacuum_data, 
node_name, postgres_version=self._get_postgres_version_info(cluster, node_name)) def generate_f005_btree_bloat_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: """ @@ -830,14 +960,25 @@ def generate_f005_btree_bloat_report(self, cluster: str = "local", node_name: st # Get all databases databases = self.get_all_databases(cluster, node_name) + # Get database sizes + db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' + db_sizes_result = self.query_instant(db_sizes_query) + database_sizes = {} + + if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): + for result in db_sizes_result['data']['result']: + db_name = result['metric'].get('datname', 'unknown') + size_bytes = float(result['value'][1]) + database_sizes[db_name] = size_bytes + bloated_indexes_by_db = {} for db_name in databases: # Query btree bloat using multiple metrics for each database with last_over_time [1d] bloat_queries = { - 'extra_size': f'last_over_time(pgwatch_pg_btree_bloat_extra_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[1d])', - 'extra_pct': f'last_over_time(pgwatch_pg_btree_bloat_extra_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[1d])', - 'bloat_size': f'last_over_time(pgwatch_pg_btree_bloat_bloat_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[1d])', - 'bloat_pct': f'last_over_time(pgwatch_pg_btree_bloat_bloat_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[1d])', + 'extra_size': f'last_over_time(pgwatch_pg_btree_bloat_extra_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', + 'extra_pct': f'last_over_time(pgwatch_pg_btree_bloat_extra_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', + 'bloat_size': f'last_over_time(pgwatch_pg_btree_bloat_bloat_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', + 'bloat_pct': f'last_over_time(pgwatch_pg_btree_bloat_bloat_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', } bloated_indexes = {} @@ -880,14 +1021,22 @@ def generate_f005_btree_bloat_report(self, cluster: str = "local", node_name: st # Sort by bloat percentage descending bloated_indexes_list.sort(key=lambda x: x['bloat_pct'], reverse=True) + db_size_bytes = database_sizes.get(db_name, 0) bloated_indexes_by_db[db_name] = { "bloated_indexes": bloated_indexes_list, "total_count": len(bloated_indexes_list), "total_bloat_size_bytes": total_bloat_size, - "total_bloat_size_pretty": self.format_bytes(total_bloat_size) + "total_bloat_size_pretty": self.format_bytes(total_bloat_size), + "database_size_bytes": db_size_bytes, + "database_size_pretty": self.format_bytes(db_size_bytes) } - return self.format_report_data("F005", bloated_indexes_by_db, node_name) + return self.format_report_data( + "F005", + bloated_indexes_by_db, + node_name, + postgres_version=self._get_postgres_version_info(cluster, node_name), + ) def generate_g001_memory_settings_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ str, Any]: @@ -930,14 +1079,18 @@ def generate_g001_memory_settings_report(self, cluster: str = "local", node_name 'max_stack_depth' ] - # Query all PostgreSQL settings for memory-related settings - settings_query = f'pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}' + # Query all PostgreSQL settings for memory-related settings using 
last_over_time + settings_query = f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}[3h])' result = self.query_instant(settings_query) memory_data = {} if result.get('status') == 'success' and result.get('data', {}).get('result'): for item in result['data']['result']: - setting_name = item['metric'].get('setting_name', 'unknown') + setting_name = item['metric'].get('setting_name', '') + + # Skip if no setting name + if not setting_name: + continue # Filter for memory-related settings if setting_name in memory_settings: @@ -955,14 +1108,21 @@ def generate_g001_memory_settings_report(self, cluster: str = "local", node_name "vartype": vartype, "pretty_value": self.format_setting_value(setting_name, setting_value, unit) } + else: + print(f"Warning: G001 - No settings data returned for cluster={cluster}, node_name={node_name}") # Calculate some memory usage estimates and recommendations memory_analysis = self._analyze_memory_settings(memory_data) - return self.format_report_data("G001", { - "settings": memory_data, - "analysis": memory_analysis - }, node_name) + return self.format_report_data( + "G001", + { + "settings": memory_data, + "analysis": memory_analysis, + }, + node_name, + postgres_version=self._get_postgres_version_info(cluster, node_name), + ) def _analyze_memory_settings(self, memory_data: Dict[str, Any]) -> Dict[str, Any]: """ @@ -1068,16 +1228,31 @@ def generate_f004_heap_bloat_report(self, cluster: str = "local", node_name: str # Get all databases databases = self.get_all_databases(cluster, node_name) + + if not databases: + print("Warning: F004 - No databases found") + + # Get database sizes + db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' + db_sizes_result = self.query_instant(db_sizes_query) + database_sizes = {} + + if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): + for result in db_sizes_result['data']['result']: + db_name = result['metric'].get('datname', 'unknown') + size_bytes = float(result['value'][1]) + database_sizes[db_name] = size_bytes bloated_tables_by_db = {} for db_name in databases: # Query table bloat using multiple metrics for each database + # Try with 10h window first, then fall back to instant query bloat_queries = { - 'real_size': f'last_over_time(pgwatch_pg_table_bloat_real_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[1d])', - 'extra_size': f'last_over_time(pgwatch_pg_table_bloat_extra_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[1d])', - 'extra_pct': f'last_over_time(pgwatch_pg_table_bloat_extra_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[1d])', - 'bloat_size': f'last_over_time(pgwatch_pg_table_bloat_bloat_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[1d])', - 'bloat_pct': f'last_over_time(pgwatch_pg_table_bloat_bloat_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[1d])', + 'real_size': f'last_over_time(pgwatch_pg_table_bloat_real_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', + 'extra_size': f'last_over_time(pgwatch_pg_table_bloat_extra_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', + 'extra_pct': f'last_over_time(pgwatch_pg_table_bloat_extra_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', + 'bloat_size': 
f'last_over_time(pgwatch_pg_table_bloat_bloat_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', + 'bloat_pct': f'last_over_time(pgwatch_pg_table_bloat_bloat_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', } bloated_tables = {} @@ -1103,6 +1278,9 @@ def generate_f004_heap_bloat_report(self, cluster: str = "local", node_name: str value = float(item['value'][1]) if item.get('value') else 0 bloated_tables[table_key][metric_type] = value + else: + if metric_type == 'real_size': # Only log once per database + print(f"Warning: F004 - No bloat data for database {db_name}, metric {metric_type}") # Convert to list and add pretty formatting bloated_tables_list = [] @@ -1119,14 +1297,22 @@ def generate_f004_heap_bloat_report(self, cluster: str = "local", node_name: str # Sort by bloat percentage descending bloated_tables_list.sort(key=lambda x: x['bloat_pct'], reverse=True) + db_size_bytes = database_sizes.get(db_name, 0) bloated_tables_by_db[db_name] = { "bloated_tables": bloated_tables_list, "total_count": len(bloated_tables_list), "total_bloat_size_bytes": total_bloat_size, - "total_bloat_size_pretty": self.format_bytes(total_bloat_size) + "total_bloat_size_pretty": self.format_bytes(total_bloat_size), + "database_size_bytes": db_size_bytes, + "database_size_pretty": self.format_bytes(db_size_bytes) } - return self.format_report_data("F004", bloated_tables_by_db, node_name) + return self.format_report_data( + "F004", + bloated_tables_by_db, + node_name, + postgres_version=self._get_postgres_version_info(cluster, node_name), + ) def generate_k001_query_calls_report(self, cluster: str = "local", node_name: str = "node-01", time_range_minutes: int = 60) -> Dict[str, Any]: @@ -1145,6 +1331,9 @@ def generate_k001_query_calls_report(self, cluster: str = "local", node_name: st # Get all databases databases = self.get_all_databases(cluster, node_name) + + if not databases: + print("Warning: K001 - No databases found") # Calculate time range end_time = datetime.now() @@ -1152,9 +1341,13 @@ def generate_k001_query_calls_report(self, cluster: str = "local", node_name: st queries_by_db = {} for db_name in databases: + print(f"K001: Processing database {db_name}...") # Get pg_stat_statements metrics for this database query_metrics = self._get_pgss_metrics_data_by_db(cluster, node_name, db_name, start_time, end_time) + if not query_metrics: + print(f"Warning: K001 - No query metrics returned for database {db_name}") + # Sort by calls (descending) sorted_metrics = sorted(query_metrics, key=lambda x: x.get('calls', 0), reverse=True) @@ -1176,7 +1369,12 @@ def generate_k001_query_calls_report(self, cluster: str = "local", node_name: st } } - return self.format_report_data("K001", queries_by_db, node_name) + return self.format_report_data( + "K001", + queries_by_db, + node_name, + postgres_version=self._get_postgres_version_info(cluster, node_name), + ) def generate_k003_top_queries_report(self, cluster: str = "local", node_name: str = "node-01", time_range_minutes: int = 60, limit: int = 50) -> Dict[str, Any]: @@ -1196,6 +1394,9 @@ def generate_k003_top_queries_report(self, cluster: str = "local", node_name: st # Get all databases databases = self.get_all_databases(cluster, node_name) + + if not databases: + print("Warning: K003 - No databases found") # Calculate time range end_time = datetime.now() @@ -1203,9 +1404,13 @@ def generate_k003_top_queries_report(self, cluster: str = "local", node_name: st queries_by_db = {} for db_name in databases: + 
print(f"K003: Processing database {db_name}...") # Get pg_stat_statements metrics for this database query_metrics = self._get_pgss_metrics_data_by_db(cluster, node_name, db_name, start_time, end_time) + if not query_metrics: + print(f"Warning: K003 - No query metrics returned for database {db_name}") + # Sort by total_time (descending) and limit to top N per database sorted_metrics = sorted(query_metrics, key=lambda x: x.get('total_time', 0), reverse=True)[:limit] @@ -1228,13 +1433,18 @@ def generate_k003_top_queries_report(self, cluster: str = "local", node_name: st } } - return self.format_report_data("K003", queries_by_db, node_name) + return self.format_report_data( + "K003", + queries_by_db, + node_name, + postgres_version=self._get_postgres_version_info(cluster, node_name), + ) def _get_pgss_metrics_data(self, cluster: str, node_name: str, start_time: datetime, end_time: datetime) -> List[ Dict[str, Any]]: """ Get pg_stat_statements metrics data between two time points. - Adapted from the logic in flask-backend/app.py get_pgss_metrics_csv(). + Adapted from the logic in monitoring_flask_backend/app.py get_pgss_metrics_csv(). Args: cluster: Cluster name @@ -1341,7 +1551,7 @@ def _process_pgss_data(self, start_data: List[Dict], end_data: List[Dict], metric_mapping: Dict[str, str]) -> List[Dict[str, Any]]: """ Process pg_stat_statements data and calculate differences between start and end times. - Adapted from the logic in flask-backend/app.py process_pgss_data(). + Adapted from the logic in monitoring_flask_backend/app.py process_pgss_data(). """ # Convert Prometheus data to dictionaries start_metrics = self._prometheus_to_dict(start_data, start_time) @@ -1423,7 +1633,7 @@ def _process_pgss_data(self, start_data: List[Dict], end_data: List[Dict], def _prometheus_to_dict(self, prom_data: List[Dict], timestamp: datetime) -> Dict: """ Convert Prometheus API response to dictionary keyed by query identifiers. - Adapted from the logic in flask-backend/app.py prometheus_to_dict(). + Adapted from the logic in monitoring_flask_backend/app.py prometheus_to_dict(). """ if not prom_data: return {} @@ -1441,10 +1651,11 @@ def _prometheus_to_dict(self, prom_data: List[Dict], timestamp: datetime) -> Dic closest_value = min(values, key=lambda x: abs(float(x[0]) - timestamp.timestamp())) # Create unique key for this query + # Note: 'user' label may not exist in all metric configurations key = ( metric.get('datname', ''), metric.get('queryid', ''), - metric.get('user', ''), + metric.get('user', metric.get('tag_user', '')), # Fallback to tag_user or empty metric.get('instance', '') ) @@ -1485,36 +1696,132 @@ def format_bytes(self, bytes_value: float) -> str: else: return f"{value:.2f} {units[unit_index]}" - def format_report_data(self, check_id: str, data: Dict[str, Any], host: str = "target-database") -> Dict[str, Any]: + def format_report_data(self, check_id: str, data: Dict[str, Any], host: str = "target-database", + all_hosts: Dict[str, List[str]] = None, + postgres_version: Dict[str, str] = None) -> Dict[str, Any]: """ Format data to match template structure. 
Args: check_id: The check identifier - data: The data to format - host: Host identifier + data: The data to format (can be a dict with node keys if combining multiple nodes) + host: Primary host identifier (used if all_hosts not provided) + all_hosts: Optional dict with 'primary' and 'standbys' keys for multi-node reports + postgres_version: Optional Postgres version info to include at report level Returns: Dictionary formatted for templates """ - now = datetime.now() + now = datetime.now(timezone.utc) + + # If all_hosts is provided, use it; otherwise use the single host as primary + if all_hosts: + hosts = all_hosts + else: + hosts = { + "primary": host, + "standbys": [], + } + + # Handle both single-node and multi-node data structures + if isinstance(data, dict) and any(isinstance(v, dict) and 'data' in v for v in data.values()): + # Multi-node structure: data is already in {node_name: {"data": ...}} format + # postgres_version should already be embedded per-node; warn if passed here + if postgres_version: + print(f"Warning: postgres_version parameter ignored for multi-node data in {check_id}") + results = data + else: + # Single-node structure: wrap data in host key + node_result = {"data": data} + if postgres_version: + node_result["postgres_version"] = postgres_version + results = {host: node_result} template_data = { "checkId": check_id, + "checkTitle": self.get_check_title(check_id), "timestamptz": now.isoformat(), - "hosts": { - "master": host, - "replicas": [] - }, - "results": { - host: { - "data": data - } - } + "nodes": hosts, + "results": results } return template_data + def get_check_title(self, check_id: str) -> str: + """ + Get the human-readable title for a check ID. + + Args: + check_id: The check identifier (e.g., "H004") + + Returns: + Human-readable title for the check + """ + # Mapping based on postgres-checkup README + # https://gitlab.com/postgres-ai/postgres-checkup + check_titles = { + "A001": "System information", + "A002": "Postgres major version", + "A003": "Postgres settings", + "A004": "Cluster information", + "A005": "Extensions", + "A006": "Postgres setting deviations", + "A007": "Altered settings", + "A008": "Disk usage and file system type", + "A010": "Data checksums, wal_log_hints", + "A011": "Connection pooling. pgbouncer", + "A012": "Anti-crash checks", + "A013": "Postgres minor version", + "B001": "SLO/SLA, RPO, RTO", + "B002": "File system, mount flags", + "B003": "Full backups / incremental", + "B004": "WAL archiving", + "B005": "Restore checks, monitoring, alerting", + "C001": "SLO/SLA", + "C002": "Sync/async, Streaming / wal transfer; logical decoding", + "C003": "SPOFs; standby with traffic", + "C004": "Failover", + "C005": "Switchover", + "C006": "Delayed replica", + "C007": "Replication slots. Lags. 
Standby feedbacks", + "D001": "Logging settings", + "D002": "Useful Linux tools", + "D003": "List of monitoring metrics", + "D004": "pg_stat_statements and pg_stat_kcache settings", + "D005": "track_io_timing, auto_explain", + "D006": "Recommended DBA toolsets", + "D007": "Postgres-specific tools for troubleshooting", + "E001": "WAL/checkpoint settings, IO", + "E002": "Checkpoints, bgwriter, IO", + "F001": "Autovacuum: current settings", + "F002": "Autovacuum: transaction ID wraparound check", + "F003": "Autovacuum: dead tuples", + "F004": "Autovacuum: heap bloat (estimated)", + "F005": "Autovacuum: index bloat (estimated)", + "F006": "Precise heap bloat analysis", + "F007": "Precise index bloat analysis", + "F008": "Autovacuum: resource usage", + "G001": "Memory-related settings", + "G002": "Connections and current activity", + "G003": "Timeouts, locks, deadlocks", + "G004": "Query planner", + "G005": "I/O settings", + "G006": "Default_statistics_target", + "H001": "Invalid indexes", + "H002": "Unused indexes", + "H003": "Non-indexed foreign keys", + "H004": "Redundant indexes", + "J001": "Capacity planning", + "K001": "Globally aggregated query metrics", + "K002": "Workload type", + "K003": "Top-50 queries by total_time", + "L001": "Table sizes", + "L002": "Data types being used", + "L003": "Integer out-of-range risks in PKs", + "L004": "Tables without PK/UK", + } + return check_titles.get(check_id, f"Check {check_id}") + def get_setting_unit(self, setting_name: str) -> str: """Get the unit for a PostgreSQL setting.""" units = { @@ -1632,7 +1939,8 @@ def format_setting_value(self, setting_name: str, value: str, unit: str = "") -> return f"{val // 1024} MB" else: return f"{val} kB" - elif setting_name in ['autovacuum_analyze_scale_factor', 'autovacuum_vacuum_scale_factor']: + elif setting_name in ['autovacuum_analyze_scale_factor', 'autovacuum_vacuum_scale_factor', + 'autovacuum_vacuum_insert_scale_factor']: return f"{float(value) * 100:.1f}%" elif setting_name in ['autovacuum', 'track_activities', 'track_counts', 'track_functions', 'track_io_timing', 'track_wal_io_timing', 'pg_stat_statements.track_utility', @@ -1677,78 +1985,218 @@ def get_cluster_metric_description(self, metric_name: str) -> str: } return descriptions.get(metric_name, '') - def generate_all_reports(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: + def generate_all_reports(self, cluster: str = "local", node_name: str = None, combine_nodes: bool = True) -> Dict[str, Any]: """ Generate all reports. 
Args: cluster: Cluster name - node_name: Node name + node_name: Node name (if None and combine_nodes=True, will query all nodes) + combine_nodes: If True, combine primary and replica reports into single report Returns: Dictionary containing all reports """ reports = {} - # Generate each report - reports['A002'] = self.generate_a002_version_report(cluster, node_name) - reports['A003'] = self.generate_a003_settings_report(cluster, node_name) - reports['A004'] = self.generate_a004_cluster_report(cluster, node_name) - reports['A007'] = self.generate_a007_altered_settings_report(cluster, node_name) - reports['D004'] = self.generate_d004_pgstat_settings_report(cluster, node_name) - reports['F001'] = self.generate_f001_autovacuum_settings_report(cluster, node_name) - reports['F004'] = self.generate_f004_heap_bloat_report(cluster, node_name) - reports['F005'] = self.generate_f005_btree_bloat_report(cluster, node_name) - reports['G001'] = self.generate_g001_memory_settings_report(cluster, node_name) - reports['H001'] = self.generate_h001_invalid_indexes_report(cluster, node_name) - reports['H002'] = self.generate_h002_unused_indexes_report(cluster, node_name) - reports['H004'] = self.generate_h004_redundant_indexes_report(cluster, node_name) - reports['K001'] = self.generate_k001_query_calls_report(cluster, node_name) - reports['K003'] = self.generate_k003_top_queries_report(cluster, node_name) + # Determine which nodes to process + if combine_nodes and node_name is None: + # Get all nodes and combine them + all_nodes = self.get_all_nodes(cluster) + nodes_to_process = [] + if all_nodes["primary"]: + nodes_to_process.append(all_nodes["primary"]) + nodes_to_process.extend(all_nodes["standbys"]) + + # If no nodes found, fall back to default + if not nodes_to_process: + print(f"Warning: No nodes found in cluster '{cluster}', using default 'node-01'") + nodes_to_process = ["node-01"] + all_nodes = {"primary": "node-01", "standbys": []} + else: + print(f"Combining reports from nodes: {nodes_to_process}") + else: + # Use single node (backward compatibility) + if node_name is None: + node_name = "node-01" + nodes_to_process = [node_name] + all_nodes = {"primary": node_name, "standbys": []} + + # Generate each report type + report_types = [ + ('A002', self.generate_a002_version_report), + ('A003', self.generate_a003_settings_report), + ('A004', self.generate_a004_cluster_report), + ('A007', self.generate_a007_altered_settings_report), + ('D004', self.generate_d004_pgstat_settings_report), + ('F001', self.generate_f001_autovacuum_settings_report), + ('F004', self.generate_f004_heap_bloat_report), + ('F005', self.generate_f005_btree_bloat_report), + ('G001', self.generate_g001_memory_settings_report), + ('H001', self.generate_h001_invalid_indexes_report), + ('H002', self.generate_h002_unused_indexes_report), + ('H004', self.generate_h004_redundant_indexes_report), + ('K001', self.generate_k001_query_calls_report), + ('K003', self.generate_k003_top_queries_report), + ] + + for check_id, report_func in report_types: + if len(nodes_to_process) == 1: + # Single node - generate report normally + reports[check_id] = report_func(cluster, nodes_to_process[0]) + else: + # Multiple nodes - combine reports + combined_results = {} + for node in nodes_to_process: + print(f"Generating {check_id} report for node {node}...") + node_report = report_func(cluster, node) + # Extract the data from the node report + if 'results' in node_report and node in node_report['results']: + combined_results[node] = 
node_report['results'][node] + + # Create combined report with all nodes + reports[check_id] = self.format_report_data( + check_id, + combined_results, + all_nodes["primary"] if all_nodes["primary"] else nodes_to_process[0], + all_nodes + ) return reports - def get_all_databases(self, cluster: str = "local", node_name: str = "node-01") -> List[str]: + def get_all_clusters(self) -> List[str]: """ - Get all databases from the metrics. + Get all unique cluster names (projects) from the metrics. + + Returns: + List of cluster names + """ + # Query for all clusters using last_over_time to get recent values + clusters_query = 'last_over_time(pgwatch_settings_configured[3h])' + result = self.query_instant(clusters_query) + + cluster_set = set() + + if result.get('status') == 'success' and result.get('data', {}).get('result'): + for item in result['data']['result']: + cluster_name = item['metric'].get('cluster', '') + if cluster_name: + cluster_set.add(cluster_name) + else: + # Debug output + print(f"Debug - get_all_clusters query status: {result.get('status')}") + print(f"Debug - get_all_clusters result count: {len(result.get('data', {}).get('result', []))}") + + if cluster_set: + print(f"Found {len(cluster_set)} cluster(s): {sorted(list(cluster_set))}") + + return sorted(list(cluster_set)) + + def get_all_nodes(self, cluster: str = "local") -> Dict[str, List[str]]: + """ + Get all nodes (primary and replicas) from the metrics. + Uses pgwatch_db_stats_in_recovery_int to determine primary vs standby. Args: cluster: Cluster name - node_name: Node name Returns: - List of database names + Dictionary with 'primary' and 'standbys' keys containing node names """ - # Try to get databases from metrics that use 'dbname' label (custom metrics) - db_query = f'last_over_time(pgwatch_unused_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}"}}[10h])' - result = self.query_instant(db_query) + # Query for all nodes in the cluster using last_over_time + nodes_query = f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}"}}[3h])' + result = self.query_instant(nodes_query) + + nodes = {"primary": None, "standbys": []} + node_set = set() - databases = [] if result.get('status') == 'success' and result.get('data', {}).get('result'): for item in result['data']['result']: - db_name = item['metric'].get('dbname', '') - if db_name and db_name not in databases: - databases.append(db_name) + node_name = item['metric'].get('node_name', '') + if node_name and node_name not in node_set: + node_set.add(node_name) - # If no databases found using dbname, try using datname (catalog metrics) - if not databases: - db_query = f'pgwatch_pg_database_wraparound_age_datfrozenxid{{cluster="{cluster}", node_name="{node_name}", datname!="template1"}}' - result = self.query_instant(db_query) - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - db_name = item['metric'].get('datname', '') - if db_name and db_name not in databases: - databases.append(db_name) + # Convert to sorted list + node_list = sorted(list(node_set)) - # If still no databases found, try another alternative query - if not databases: - db_query = f'pgwatch_pg_database_size_bytes{{cluster="{cluster}", node_name="{node_name}"}}' - result = self.query_instant(db_query) - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - db_name = item['metric'].get('datname', '') - if db_name and db_name not in databases: - 
databases.append(db_name) + if node_list: + print(f" Found {len(node_list)} node(s) in cluster '{cluster}': {node_list}") + else: + print(f" Warning: No nodes found in cluster '{cluster}'") + + # Use pgwatch_db_stats_in_recovery_int to determine primary vs standby + # in_recovery = 0 means primary, in_recovery = 1 means standby + for node_name in node_list: + recovery_query = f'last_over_time(pgwatch_db_stats_in_recovery_int{{cluster="{cluster}", node_name="{node_name}"}}[3h])' + recovery_result = self.query_instant(recovery_query) + + is_standby = False + if recovery_result.get('status') == 'success' and recovery_result.get('data', {}).get('result'): + if recovery_result['data']['result']: + in_recovery_value = float(recovery_result['data']['result'][0]['value'][1]) + is_standby = (in_recovery_value > 0) + print(f" Node '{node_name}': in_recovery={int(in_recovery_value)} ({'standby' if is_standby else 'primary'})") + + if is_standby: + nodes["standbys"].append(node_name) + else: + # First non-standby node becomes primary + if nodes["primary"] is None: + nodes["primary"] = node_name + else: + # If we have multiple primaries (shouldn't happen), treat as replicas + print(f" Warning: Multiple primary nodes detected, treating '{node_name}' as replica") + nodes["standbys"].append(node_name) + + print(f" Result: primary={nodes['primary']}, replicas={nodes['standbys']}") + return nodes + + def get_all_databases(self, cluster: str = "local", node_name: str = "node-01") -> List[str]: + """ + Get all databases from the metrics. + + Args: + cluster: Cluster name + node_name: Node name + + Returns: + List of database names + """ + # Build a source-agnostic database list by unifying labels from: + # 1) Generic per-database metric (wraparound) β†’ datname + # 2) Custom index reports (unused/redundant) β†’ dbname + # 3) Btree bloat (for completeness) β†’ datname + databases: List[str] = [] + database_set = set() + + # Helper to add a name safely + def add_db(name: str) -> None: + if name and name not in self.excluded_databases and name not in database_set: + database_set.add(name) + databases.append(name) + + # 1) Generic per-database metric + wrap_q = f'last_over_time(pgwatch_pg_database_wraparound_age_datfrozenxid{{cluster="{cluster}", node_name="{node_name}"}}[3h])' + wrap_res = self.query_instant(wrap_q) + if wrap_res.get('status') == 'success' and wrap_res.get('data', {}).get('result'): + for item in wrap_res['data']['result']: + add_db(item["metric"].get("datname", "")) + + # 2) Custom reports using dbname + unused_q = f'last_over_time(pgwatch_unused_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}"}}[3h])' + redun_q = f'last_over_time(pgwatch_redundant_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}"}}[3h])' + for q in (unused_q, redun_q): + res = self.query_instant(q) + if res.get('status') == 'success' and res.get('data', {}).get('result'): + for item in res['data']['result']: + add_db(item["metric"].get("dbname", "")) + + # 3) Btree bloat family + bloat_q = f'last_over_time(pgwatch_pg_btree_bloat_bloat_pct{{cluster="{cluster}", node_name="{node_name}"}}[3h])' + bloat_res = self.query_instant(bloat_q) + if bloat_res.get('status') == 'success' and bloat_res.get('data', {}).get('result'): + for item in bloat_res['data']['result']: + add_db(item["metric"].get("datname", "")) return databases @@ -1800,6 +2248,8 @@ def _get_pgss_metrics_data_by_db(self, cluster: str, node_name: str, db_name: st # Get metrics at start and end times start_data = [] end_data = 
[] + + metrics_found = 0 for metric in all_metrics: metric_with_filters = f'{metric}{filter_str}' @@ -1810,6 +2260,7 @@ def _get_pgss_metrics_data_by_db(self, cluster: str, node_name: str, db_name: st start_time + timedelta(minutes=1)) if start_result: start_data.extend(start_result) + metrics_found += 1 # Query metrics around end time end_result = self.query_range(metric_with_filters, end_time - timedelta(minutes=1), @@ -1820,14 +2271,35 @@ def _get_pgss_metrics_data_by_db(self, cluster: str, node_name: str, db_name: st except Exception as e: print(f"Warning: Failed to query metric {metric} for database {db_name}: {e}") continue + + if metrics_found == 0: + print(f"Warning: No pg_stat_statements metrics found for database {db_name}") + print(f" Checked time range: {start_time.isoformat()} to {end_time.isoformat()}") # Process the data to calculate differences - return self._process_pgss_data(start_data, end_data, start_time, end_time, METRIC_NAME_MAPPING) + result = self._process_pgss_data(start_data, end_data, start_time, end_time, METRIC_NAME_MAPPING) + + if not result: + print(f"Warning: _process_pgss_data returned empty result for database {db_name}") + + return result - def create_report(self, api_url, token, project, epoch): + def create_report(self, api_url, token, project_name, epoch): + """ + Create a new report in the API. + + Args: + api_url: API URL + token: API token + project_name: Project name (cluster identifier) + epoch: Epoch identifier + + Returns: + Report ID + """ request_data = { "access_token": token, - "project": project, + "project": project_name, "epoch": epoch, } @@ -1854,6 +2326,7 @@ def upload_report_file(self, api_url, token, report_id, path): "filename": file_name, "data": data, "type": file_type, + "generate_issue": True } response = make_request(api_url, "/rpc/checkup_report_file_post", request_data) @@ -1869,14 +2342,17 @@ def make_request(api_url, endpoint, request_data): def main(): parser = argparse.ArgumentParser(description='Generate PostgreSQL reports using PromQL') + parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}') parser.add_argument('--prometheus-url', default='http://sink-prometheus:9090', help='Prometheus URL (default: http://sink-prometheus:9090)') parser.add_argument('--postgres-sink-url', default='postgresql://pgwatch@sink-postgres:5432/measurements', help='Postgres sink connection string (default: postgresql://pgwatch@sink-postgres:5432/measurements)') - parser.add_argument('--cluster', default='local', - help='Cluster name (default: local)') - parser.add_argument('--node-name', default='node-01', - help='Node name (default: node-01)') + parser.add_argument('--cluster', default=None, + help='Cluster name (default: auto-detect all clusters)') + parser.add_argument('--node-name', default=None, + help='Node name (default: auto-detect all nodes when combine-nodes is true)') + parser.add_argument('--no-combine-nodes', action='store_true', default=False, + help='Disable combining primary and replica reports into single report') parser.add_argument('--check-id', choices=['A002', 'A003', 'A004', 'A007', 'D004', 'F001', 'F004', 'F005', 'G001', 'H001', 'H002', 'H004', 'K001', 'K003', 'ALL'], @@ -1885,14 +2361,23 @@ def main(): help='Output file (default: stdout)') parser.add_argument('--api-url', default='https://postgres.ai/api/general') parser.add_argument('--token', default='') - parser.add_argument('--project', default='project-name') + parser.add_argument('--project-name', default='project-name', + 
help='Project name for API upload (default: project-name)') parser.add_argument('--epoch', default='1') parser.add_argument('--no-upload', action='store_true', default=False, help='Do not upload reports to the API') + parser.add_argument('--exclude-databases', type=str, default=None, + help='Comma-separated list of additional databases to exclude from reports ' + f'(default exclusions: {", ".join(sorted(PostgresReportGenerator.DEFAULT_EXCLUDED_DATABASES))})') args = parser.parse_args() + + # Parse excluded databases + excluded_databases = None + if args.exclude_databases: + excluded_databases = [db.strip() for db in args.exclude_databases.split(',')] - generator = PostgresReportGenerator(args.prometheus_url, args.postgres_sink_url) + generator = PostgresReportGenerator(args.prometheus_url, args.postgres_sink_url, excluded_databases) # Test connection if not generator.test_connection(): @@ -1900,61 +2385,106 @@ def main(): sys.exit(1) try: - if args.check_id == 'ALL' or args.check_id is None: - # Generate all reports - if not args.no_upload: - report_id = generator.create_report(args.api_url, args.token, args.project, args.epoch) - reports = generator.generate_all_reports(args.cluster, args.node_name) - for report in reports: - json.dump(reports[report], open(f"{report}.json", "w")) - if not args.no_upload: - generator.upload_report_file(args.api_url, args.token, report_id, f"{report}.json") - if args.output == '-': - pass - else: - with open(args.output, 'w') as f: - json.dump(reports, f, indent=2) - print(f"All reports written to {args.output}") + # Discover all clusters if not specified + clusters_to_process = [] + if args.cluster: + clusters_to_process = [args.cluster] else: - # Generate specific report - if args.check_id == 'A002': - report = generator.generate_a002_version_report(args.cluster, args.node_name) - elif args.check_id == 'A003': - report = generator.generate_a003_settings_report(args.cluster, args.node_name) - elif args.check_id == 'A004': - report = generator.generate_a004_cluster_report(args.cluster, args.node_name) - elif args.check_id == 'A007': - report = generator.generate_a007_altered_settings_report(args.cluster, args.node_name) - elif args.check_id == 'D004': - report = generator.generate_d004_pgstat_settings_report(args.cluster, args.node_name) - elif args.check_id == 'F001': - report = generator.generate_f001_autovacuum_settings_report(args.cluster, args.node_name) - elif args.check_id == 'F004': - report = generator.generate_f004_heap_bloat_report(args.cluster, args.node_name) - elif args.check_id == 'F005': - report = generator.generate_f005_btree_bloat_report(args.cluster, args.node_name) - elif args.check_id == 'G001': - report = generator.generate_g001_memory_settings_report(args.cluster, args.node_name) - elif args.check_id == 'G003': - report = generator.generate_g003_database_stats_report(args.cluster, args.node_name) - elif args.check_id == 'H001': - report = generator.generate_h001_invalid_indexes_report(args.cluster, args.node_name) - elif args.check_id == 'H002': - report = generator.generate_h002_unused_indexes_report(args.cluster, args.node_name) - elif args.check_id == 'H004': - report = generator.generate_h004_redundant_indexes_report(args.cluster, args.node_name) - elif args.check_id == 'K001': - report = generator.generate_k001_query_calls_report(args.cluster, args.node_name) - elif args.check_id == 'K003': - report = generator.generate_k003_top_queries_report(args.cluster, args.node_name) - - if args.output == '-': - print(json.dumps(report, 
indent=2)) + clusters_to_process = generator.get_all_clusters() + if not clusters_to_process: + print("Warning: No clusters found, using default 'local'") + clusters_to_process = ['local'] else: - with open(args.output, 'w') as f: - json.dump(report, f, indent=2) + print(f"Discovered clusters: {clusters_to_process}") + + # Process each cluster + for cluster in clusters_to_process: + print(f"\n{'='*60}") + print(f"Processing cluster: {cluster}") + print(f"{'='*60}\n") + + # Set default node_name if not provided and not combining nodes + combine_nodes = not args.no_combine_nodes + if args.node_name is None and not combine_nodes: + args.node_name = "node-01" + + if args.check_id == 'ALL' or args.check_id is None: + # Generate all reports for this cluster if not args.no_upload: - generator.upload_report_file(args.api_url, args.token, args.project, args.epoch, args.output) + # Use cluster name as project name if not specified + project_name = args.project_name if args.project_name != 'project-name' else cluster + report_id = generator.create_report(args.api_url, args.token, project_name, args.epoch) + + reports = generator.generate_all_reports(cluster, args.node_name, combine_nodes) + + # Save reports with cluster name prefix + for report in reports: + output_filename = f"{cluster}_{report}.json" if len(clusters_to_process) > 1 else f"{report}.json" + with open(output_filename, "w") as f: + json.dump(reports[report], f, indent=2) + print(f"Generated report: {output_filename}") + if not args.no_upload: + generator.upload_report_file(args.api_url, args.token, report_id, output_filename) + + if args.output == '-': + pass + elif len(clusters_to_process) == 1: + # Single cluster - use specified output + with open(args.output, 'w') as f: + json.dump(reports, f, indent=2) + print(f"All reports written to {args.output}") + else: + # Multiple clusters - create combined output + combined_output = f"{cluster}_all_reports.json" + with open(combined_output, 'w') as f: + json.dump(reports, f, indent=2) + print(f"All reports for cluster {cluster} written to {combined_output}") + else: + # Generate specific report - use node_name or default + if args.node_name is None: + args.node_name = "node-01" + + if args.check_id == 'A002': + report = generator.generate_a002_version_report(cluster, args.node_name) + elif args.check_id == 'A003': + report = generator.generate_a003_settings_report(cluster, args.node_name) + elif args.check_id == 'A004': + report = generator.generate_a004_cluster_report(cluster, args.node_name) + elif args.check_id == 'A007': + report = generator.generate_a007_altered_settings_report(cluster, args.node_name) + elif args.check_id == 'D004': + report = generator.generate_d004_pgstat_settings_report(cluster, args.node_name) + elif args.check_id == 'F001': + report = generator.generate_f001_autovacuum_settings_report(cluster, args.node_name) + elif args.check_id == 'F004': + report = generator.generate_f004_heap_bloat_report(cluster, args.node_name) + elif args.check_id == 'F005': + report = generator.generate_f005_btree_bloat_report(cluster, args.node_name) + elif args.check_id == 'G001': + report = generator.generate_g001_memory_settings_report(cluster, args.node_name) + elif args.check_id == 'H001': + report = generator.generate_h001_invalid_indexes_report(cluster, args.node_name) + elif args.check_id == 'H002': + report = generator.generate_h002_unused_indexes_report(cluster, args.node_name) + elif args.check_id == 'H004': + report = generator.generate_h004_redundant_indexes_report(cluster, 
args.node_name) + elif args.check_id == 'K001': + report = generator.generate_k001_query_calls_report(cluster, args.node_name) + elif args.check_id == 'K003': + report = generator.generate_k003_top_queries_report(cluster, args.node_name) + + output_filename = f"{cluster}_{args.check_id}.json" if len(clusters_to_process) > 1 else args.output + + if args.output == '-' and len(clusters_to_process) == 1: + print(json.dumps(report, indent=2)) + else: + with open(output_filename, 'w') as f: + json.dump(report, f, indent=2) + print(f"Report written to {output_filename}") + if not args.no_upload: + project_name = args.project_name if args.project_name != 'project-name' else cluster + report_id = generator.create_report(args.api_url, args.token, project_name, args.epoch) + generator.upload_report_file(args.api_url, args.token, report_id, output_filename) except Exception as e: print(f"Error generating reports: {e}") raise e diff --git a/reporter/requirements-dev.txt b/reporter/requirements-dev.txt new file mode 100644 index 0000000..7499b74 --- /dev/null +++ b/reporter/requirements-dev.txt @@ -0,0 +1,5 @@ +-r requirements.txt +pytest==9.0.1 +pytest-postgresql==7.0.2 +coverage==7.6.10 +pytest-cov==6.0.0 diff --git a/reporter/requirements.txt b/reporter/requirements.txt index 6813242..9a4b410 100644 --- a/reporter/requirements.txt +++ b/reporter/requirements.txt @@ -1,2 +1,2 @@ -requests>=2.31.0 -psycopg2-binary>=2.9.9 \ No newline at end of file +requests==2.32.5 +psycopg2-binary==2.9.11 \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/lock_waits/README.md b/tests/lock_waits/README.md new file mode 100644 index 0000000..3838b5c --- /dev/null +++ b/tests/lock_waits/README.md @@ -0,0 +1,226 @@ +# Lock Waits Metric Testing + +This directory contains tests and scripts to verify that the `lock_waits` metric is working correctly. + +## Overview + +The `lock_waits` metric collects detailed information about lock waits in PostgreSQL, including: +- Waiting and blocking process IDs +- User names and application names +- Lock modes and types +- Affected tables +- Query IDs (PostgreSQL 14+) +- Wait durations and blocker transaction durations + +## Test Components + +### 1. Python Test Script (`test_lock_waits_metric.py`) + +Automated test that: +- Creates lock contention scenarios in the target database +- Waits for pgwatch to collect metrics +- Verifies the metric is collected in Prometheus/VictoriaMetrics +- Validates the metric structure and labels + +### 2. SQL Script (`create_lock_contention.sql`) + +Manual SQL script to create lock contention for testing. Can be run in multiple psql sessions. + +## Prerequisites + +1. Docker Compose stack running: + ```bash + docker-compose up -d + ``` + +2. Python dependencies: + ```bash + pip install psycopg requests + ``` + +3. 
Ensure `lock_waits` metric is enabled in pgwatch configuration: + - Check `config/pgwatch-prometheus/metrics.yml` includes `lock_waits` + - Verify pgwatch is collecting metrics from the target database + +## Running the Automated Test + +### Basic Usage + +```bash +# From the project root +python tests/lock_waits/test_lock_waits_metric.py +``` + +### With Custom Configuration + +```bash +python tests/lock_waits/test_lock_waits_metric.py \ + --target-db-url "postgresql://postgres:postgres@localhost:55432/target_database" \ + --prometheus-url "http://localhost:59090" \ + --test-dbname "target_database" \ + --collection-wait 90 +``` + +### Environment Variables + +You can also set these via environment variables: + +```bash +export TARGET_DB_URL="postgresql://postgres:postgres@localhost:55432/target_database" +export PROMETHEUS_URL="http://localhost:59090" +export TEST_DBNAME="target_database" +export COLLECTION_WAIT_SECONDS=90 + +python tests/lock_waits/test_lock_waits_metric.py +``` + +## Manual Testing + +### Step 1: Create Lock Contention + +Open two psql sessions to the target database: + +**Session 1 (Blocker):** +```sql +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; +-- Keep this transaction open +``` + +**Session 2 (Waiter):** +```sql +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; +-- This will wait for Session 1 to release the lock +``` + +### Step 2: Verify Metric Collection + +Wait for pgwatch to collect metrics (check collection interval in pgwatch config, typically 15-30 seconds), then query Prometheus: + +```bash +# Query Prometheus API for lock_waits metrics +curl "http://localhost:59090/api/v1/query?query=pgwatch_lock_waits_waiting_ms{datname=\"target_database\"}" + +# Or use PromQL in Grafana Explore +pgwatch_lock_waits_waiting_ms{datname="target_database"} +pgwatch_lock_waits_blocker_tx_ms{datname="target_database"} +``` + +### Step 3: Check Grafana Dashboard + +1. Open Grafana: http://localhost:3000 +2. Navigate to "Lock waits details" dashboard +3. Select the database from the dropdown +4. Verify that lock wait events appear in the panels + +## Expected Results + +### Successful Test Output + +``` +Setting up test environment... +βœ“ Test table created + +Creating lock contention for 30 seconds... +βœ“ Blocker transaction started (holding lock on row id=1) +βœ“ Waiter transaction started (waiting for lock on row id=1) + Holding locks for 30 seconds... +βœ“ Lock contention ended + +Verifying metric collection... + Waiting 60 seconds for pgwatch to collect metrics... + βœ“ Found 5 lock_waits records + +Validating metric structure... 
+ + Record 1: + βœ“ All required data fields present + βœ“ waiting_ms is numeric: 25000 ms + βœ“ blocker_tx_ms is numeric: 30000 ms + +βœ… Test PASSED: lock_waits metric is working correctly +``` + +## Troubleshooting + +### No Records Found + +- **Check pgwatch is running**: `docker ps | grep pgwatch-prometheus` +- **Check pgwatch logs**: `docker logs pgwatch-prometheus` +- **Verify metric is enabled**: Check `config/pgwatch-prometheus/metrics.yml` +- **Check Prometheus is accessible**: `curl http://localhost:59090/api/v1/status/config` +- **Increase wait time**: Use `--collection-wait 120` to wait longer +- **Check database name**: Ensure `--test-dbname` matches the monitored database +- **Verify metrics exist**: `curl "http://localhost:59090/api/v1/label/__name__/values" | grep lock_waits` + +### Invalid Data Structure + +- **Check PostgreSQL version**: Metric requires PostgreSQL 14+ for query_id support +- **Verify metric SQL**: Check the SQL query in `metrics.yml` is correct +- **Check pgwatch version**: Ensure pgwatch version supports the metric format +- **Check Prometheus labels**: Verify metrics have expected labels (datname, waiting_pid, blocker_pid, etc.) + +### Connection Errors + +- **Verify Docker containers**: `docker-compose ps` +- **Check connection strings**: Verify URLs match your docker-compose configuration +- **Check Prometheus URL**: Ensure Prometheus/VictoriaMetrics is accessible at the specified URL +- **Check network**: Ensure containers can communicate (same Docker network) + +## Integration with CI/CD + +The test can be integrated into CI/CD pipelines: + +```yaml +# Example GitLab CI +test_lock_waits: + stage: test + script: + - docker-compose up -d + - sleep 30 # Wait for services to start + - pip install psycopg + - python tests/lock_waits/test_lock_waits_metric.py + --target-db-url "$TARGET_DB_URL" + --sink-db-url "$SINK_DB_URL" + --collection-wait 90 + only: + - merge_requests + - main +``` + +## Additional Test Scenarios + +### Test Different Lock Types + +Modify the test to create different types of locks: + +```sql +-- Table-level lock +LOCK TABLE lock_test_table IN EXCLUSIVE MODE; + +-- Advisory lock +SELECT pg_advisory_lock(12345); +``` + +### Test Multiple Concurrent Waits + +Create multiple waiting transactions to test the LIMIT clause: + +```sql +-- Session 1: Blocker +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; + +-- Sessions 2-10: Multiple waiters +-- Each in separate psql session +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; +``` + +## Related Files + +- `config/pgwatch-prometheus/metrics.yml` - Metric definition +- `config/grafana/dashboards/Dashboard_13_Lock_waits.json` - Grafana dashboard +- `workload_examples/lock_wait_test.sql` - Basic lock test SQL + diff --git a/tests/lock_waits/__init__.py b/tests/lock_waits/__init__.py new file mode 100644 index 0000000..228403c --- /dev/null +++ b/tests/lock_waits/__init__.py @@ -0,0 +1,2 @@ +# Lock waits metric testing package + diff --git a/tests/lock_waits/create_lock_contention.sql b/tests/lock_waits/create_lock_contention.sql new file mode 100644 index 0000000..5e5da7a --- /dev/null +++ b/tests/lock_waits/create_lock_contention.sql @@ -0,0 +1,73 @@ +-- SQL script to manually create lock contention for testing lock_waits metric +-- +-- Usage: +-- 1. Run this script in Session 1 (blocker) +-- 2. Run the same script in Session 2 (waiter) - it will wait +-- 3. Check the sink database for lock_waits records +-- 4. 
Commit or rollback Session 1 to release the lock + +-- Create test table if it doesn't exist +drop table if exists lock_test_table cascade; +create table lock_test_table ( + id int8 generated always as identity primary key, + name text not null, + value numeric(10, 2), + created_at timestamptz default now() +); + +insert into lock_test_table (name, value) +values + ('Item 1', 100.50), + ('Item 2', 200.75), + ('Item 3', 300.25); + +-- ============================================ +-- SESSION 1 (BLOCKER) - Run this first +-- ============================================ +begin; + +-- Acquire exclusive lock on row id=1 +-- Keep this transaction open to hold the lock +select * from lock_test_table where id = 1 for update; + +-- Transaction is now holding the lock +-- DO NOT COMMIT YET - keep this session open + +-- ============================================ +-- SESSION 2 (WAITER) - Run this in another psql session +-- ============================================ +begin; + +-- This will wait for Session 1 to release the lock +select * from lock_test_table where id = 1 for update; + +-- This query will block until Session 1 commits or rolls back +-- You should see it waiting in pg_stat_activity + +-- ============================================ +-- To release the lock, commit or rollback Session 1: +-- ============================================ +-- commit; -- or rollback; + +-- ============================================ +-- Alternative: Test with different lock types +-- ============================================ + +-- Test with table-level lock +-- SESSION 1: +-- begin; +-- lock table lock_test_table in exclusive mode; + +-- SESSION 2: +-- begin; +-- select * from lock_test_table; -- Will wait + +-- Test with advisory lock +-- SESSION 1: +-- begin; +-- select pg_advisory_lock(12345); + +-- SESSION 2: +-- begin; +-- select pg_advisory_lock(12345); -- Will wait + diff --git a/tests/lock_waits/run_test.sh b/tests/lock_waits/run_test.sh new file mode 100755 index 0000000..de45803 --- /dev/null +++ b/tests/lock_waits/run_test.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Simple wrapper script to run the lock_waits metric test + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +# Default values (can be overridden by environment variables) +TARGET_DB_URL="${TARGET_DB_URL:-postgresql://postgres:postgres@localhost:55432/target_database}" +PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:59090}" +TEST_DBNAME="${TEST_DBNAME:-target_database}" +COLLECTION_WAIT="${COLLECTION_WAIT_SECONDS:-60}" + +echo "==========================================" +echo "Lock Waits Metric Test" +echo "==========================================" +echo "" +echo "Configuration:" +echo " Target DB: $TARGET_DB_URL" +echo " Prometheus URL: $PROMETHEUS_URL" +echo " Test DB Name: $TEST_DBNAME" +echo " Collection Wait: ${COLLECTION_WAIT}s" +echo "" + +# Check if required packages are installed +if ! python3 -c "import psycopg" 2>/dev/null; then + echo "Installing psycopg..." + pip3 install psycopg +fi + +if ! python3 -c "import requests" 2>/dev/null; then + echo "Installing requests..." 
+ pip3 install requests +fi + +# Run the test +cd "$PROJECT_ROOT" +python3 tests/lock_waits/test_lock_waits_metric.py \ + --target-db-url "$TARGET_DB_URL" \ + --prometheus-url "$PROMETHEUS_URL" \ + --test-dbname "$TEST_DBNAME" \ + --collection-wait "$COLLECTION_WAIT" + diff --git a/tests/lock_waits/test_lock_waits_metric.py b/tests/lock_waits/test_lock_waits_metric.py new file mode 100644 index 0000000..b4bbaca --- /dev/null +++ b/tests/lock_waits/test_lock_waits_metric.py @@ -0,0 +1,426 @@ +""" +Test script to verify lock_waits metric collection. + +This script: +1. Creates lock contention scenarios in the target database +2. Waits for pgwatch to collect metrics +3. Verifies the lock_waits metric is collected in Prometheus +4. Validates the data structure and content +""" + +import json +import os +import threading +import time +from datetime import datetime, timezone, timedelta +from typing import Dict, List, Optional + +import psycopg +import requests + + +class LockWaitsTest: + def __init__( + self, + target_db_url: str, + prometheus_url: str, + test_dbname: str = "target_database", + collection_wait_seconds: int = 60, + ): + """ + Initialize the test. + + Args: + target_db_url: Connection string for the target database being monitored + prometheus_url: URL for Prometheus/VictoriaMetrics API + test_dbname: Name of the database being monitored + collection_wait_seconds: How long to wait for pgwatch to collect metrics + """ + self.target_db_url = target_db_url + self.prometheus_url = prometheus_url.rstrip("/") + self.test_dbname = test_dbname + self.collection_wait_seconds = collection_wait_seconds + self.target_conn: Optional[psycopg.Connection] = None + self.blocker_conn: Optional[psycopg.Connection] = None + + def setup(self): + """Set up database connections and test table.""" + print("Setting up test environment...") + + # Connect to target database + self.target_conn = psycopg.connect(self.target_db_url) + self.target_conn.autocommit = True + + # Verify Prometheus is accessible + try: + response = requests.get(f"{self.prometheus_url}/api/v1/status/config", timeout=5) + response.raise_for_status() + print("βœ“ Prometheus connection verified") + except Exception as e: + print(f"⚠ Warning: Could not verify Prometheus connection: {e}") + + # Create test table + with self.target_conn.cursor() as cur: + cur.execute( + """ + drop table if exists lock_test_table cascade; + create table lock_test_table ( + id int8 generated always as identity primary key, + name text not null, + value numeric(10, 2), + created_at timestamptz default now() + ); + insert into lock_test_table (name, value) + values + ('Item 1', 100.50), + ('Item 2', 200.75), + ('Item 3', 300.25); + """ + ) + print("βœ“ Test table created") + + def create_lock_contention(self, duration_seconds: int = 30): + """ + Create lock contention by: + 1. Starting a transaction that locks a row + 2. Starting another transaction that tries to lock the same row (will wait) + 3. 
Keeping both transactions open for the specified duration + """ + print(f"\nCreating lock contention for {duration_seconds} seconds...") + + # Connection 1: Blocker - acquires lock and holds it + self.blocker_conn = psycopg.connect(self.target_db_url) + self.blocker_conn.autocommit = False + blocker_cur = self.blocker_conn.cursor() + blocker_cur.execute("begin") + blocker_cur.execute( + "select * from lock_test_table where id = 1 for update" + ) + blocker_cur.fetchone() + print("βœ“ Blocker transaction started (holding lock on row id=1)") + + # Small delay to ensure blocker has the lock + time.sleep(1) + + # Connection 2: Waiter - tries to acquire same lock (will wait) + waiter_conn = psycopg.connect(self.target_db_url) + waiter_conn.autocommit = False + waiter_cur = waiter_conn.cursor() + waiter_cur.execute("begin") + print("βœ“ Waiter transaction started (waiting for lock on row id=1)") + + # Execute the waiting query in a separate thread so it can block + waiter_error = [] + waiter_done = threading.Event() + + def run_waiter(): + try: + # This will block until blocker releases the lock + waiter_cur.execute( + "select * from lock_test_table where id = 1 for update" + ) + waiter_cur.fetchone() + print(" βœ“ Waiter acquired lock (blocker released)") + except Exception as e: + waiter_error.append(str(e)) + print(f" Waiter error: {e}") + finally: + waiter_done.set() + + waiter_thread = threading.Thread(target=run_waiter, daemon=True) + waiter_thread.start() + + # Give waiter time to start waiting + time.sleep(2) + + # Verify waiter is actually waiting + with self.target_conn.cursor() as check_cur: + check_cur.execute( + """ + select pid, state, wait_event_type, wait_event + from pg_stat_activity + where datname = current_database() + and pid <> pg_backend_pid() + and wait_event_type = 'Lock' + """ + ) + waiting_pids = check_cur.fetchall() + if waiting_pids: + print(f" βœ“ Confirmed {len(waiting_pids)} process(es) waiting for locks") + for pid, state, wait_type, wait_event in waiting_pids: + print(f" PID {pid}: state={state}, wait_event={wait_event}") + else: + print(" ⚠ No processes found waiting for locks") + + # Keep locks held for the duration + print(f" Holding locks for {duration_seconds} seconds...") + time.sleep(duration_seconds) + + # Cleanup: commit blocker first, then waiter + print(" Releasing blocker lock...") + blocker_cur.execute("commit") + blocker_cur.close() + self.blocker_conn.close() + self.blocker_conn = None + + # Wait for waiter to complete + waiter_done.wait(timeout=5) + try: + waiter_cur.execute("commit") + except Exception: + pass + waiter_cur.close() + waiter_conn.close() + + print("βœ“ Lock contention ended") + + def verify_metric_collected(self) -> List[Dict]: + """ + Verify that lock_waits metric was collected in Prometheus. 
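+        Illustrative note (a sketch added for clarity, not part of the original
+        docstring): each returned sample is a dict shaped roughly like
+
+            {"time": <timezone-aware datetime>,
+             "metric": "pgwatch_lock_waits_waiting_ms",
+             "labels": {"datname": "target_database", "waiting_pid": "12345",
+                        "blocker_pid": "12340", ...},
+             "value": 25000.0}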
+ + Returns: + List of lock_waits metric samples found + """ + print("\nVerifying metric collection...") + + # Wait for pgwatch to collect metrics + print(f" Waiting {self.collection_wait_seconds} seconds for pgwatch to collect metrics...") + time.sleep(self.collection_wait_seconds) + + # Query Prometheus for lock_waits metrics + # pgwatch exports metrics with prefix pgwatch__ + metrics_to_check = [ + "pgwatch_lock_waits_waiting_ms", + "pgwatch_lock_waits_blocker_tx_ms", + ] + + records = [] + cutoff_time = datetime.now(timezone.utc) - timedelta(minutes=5) + + for metric_name in metrics_to_check: + try: + # Query for recent samples + query = f'{metric_name}{{datname="{self.test_dbname}"}}' + response = requests.get( + f"{self.prometheus_url}/api/v1/query", + params={ + "query": query, + "time": datetime.now(timezone.utc).timestamp(), + }, + timeout=10, + ) + response.raise_for_status() + data = response.json() + + if data.get("status") == "success" and data.get("data", {}).get("result"): + for result in data["data"]["result"]: + metric = result.get("metric", {}) + value = result.get("value", [None, None]) + + # Convert timestamp + timestamp = float(value[0]) if value[0] else None + if timestamp: + metric_time = datetime.fromtimestamp(timestamp, tz=timezone.utc) + if metric_time >= cutoff_time: + records.append( + { + "time": metric_time, + "metric": metric_name, + "labels": metric, + "value": float(value[1]) if value[1] else None, + } + ) + except Exception as e: + print(f" ⚠ Error querying {metric_name}: {e}") + + print(f" βœ“ Found {len(records)} lock_waits metric samples") + + return records + + def validate_metric_structure(self, records: List[Dict]) -> bool: + """ + Validate that the metric records have the expected structure. + + Args: + records: List of metric samples to validate + + Returns: + True if validation passes, False otherwise + """ + if not records: + print(" ⚠ No records to validate") + return False + + print("\nValidating metric structure...") + + # Expected labels in Prometheus metrics + expected_labels = [ + "datname", + "waiting_user", + "waiting_appname", + "waiting_table", + "waiting_query_id", + "waiting_mode", + "waiting_locktype", + "waiting_pid", + "blocker_user", + "blocker_appname", + "blocker_table", + "blocker_query_id", + "blocker_mode", + "blocker_locktype", + "blocker_pid", + ] + + all_valid = True + unique_samples = {} + + # Group samples by their label combination + for record in records: + labels = record.get("labels", {}) + # Create a key from relevant labels + key = ( + labels.get("waiting_pid"), + labels.get("blocker_pid"), + labels.get("waiting_table"), + ) + if key not in unique_samples: + unique_samples[key] = record + + print(f" Found {len(unique_samples)} unique lock wait samples") + + for i, (key, record) in enumerate(list(unique_samples.items())[:5]): # Validate first 5 + print(f"\n Sample {i+1}:") + labels = record.get("labels", {}) + metric_name = record.get("metric", "") + value = record.get("value") + + # Check datname matches + if labels.get("datname") != self.test_dbname: + print(f" ⚠ datname mismatch: {labels.get('datname')} != {self.test_dbname}") + else: + print(f" βœ“ datname matches: {labels.get('datname')}") + + # Check key labels are present + key_labels = ["waiting_pid", "blocker_pid", "waiting_mode", "blocker_mode"] + missing_labels = [label for label in key_labels if not labels.get(label)] + if missing_labels: + print(f" ⚠ Missing key labels: {missing_labels}") + else: + print(f" βœ“ Key labels present") + + # Validate metric 
value + if value is not None: + try: + float(value) + print(f" βœ“ Metric value is numeric: {value}") + if "waiting_ms" in metric_name or "blocker_tx_ms" in metric_name: + print(f" Value: {value} ms") + except (ValueError, TypeError): + print(f" βœ— Metric value is not numeric: {value}") + all_valid = False + else: + print(f" ⚠ Metric value is None") + + return all_valid + + def cleanup(self): + """Clean up test resources.""" + print("\nCleaning up...") + + if self.blocker_conn: + try: + self.blocker_conn.close() + except Exception: + pass + + if self.target_conn: + try: + with self.target_conn.cursor() as cur: + cur.execute("drop table if exists lock_test_table cascade") + self.target_conn.close() + except Exception: + pass + + print("βœ“ Cleanup complete") + + def run(self) -> bool: + """ + Run the complete test. + + Returns: + True if test passes, False otherwise + """ + try: + self.setup() + self.create_lock_contention(duration_seconds=30) + records = self.verify_metric_collected() + is_valid = self.validate_metric_structure(records) + + if is_valid and records: + print("\nβœ… Test PASSED: lock_waits metric is working correctly") + return True + else: + print("\n❌ Test FAILED: lock_waits metric validation failed") + return False + + except Exception as e: + print(f"\n❌ Test ERROR: {e}") + import traceback + + traceback.print_exc() + return False + finally: + self.cleanup() + + +def main(): + """Main entry point for the test.""" + import argparse + + parser = argparse.ArgumentParser( + description="Test lock_waits metric collection" + ) + parser.add_argument( + "--target-db-url", + default=os.getenv( + "TARGET_DB_URL", "postgresql://postgres:postgres@localhost:55432/target_database" + ), + help="Target database connection URL", + ) + parser.add_argument( + "--prometheus-url", + default=os.getenv( + "PROMETHEUS_URL", + "http://localhost:59090", + ), + help="Prometheus/VictoriaMetrics API URL", + ) + parser.add_argument( + "--test-dbname", + default=os.getenv("TEST_DBNAME", "target_database"), + help="Name of the database being monitored", + ) + parser.add_argument( + "--collection-wait", + type=int, + default=int(os.getenv("COLLECTION_WAIT_SECONDS", "60")), + help="Seconds to wait for pgwatch to collect metrics", + ) + + args = parser.parse_args() + + test = LockWaitsTest( + target_db_url=args.target_db_url, + prometheus_url=args.prometheus_url, + test_dbname=args.test_dbname, + collection_wait_seconds=args.collection_wait, + ) + + success = test.run() + exit(0 if success else 1) + + +if __name__ == "__main__": + main() + diff --git a/tests/reporter/__init__.py b/tests/reporter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/reporter/conftest.py b/tests/reporter/conftest.py new file mode 100644 index 0000000..63b3255 --- /dev/null +++ b/tests/reporter/conftest.py @@ -0,0 +1,58 @@ +from typing import Callable + +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add a flag for enabling integration tests that require services.""" + parser.addoption( + "--run-integration", + action="store_true", + default=False, + help="Run tests marked as integration/requires_postgres.", + ) + + +def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None: + """Skip integration tests unless --run-integration is given.""" + if config.getoption("--run-integration"): + return + + skip_marker = pytest.mark.skip(reason="integration tests require --run-integration") + for item in items: + if "integration" in item.keywords or 
"requires_postgres" in item.keywords: + item.add_marker(skip_marker) + + +@pytest.fixture(name="prom_result") +def fixture_prom_result() -> Callable[[list[dict] | None, str], dict]: + """Build a Prometheus-like payload for the happy-path tests.""" + + def _builder(rows: list[dict] | None = None, status: str = "success") -> dict: + return { + "status": status, + "data": { + "result": rows or [], + }, + } + + return _builder + + +@pytest.fixture(name="series_sample") +def fixture_series_sample() -> Callable[[str, dict | None, list[tuple[float | int, float | int | str]] | None], dict]: + """Create metric entries (metric metadata + values array) for query_range tests.""" + + def _builder( + metric_name: str, + labels: dict | None = None, + values: list[tuple[float | int, float | int | str]] | None = None, + ) -> dict: + labels = labels or {} + values = values or [] + return { + "metric": {"__name__": metric_name, **labels}, + "values": [[ts, str(val)] for ts, val in values], + } + + return _builder diff --git a/tests/reporter/test_formatters.py b/tests/reporter/test_formatters.py new file mode 100644 index 0000000..c5ebf2d --- /dev/null +++ b/tests/reporter/test_formatters.py @@ -0,0 +1,75 @@ +import pytest + +from reporter.postgres_reports import PostgresReportGenerator + + +@pytest.fixture(name="generator") +def fixture_generator() -> PostgresReportGenerator: + return PostgresReportGenerator(prometheus_url="http://test", postgres_sink_url="") + + +@pytest.mark.unit +@pytest.mark.parametrize( + "value,expected", + [ + (0, "0 B"), + (1, "1.00 B"), + (1024, "1.00 KB"), + (10 * 1024, "10.0 KB"), + (1048576, "1.00 MB"), + (5 * 1024 ** 3, "5.00 GB"), + ], +) +def test_format_bytes(generator: PostgresReportGenerator, value: int, expected: str) -> None: + assert generator.format_bytes(value) == expected + + +@pytest.mark.unit +@pytest.mark.parametrize( + "name,value,unit,expected", + [ + ("shared_buffers", "128", "8kB", "1 MB"), + ("work_mem", "512", "", "512 kB"), + ("log_min_duration_statement", "2000", "ms", "2 s"), + ("log_min_duration_statement", "500", "ms", "500 ms"), + ("autovacuum_naptime", "120", "", "2 min"), + ("autovacuum", "on", "", "on"), + ("autovacuum", "OFF", "", "off"), + ], +) +def test_format_setting_value( + generator: PostgresReportGenerator, + name: str, + value: str, + unit: str, + expected: str, +) -> None: + assert generator.format_setting_value(name, value, unit) == expected + + +@pytest.mark.unit +def test_get_cluster_metric_metadata(generator: PostgresReportGenerator) -> None: + assert generator.get_cluster_metric_unit("active_connections") == "connections" + assert generator.get_cluster_metric_description( + "active_connections" + ).startswith("Number of active") + assert generator.get_cluster_metric_unit("unknown") == "" + + +@pytest.mark.unit +def test_get_setting_unit_and_category(generator: PostgresReportGenerator) -> None: + assert generator.get_setting_unit("shared_buffers") == "8kB" + assert generator.get_setting_category("shared_buffers") == "Memory" + assert generator.get_setting_unit("nonexistent") == "" + assert generator.get_setting_category("nonexistent") == "Other" + + +@pytest.mark.unit +def test_format_report_data_structure(generator: PostgresReportGenerator) -> None: + host = "db-1" + payload = generator.format_report_data("A002", {"foo": "bar"}, host) + + assert payload["checkId"] == "A002" + # Newer reporter returns a 'nodes' structure instead of legacy 'hosts'. 
+ assert payload["nodes"]["primary"] == host + assert payload["results"][host]["data"] == {"foo": "bar"} diff --git a/tests/reporter/test_generators_unit.py b/tests/reporter/test_generators_unit.py new file mode 100644 index 0000000..0c3f5e1 --- /dev/null +++ b/tests/reporter/test_generators_unit.py @@ -0,0 +1,1078 @@ +import json +import sys +from datetime import datetime, timedelta +from typing import Any, Callable + +import pytest + +from reporter import postgres_reports as postgres_reports_module +from reporter.postgres_reports import PostgresReportGenerator + + +@pytest.fixture(name="generator") +def fixture_generator() -> PostgresReportGenerator: + return PostgresReportGenerator( + prometheus_url="http://prom.test", + postgres_sink_url="", + ) + + +def _success_metric(value: str) -> dict[str, Any]: + return { + "status": "success", + "data": { + "result": [ + { + "value": [datetime.now().timestamp(), value], + } + ] + }, + } + + +def _query_stub_factory(prom_result, mapping: dict[str, Any]) -> Callable[[str], dict[str, Any]]: + """Return a query_instant stub that matches substrings defined in mapping keys. + + Args: + prom_result: Fallback callable that returns a default Prometheus response + mapping: Dict mapping query substrings to responses (either dict or callable) + + Returns: + A callable that takes a query string and returns a Prometheus-like response + """ + + def _fake(query: str) -> dict[str, Any]: + for needle, payload in mapping.items(): + if needle in query: + return payload(query) if callable(payload) else payload + return prom_result() + + return _fake + + +@pytest.mark.unit +def test_query_instant_hits_prometheus( + monkeypatch: pytest.MonkeyPatch, + generator: PostgresReportGenerator, +) -> None: + captured: dict[str, Any] = {} + + class DummyResponse: + status_code = 200 + text = "{}" + + @staticmethod + def json() -> dict[str, Any]: + return {"status": "success", "data": {"result": []}} + + def fake_get( + url: str, + params: dict[str, Any] | None = None, + timeout: int | None = None, + ): + captured["url"] = url + captured["params"] = params + return DummyResponse() + + monkeypatch.setattr(postgres_reports_module.requests, "get", fake_get) + + payload = generator.query_instant("up") + + assert payload["status"] == "success" + assert captured["url"].endswith("/api/v1/query") + assert captured["params"] == {"query": "up"} + + +@pytest.mark.unit +def test_query_range_hits_prometheus( + monkeypatch: pytest.MonkeyPatch, + generator: PostgresReportGenerator, +) -> None: + start = datetime(2024, 1, 1, 0, 0, 0) + end = start + timedelta(minutes=5) + captured: dict[str, Any] = {} + + class DummyResponse: + status_code = 200 + text = "{}" + + @staticmethod + def json() -> dict[str, Any]: + return {"status": "success", "data": {"result": []}} + + def fake_get( + url: str, + params: dict[str, Any] | None = None, + timeout: int | None = None, + ): + captured["url"] = url + captured["params"] = params + return DummyResponse() + + monkeypatch.setattr(postgres_reports_module.requests, "get", fake_get) + + payload = generator.query_range("up", start, end, step="60s") + + assert payload == [] + assert captured["url"].endswith("/api/v1/query_range") + assert captured["params"]["query"] == "up" + assert captured["params"]["start"] == start.timestamp() + + +@pytest.mark.unit +def test_generate_a002_version_report( + monkeypatch: pytest.MonkeyPatch, + generator: PostgresReportGenerator, +) -> None: + values = { + "server_version": "15.3", + "server_version_num": "150003", + 
"max_connections": "200", + "shared_buffers": "1024", + "effective_cache_size": "2048", + } + + def fake_query(query: str) -> dict[str, Any]: + # A002 uses a helper that queries both settings via a single regex selector. + if 'setting_name=~"server_version|server_version_num"' in query: + return { + "status": "success", + "data": { + "result": [ + { + "metric": { + "setting_name": "server_version", + "setting_value": values["server_version"], + } + }, + { + "metric": { + "setting_name": "server_version_num", + "setting_value": values["server_version_num"], + } + }, + ] + }, + } + return {"status": "success", "data": {"result": []}} + + monkeypatch.setattr(generator, "query_instant", fake_query) + + report = generator.generate_a002_version_report("local", "node-1") + version = report["results"]["node-1"]["data"]["version"] + + assert version["version"] == "15.3" + assert version["server_major_ver"] == "15" + assert version["server_minor_ver"] == "3" + + +@pytest.mark.unit +def test_generate_a004_cluster_report( + monkeypatch: pytest.MonkeyPatch, + generator: PostgresReportGenerator, +) -> None: + def fake_query(query: str) -> dict[str, Any]: + if "pgwatch_db_size_size_b" in query and "sum(" not in query: + return { + "status": "success", + "data": { + "result": [ + {"metric": {"datname": "db1"}, "value": [0, "1024"]}, + {"metric": {"datname": "db2"}, "value": [0, "2048"]}, + ] + }, + } + return _success_metric("42") + + monkeypatch.setattr(generator, "query_instant", fake_query) + + report = generator.generate_a004_cluster_report("local", "node-1") + data = report["results"]["node-1"]["data"] + + assert "general_info" in data and "database_sizes" in data + assert data["general_info"]["active_connections"]["value"] == "42" + assert data["database_sizes"] == {"db1": 1024.0, "db2": 2048.0} + + +@pytest.mark.unit +def test_prometheus_to_dict_and_process_pgss(generator: PostgresReportGenerator) -> None: + base_time = datetime(2024, 1, 1, 0, 0, 0) + later_time = base_time + timedelta(seconds=60) + + def make_metric(name: str, value: float, ts: datetime) -> dict[str, Any]: + return { + "metric": { + "__name__": name, + "datname": "db1", + "queryid": "123", + "user": "postgres", + "instance": "inst1", + }, + "values": [[ts.timestamp(), str(value)]], + } + + start_metrics = [ + make_metric("pgwatch_pg_stat_statements_calls", 10, base_time), + make_metric("pgwatch_pg_stat_statements_exec_time_total", 1000, base_time), + make_metric("pgwatch_pg_stat_statements_rows", 200, base_time), + ] + end_metrics = [ + make_metric("pgwatch_pg_stat_statements_calls", 40, later_time), + make_metric("pgwatch_pg_stat_statements_exec_time_total", 4000, later_time), + make_metric("pgwatch_pg_stat_statements_rows", 260, later_time), + ] + + mapping = { + "calls": "calls", + "exec_time_total": "total_time", + "rows": "rows", + } + + rows = generator._process_pgss_data( + start_metrics, + end_metrics, + base_time, + later_time, + mapping, + ) + + assert len(rows) == 1 + row = rows[0] + assert row["calls"] == 30 + assert row["total_time"] == 3000 + assert pytest.approx(row["total_time_per_sec"], 0.01) == 50 + assert row["rows_per_call"] == pytest.approx(2.0) + + +@pytest.mark.unit +def test_prometheus_to_dict_closest_value(generator: PostgresReportGenerator) -> None: + reference_time = datetime(2024, 1, 1, 12, 0, 0) + + prom_data: list[dict[str, Any]] = [ + { + "metric": { + "__name__": "pgwatch_pg_stat_statements_calls", + "datname": "db1", + "queryid": "q1", + "user": "postgres", + "instance": "inst1", + }, + "values": [ 
+ [reference_time.timestamp() - 10, "10"], + [reference_time.timestamp() + 5, "20"], + ], + } + ] + + converted = generator._prometheus_to_dict(prom_data, reference_time) + + key = ("db1", "q1", "postgres", "inst1") + assert key in converted + assert converted[key]["calls"] == 20 + + +@pytest.mark.unit +def test_generate_a003_settings_report(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + def fake_query(query: str) -> dict[str, Any]: + assert "pgwatch_settings_configured" in query + return { + "status": "success", + "data": { + "result": [ + { + "metric": { + "setting_name": "shared_buffers", + "setting_value": "128", + "category": "Memory", + "unit": "8kB", + "context": "postmaster", + "vartype": "integer", + } + }, + { + "metric": { + "setting_name": "work_mem", + "setting_value": "512", + "category": "Memory", + "unit": "", + "context": "user", + "vartype": "integer", + } + }, + ] + }, + } + + monkeypatch.setattr(generator, "query_instant", fake_query) + + report = generator.generate_a003_settings_report("local", "node-1") + data = report["results"]["node-1"]["data"] + + assert data["shared_buffers"]["pretty_value"] == "1 MB" + assert data["work_mem"]["unit"] == "" + assert data["work_mem"]["category"] == "Memory" + + +@pytest.mark.unit +def test_generate_a007_altered_settings_report(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + def fake_query(query: str) -> dict[str, Any]: + # Handle version info query from _get_postgres_version_info + if 'setting_name=~"server_version|server_version_num"' in query: + return { + "status": "success", + "data": { + "result": [ + {"metric": {"setting_name": "server_version", "setting_value": "15.0"}}, + {"metric": {"setting_name": "server_version_num", "setting_value": "150000"}}, + ] + }, + } + # Handle altered settings query + assert "pgwatch_settings_is_default" in query + return { + "status": "success", + "data": { + "result": [ + { + "metric": { + "setting_name": "work_mem", + "setting_value": "1024", + "unit": "", + "category": "Memory", + } + }, + { + "metric": { + "setting_name": "autovacuum", + "setting_value": "off", + "unit": "", + "category": "Autovacuum", + } + }, + ] + }, + } + + monkeypatch.setattr(generator, "query_instant", fake_query) + + payload = generator.generate_a007_altered_settings_report("local", "node-1") + data = payload["results"]["node-1"]["data"] + + assert set(data.keys()) == {"work_mem", "autovacuum"} + assert "postgres_version" in payload["results"]["node-1"] # postgres_version is at node level + assert data["work_mem"]["pretty_value"] == "1 MB" + assert data["autovacuum"]["pretty_value"] == "off" + + +@pytest.mark.unit +def test_get_all_databases_merges_sources(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + def fake_query(query: str) -> dict[str, Any]: + if "wraparound" in query: + return { + "status": "success", + "data": { + "result": [ + {"metric": {"datname": "appdb"}, "value": [0, "1"]}, + {"metric": {"datname": "template0"}, "value": [0, "1"]}, + ] + }, + } + if "unused_indexes" in query: + return { + "status": "success", + "data": { + "result": [ + {"metric": {"dbname": "analytics"}, "value": [0, "1"]}, + {"metric": {"dbname": "appdb"}, "value": [0, "1"]}, + ] + }, + } + if "redundant_indexes" in query: + return { + "status": "success", + "data": { + "result": [ + {"metric": {"dbname": "warehouse"}, "value": [0, "1"]}, + ] + }, + } + if "pg_btree_bloat_bloat_pct" in query: + return { + "status": "success", + "data": 
{ + "result": [ + {"metric": {"datname": "inventory"}, "value": [0, "1"]}, + ] + }, + } + return {"status": "success", "data": {"result": []}} + + monkeypatch.setattr(generator, "query_instant", fake_query) + + databases = generator.get_all_databases("local", "node-1") + + assert databases == ["appdb", "analytics", "warehouse", "inventory"] + + +@pytest.mark.unit +def test_check_pg_stat_kcache_status(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, prom_result) -> None: + responses = { + "pgwatch_pg_stat_kcache_exec_total_time": prom_result( + [ + { + "metric": {"queryid": "1", "tag_user": "postgres"}, + "value": [0, "10"], + } + ] + ), + "pgwatch_pg_stat_kcache_exec_user_time": prom_result([{"metric": {}, "value": [0, "4"]}]), + "pgwatch_pg_stat_kcache_exec_system_time": prom_result([{"metric": {}, "value": [0, "6"]}]), + } + monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) + + status = generator._check_pg_stat_kcache_status("local", "node-1") + + assert status["extension_available"] is True + assert status["metrics_count"] == 1 + assert status["total_exec_time"] == 10.0 + assert status["total_user_time"] == 4.0 + assert status["sample_queries"][0]["queryid"] == "1" + + +@pytest.mark.unit +def test_check_pg_stat_statements_status(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, prom_result) -> None: + response = prom_result( + [ + { + "metric": {"queryid": "1", "tag_user": "postgres", "datname": "db1"}, + "value": [0, "5"], + } + ] + ) + monkeypatch.setattr(generator, "query_instant", lambda query: response) + + status = generator._check_pg_stat_statements_status("local", "node-1") + + assert status["extension_available"] is True + assert status["metrics_count"] == 1 + assert status["total_calls"] == 5.0 + assert status["sample_queries"][0]["database"] == "db1" + + +@pytest.mark.unit +def test_generate_h001_invalid_indexes_report( + monkeypatch: pytest.MonkeyPatch, + generator: PostgresReportGenerator, + prom_result, +) -> None: + monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["maindb"]) + + responses = { + "pgwatch_pg_invalid_indexes": prom_result( + [ + { + "metric": { + "schema_name": "public", + "table_name": "tbl", + "index_name": "idx_invalid", + "relation_name": "public.tbl", + "supports_fk": "1", + }, + "value": [0, "2048"], + } + ] + ) + } + monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) + + payload = generator.generate_h001_invalid_indexes_report("local", "node-1") + db_data = payload["results"]["node-1"]["data"]["maindb"] + + assert db_data["total_count"] == 1 + assert db_data["total_size_bytes"] == 2048.0 + entry = db_data["invalid_indexes"][0] + assert entry["index_name"] == "idx_invalid" + assert entry["index_size_pretty"].endswith("KB") + assert entry["supports_fk"] is True + + +@pytest.mark.unit +def test_generate_h002_unused_indexes_report( + monkeypatch: pytest.MonkeyPatch, + generator: PostgresReportGenerator, + prom_result, +) -> None: + monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["app"]) + monkeypatch.setattr(generator, "get_index_definitions_from_sink", lambda db: {"idx_unused": "CREATE INDEX idx_unused ON t(c)"}) + + responses = { + "pgwatch_db_stats_postmaster_uptime_s": prom_result([{"value": [0, "3600"]}]), + "pgwatch_stats_reset_stats_reset_epoch": prom_result([{"value": [0, "1700000000"]}]), + "pgwatch_unused_indexes_index_size_bytes": prom_result( + [ + { + "metric": { + 
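+                        # Label set mirrors the pgwatch unused_indexes metric as exposed via Prometheus;
+                        # boolean-ish flags (idx_is_btree, supports_fk) arrive as strings and must be coerced.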
"schema_name": "public", + "table_name": "tbl", + "index_name": "idx_unused", + "reason": "never scanned", + "idx_is_btree": "true", + "supports_fk": "0", + }, + "value": [0, "1024"], + } + ] + ), + "pgwatch_unused_indexes_idx_scan": prom_result([{"value": [0, "0"]}]), + } + monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) + + payload = generator.generate_h002_unused_indexes_report("local", "node-1") + db_data = payload["results"]["node-1"]["data"]["app"] + + assert db_data["total_count"] == 1 + unused = db_data["unused_indexes"][0] + assert unused["index_definition"].startswith("CREATE INDEX") + assert unused["idx_scan"] == 0 + assert unused["index_size_pretty"].endswith("KB") + stats_reset = db_data["stats_reset"] + assert stats_reset["stats_reset_epoch"] == 1700000000.0 + assert stats_reset["postmaster_startup_epoch"] is not None + + +@pytest.mark.unit +def test_generate_h004_redundant_indexes_report( + monkeypatch: pytest.MonkeyPatch, + generator: PostgresReportGenerator, + prom_result, +) -> None: + monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["app"]) + monkeypatch.setattr(generator, "get_index_definitions_from_sink", lambda db: {"idx_dup": "CREATE INDEX idx_dup ON t(c)"}) + + responses = { + "pgwatch_redundant_indexes_index_size_bytes": prom_result( + [ + { + "metric": { + "schema_name": "public", + "table_name": "tbl", + "index_name": "idx_dup", + "relation_name": "public.tbl", + "access_method": "btree", + "reason": "covers columns", + }, + "value": [0, "4096"], + } + ] + ), + "pgwatch_redundant_indexes_table_size_bytes": prom_result([{"value": [0, "8192"]}]), + "pgwatch_redundant_indexes_index_usage": prom_result([{"value": [0, "2"]}]), + "pgwatch_redundant_indexes_supports_fk": prom_result([{"value": [0, "1"]}]), + } + monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) + + payload = generator.generate_h004_redundant_indexes_report("local", "node-1") + db_data = payload["results"]["node-1"]["data"]["app"] + + assert db_data["total_count"] == 1 + redundant = db_data["redundant_indexes"][0] + assert redundant["index_definition"].startswith("CREATE INDEX") + assert redundant["index_usage"] == 2.0 + assert redundant["index_size_pretty"].endswith("KB") + assert redundant["supports_fk"] is True + + +@pytest.mark.unit +def test_generate_d004_pgstat_settings_report( + monkeypatch: pytest.MonkeyPatch, + generator: PostgresReportGenerator, + prom_result, +) -> None: + responses = { + "pgwatch_settings_configured": prom_result( + [ + { + "metric": { + "setting_name": "pg_stat_statements.max", + "setting_value": "1000", + "category": "Stats", + "unit": "", + "context": "postmaster", + "vartype": "integer", + } + } + ] + ) + } + monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) + monkeypatch.setattr(generator, "_check_pg_stat_kcache_status", lambda *args, **kwargs: {"extension_available": True}) + monkeypatch.setattr(generator, "_check_pg_stat_statements_status", lambda *args, **kwargs: {"extension_available": False}) + + payload = generator.generate_d004_pgstat_settings_report("local", "node-1") + data = payload["results"]["node-1"]["data"] + + assert "pg_stat_statements.max" in data["settings"] + assert data["pg_stat_kcache_status"]["extension_available"] is True + + +@pytest.mark.unit +def test_generate_f001_autovacuum_settings_report( + monkeypatch: pytest.MonkeyPatch, + generator: PostgresReportGenerator, + prom_result, +) -> 
None: + responses = { + "pgwatch_settings_configured": prom_result( + [ + { + "metric": { + "setting_name": "autovacuum_naptime", + "setting_value": "60", + "category": "Autovacuum", + "unit": "", + "context": "sighup", + "vartype": "integer", + } + } + ] + ) + } + monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) + + payload = generator.generate_f001_autovacuum_settings_report("local", "node-1") + data = payload["results"]["node-1"]["data"] + + assert data["autovacuum_naptime"]["setting"] == "60" + assert data["autovacuum_naptime"]["pretty_value"] == "1 min" + + +@pytest.mark.unit +def test_generate_f005_btree_bloat_report( + monkeypatch: pytest.MonkeyPatch, + generator: PostgresReportGenerator, + prom_result, +) -> None: + monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) + + responses = { + "pgwatch_pg_btree_bloat_extra_size": prom_result( + [ + { + "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, + "value": [0, "1024"], + } + ] + ), + "pgwatch_pg_btree_bloat_extra_pct": prom_result( + [ + { + "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, + "value": [0, "20"], + } + ] + ), + "pgwatch_pg_btree_bloat_bloat_size": prom_result( + [ + { + "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, + "value": [0, "2048"], + } + ] + ), + "pgwatch_pg_btree_bloat_bloat_pct": prom_result( + [ + { + "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, + "value": [0, "50"], + } + ] + ), + } + monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) + + payload = generator.generate_f005_btree_bloat_report("local", "node-1") + db_data = payload["results"]["node-1"]["data"]["db1"] + entry = db_data["bloated_indexes"][0] + + assert entry["extra_size"] == 1024.0 + assert entry["bloat_pct"] == 50.0 + assert entry["bloat_size_pretty"].endswith("KB") + + +@pytest.mark.unit +def test_get_pgss_metrics_data_by_db_invokes_all_metrics(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + captured: list[str] = [] + + def fake_query_range(query: str, start, end, step: str = "30s") -> list[dict]: + captured.append(query) + return [] + + monkeypatch.setattr(generator, "query_range", fake_query_range) + sentinel = [{"result": "ok"}] + monkeypatch.setattr(generator, "_process_pgss_data", lambda *args, **kwargs: sentinel) + + start = datetime(2024, 1, 1, 0, 0, 0) + end = start + timedelta(hours=1) + result = generator._get_pgss_metrics_data_by_db("local", "node-1", "db1", start, end) + + assert result == sentinel + # Ensure at least one representative metric was queried with filters + assert any("pgwatch_pg_stat_statements_calls" in q for q in captured) + + +@pytest.mark.unit +def test_generate_all_reports_invokes_every_builder(monkeypatch: pytest.MonkeyPatch) -> None: + generator = PostgresReportGenerator() + called: list[str] = [] + + def stub(name: str): + def _(*args, **kwargs): + called.append(name) + return {name: True} + + return _ + + builders = [ + "generate_a002_version_report", + "generate_a003_settings_report", + "generate_a004_cluster_report", + "generate_a007_altered_settings_report", + "generate_d004_pgstat_settings_report", + "generate_f001_autovacuum_settings_report", + "generate_f004_heap_bloat_report", + "generate_f005_btree_bloat_report", + "generate_g001_memory_settings_report", + "generate_h001_invalid_indexes_report", + "generate_h002_unused_indexes_report", + 
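+        # (report codes asserted below are derived from these names via split("_")[1].upper())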
"generate_h004_redundant_indexes_report", + "generate_k001_query_calls_report", + "generate_k003_top_queries_report", + ] + + for name in builders: + monkeypatch.setattr(generator, name, stub(name)) + + reports = generator.generate_all_reports("local", "node-1") + + assert set(reports.keys()) == {code.split("_")[1].upper() for code in builders} + assert set(called) == set(builders) + + +@pytest.mark.unit +def test_create_report_uses_api(monkeypatch: pytest.MonkeyPatch) -> None: + generator = PostgresReportGenerator() + payloads: list[dict] = [] + + def fake_make_request(api_url, endpoint, request_data): + payloads.append({"endpoint": endpoint, "data": request_data}) + return {"report_id": 42} + + monkeypatch.setattr(postgres_reports_module, "make_request", fake_make_request) + + report_id = generator.create_report("https://api", "tok", "proj", "123") + + assert report_id == 42 + assert payloads[0]["endpoint"] == "/rpc/checkup_report_create" + assert payloads[0]["data"]["project"] == "proj" + + +@pytest.mark.unit +def test_upload_report_file_sends_contents(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None: + generator = PostgresReportGenerator() + captured: dict = {} + + def fake_make_request(api_url, endpoint, request_data): + captured["endpoint"] = endpoint + captured["data"] = request_data + return {} + + monkeypatch.setattr(postgres_reports_module, "make_request", fake_make_request) + + report_file = tmp_path / "A002_report.json" + report_file.write_text('{"foo": "bar"}', encoding="utf-8") + + generator.upload_report_file("https://api", "tok", 100, str(report_file)) + + assert captured["endpoint"] == "/rpc/checkup_report_file_post" + assert captured["data"]["check_id"] == "A002" + assert captured["data"]["filename"] == report_file.name + + +@pytest.mark.unit +def test_main_runs_specific_check_without_upload(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: + class DummyGenerator: + DEFAULT_EXCLUDED_DATABASES = {'template0', 'template1', 'rdsadmin', 'azure_maintenance', 'cloudsqladmin'} + + def __init__(self, *args, **kwargs): + self.closed = False + + def get_all_clusters(self): + # Match current reporter.main() behavior which always calls + # get_all_clusters() when cluster is not explicitly provided. + return ["local"] + + def test_connection(self) -> bool: + return True + + def generate_a002_version_report(self, cluster, node_name): + return {"checkId": "A002", "results": {node_name: {"data": {"ok": True}}}} + + def close_postgres_sink(self): + self.closed = True + + monkeypatch.setattr(postgres_reports_module, "PostgresReportGenerator", DummyGenerator) + monkeypatch.setattr(sys, "argv", ["postgres_reports.py", "--check-id", "A002", "--output", "-", "--no-upload"]) + + postgres_reports_module.main() + + captured = capsys.readouterr().out + + # main() prints progress banners along with the JSON payload. + # Extract the JSON object from the captured stdout by finding the + # first line that looks like JSON and joining from there. 
+ lines = captured.splitlines() + start_idx = 0 + for i, line in enumerate(lines): + if line.strip().startswith("{"): + start_idx = i + break + json_str = "\n".join(lines[start_idx:]) + + output = json.loads(json_str) + assert output["checkId"] == "A002" + assert "results" in output + + +@pytest.mark.unit +def test_main_exits_when_connection_fails(monkeypatch: pytest.MonkeyPatch) -> None: + class FailingGenerator: + DEFAULT_EXCLUDED_DATABASES = {'template0', 'template1', 'rdsadmin', 'azure_maintenance', 'cloudsqladmin'} + + def __init__(self, *args, **kwargs): + pass + + def test_connection(self) -> bool: + return False + + monkeypatch.setattr(postgres_reports_module, "PostgresReportGenerator", FailingGenerator) + monkeypatch.setattr(sys, "argv", ["postgres_reports.py", "--check-id", "A002"]) + + with pytest.raises(SystemExit): + postgres_reports_module.main() + + +# ============================================================================ +# Negative test cases - Error handling +# ============================================================================ + + +@pytest.mark.unit +def test_query_instant_handles_http_404_error(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + """Test that query_instant returns empty dict on HTTP 404 error.""" + class MockResponse: + status_code = 404 + text = "Not Found" + + def json(self): + return {"error": "not found"} + + def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): + return MockResponse() + + monkeypatch.setattr("requests.get", fake_get) + + result = generator.query_instant("test_query") + + assert result == {} + + +@pytest.mark.unit +def test_query_instant_handles_http_500_error(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + """Test that query_instant returns empty dict on HTTP 500 error.""" + class MockResponse: + status_code = 500 + text = "Internal Server Error" + + def json(self): + raise ValueError("Invalid JSON") + + def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): + return MockResponse() + + monkeypatch.setattr("requests.get", fake_get) + + result = generator.query_instant("test_query") + + assert result == {} + + +@pytest.mark.unit +def test_query_instant_handles_timeout(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + """Test that query_instant returns empty dict on request timeout.""" + import requests + + def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): + raise requests.Timeout("Connection timed out") + + monkeypatch.setattr("requests.get", fake_get) + + result = generator.query_instant("test_query") + + assert result == {} + + +@pytest.mark.unit +def test_query_instant_handles_connection_error(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + """Test that query_instant returns empty dict on connection error.""" + import requests + + def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): + raise requests.ConnectionError("Failed to establish connection") + + monkeypatch.setattr("requests.get", fake_get) + + result = generator.query_instant("test_query") + + assert result == {} + + +@pytest.mark.unit +def test_query_instant_handles_malformed_json(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + """Test that query_instant returns empty dict when response has invalid JSON.""" + class MockResponse: + status_code = 200 + + def 
json(self): + raise ValueError("Invalid JSON") + + def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): + return MockResponse() + + monkeypatch.setattr("requests.get", fake_get) + + result = generator.query_instant("test_query") + + assert result == {} + + +@pytest.mark.unit +def test_query_range_handles_http_error(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + """Test that query_range returns empty list on HTTP error.""" + class MockResponse: + status_code = 503 + text = "Service Unavailable" + + def json(self): + return {"error": "service unavailable"} + + def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): + return MockResponse() + + monkeypatch.setattr("requests.get", fake_get) + + start = datetime.now() + end = start + timedelta(hours=1) + result = generator.query_range("test_query", start, end) + + assert result == [] + + +@pytest.mark.unit +def test_query_range_handles_timeout(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + """Test that query_range returns empty list on timeout.""" + import requests + + def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): + raise requests.Timeout("Connection timed out") + + monkeypatch.setattr("requests.get", fake_get) + + start = datetime.now() + end = start + timedelta(hours=1) + result = generator.query_range("test_query", start, end) + + assert result == [] + + +@pytest.mark.unit +def test_query_range_handles_malformed_response(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + """Test that query_range handles response with missing expected fields.""" + class MockResponse: + status_code = 200 + + def json(self): + # Missing 'data' or 'result' fields + return {"status": "success"} + + def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): + return MockResponse() + + monkeypatch.setattr("requests.get", fake_get) + + start = datetime.now() + end = start + timedelta(hours=1) + result = generator.query_range("test_query", start, end) + + assert result == [] + + +@pytest.mark.unit +def test_query_range_handles_failed_status(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: + """Test that query_range handles Prometheus error status.""" + class MockResponse: + status_code = 200 + + def json(self): + return { + "status": "error", + "errorType": "bad_data", + "error": "invalid query" + } + + def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): + return MockResponse() + + monkeypatch.setattr("requests.get", fake_get) + + start = datetime.now() + end = start + timedelta(hours=1) + result = generator.query_range("test_query", start, end) + + assert result == [] + + +@pytest.mark.unit +def test_make_request_raises_on_http_error(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that make_request raises exception on HTTP error.""" + class MockResponse: + status_code = 400 + + def raise_for_status(self): + import requests + raise requests.HTTPError("400 Client Error") + + def json(self): + return {} + + def fake_post(url: str, json: dict[str, Any] | None = None): + return MockResponse() + + monkeypatch.setattr("requests.post", fake_post) + + import requests + with pytest.raises(requests.HTTPError): + postgres_reports_module.make_request("http://api.test", "/endpoint", {"data": "test"}) + + +@pytest.mark.unit +def 
test_make_request_raises_on_connection_error(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that make_request raises exception on connection error.""" + import requests + + def fake_post(url: str, json: dict[str, Any] | None = None): + raise requests.ConnectionError("Connection failed") + + monkeypatch.setattr("requests.post", fake_post) + + with pytest.raises(requests.ConnectionError): + postgres_reports_module.make_request("http://api.test", "/endpoint", {"data": "test"}) diff --git a/tests/reporter/test_postgres_integration.py b/tests/reporter/test_postgres_integration.py new file mode 100644 index 0000000..414d009 --- /dev/null +++ b/tests/reporter/test_postgres_integration.py @@ -0,0 +1,75 @@ +import json +from datetime import datetime, timezone +from typing import Callable, Tuple + +import pytest + +from reporter.postgres_reports import PostgresReportGenerator + +Seeder = Callable[[str, str, str], None] + + +@pytest.fixture(scope="function") +def sink_index_data(postgresql) -> Tuple[str, Seeder]: + conn = postgresql + conn.autocommit = True + cur = conn.cursor() + cur.execute( + """ + create table if not exists public.index_definitions ( + time timestamptz not null, + dbname text not null, + data jsonb not null, + tag_data jsonb + ) + """ + ) + + def seed(dbname: str, index_name: str, index_def: str) -> None: + payload = { + "indexrelname": index_name, + "index_definition": index_def, + "schemaname": "public", + "relname": "tbl", + } + with conn.cursor() as seed_cur: + seed_cur.execute( + ( + "insert into public.index_definitions " + "(time, dbname, data) values (%s, %s, %s::jsonb)" + ), + (datetime.now(timezone.utc), dbname, json.dumps(payload)), + ) + + host = conn.info.host or conn.info.hostaddr or "localhost" + port = conn.info.port + user = conn.info.user + dbname = conn.info.dbname + dsn = f"postgresql://{user}@{host}:{port}/{dbname}" + + yield dsn, seed + + cur.execute("truncate table public.index_definitions") + cur.close() + + +@pytest.mark.integration +@pytest.mark.requires_postgres +def test_get_index_definitions_from_sink(sink_index_data) -> None: + dsn, seed = sink_index_data + seed("db1", "idx_users", "CREATE INDEX idx_users ON users(id)") + seed("db2", "idx_orders", "CREATE INDEX idx_orders ON orders(id)") + + generator = PostgresReportGenerator( + prometheus_url="http://unused", + postgres_sink_url=dsn, + ) + assert generator.connect_postgres_sink() + + definitions = generator.get_index_definitions_from_sink() + + assert definitions["db1.idx_users"] == "CREATE INDEX idx_users ON users(id)" + assert definitions["db2.idx_orders"] == "CREATE INDEX idx_orders ON orders(id)" + + generator.close_postgres_sink() + assert generator.pg_conn is None