From fb19bd965fce659ca6250e475368afbaa4e4392a Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 19:27:27 -0700 Subject: [PATCH 01/14] added documentation from pganalyze --- .github/workflows/aws-rds-opentofu.yml | 114 ++++++++++++++++++++ .github/workflows/gcp-cloudsql-opentofu.yml | 111 +++++++++++++++++++ pgFirstAid.sql | 68 ++++++++++++ testing/pgTAP/03_high_tests.sql | 29 ++++- view_pgFirstAid.sql | 62 +++++++++++ view_pgFirstAid_managed.sql | 62 +++++++++++ 6 files changed, 445 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/aws-rds-opentofu.yml create mode 100644 .github/workflows/gcp-cloudsql-opentofu.yml diff --git a/.github/workflows/aws-rds-opentofu.yml b/.github/workflows/aws-rds-opentofu.yml new file mode 100644 index 0000000..f6b1f13 --- /dev/null +++ b/.github/workflows/aws-rds-opentofu.yml @@ -0,0 +1,114 @@ +name: AWS RDS OpenTofu + +on: + workflow_dispatch: + inputs: + action: + description: "OpenTofu action" + required: true + type: choice + default: plan + options: + - plan + - apply + - destroy + postgres_version: + description: "Target PostgreSQL version" + required: true + type: choice + default: pg18 + options: + - pg15 + - pg16 + - pg17 + - pg18 + allowed_cidr_block: + description: "CIDR allowed to connect (example: 203.0.113.10/32). Leave blank to use AWS_ALLOWED_CIDR_BLOCK secret." 
+ required: false + type: string + aws_region: + description: "AWS region" + required: true + type: string + default: us-west-2 + +concurrency: + group: aws-rds-${{ inputs.postgres_version }} + cancel-in-progress: false + +jobs: + opentofu: + name: ${{ inputs.action }} ${{ inputs.postgres_version }} + runs-on: [self-hosted, linux, pgfirstaid-ci] + permissions: + contents: read + id-token: write + defaults: + run: + working-directory: testing/aws/deploy/${{ inputs.postgres_version }} + + env: + TF_IN_AUTOMATION: "true" + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + clean: false + + - name: Setup OpenTofu + uses: opentofu/setup-opentofu@v1 + + - name: Configure AWS credentials (OIDC role) + if: ${{ secrets.AWS_ROLE_TO_ASSUME != '' }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ inputs.aws_region }} + + - name: Configure AWS credentials (access keys) + if: ${{ secrets.AWS_ROLE_TO_ASSUME == '' }} + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-session-token: ${{ secrets.AWS_SESSION_TOKEN }} + aws-region: ${{ inputs.aws_region }} + + - name: Resolve allowed CIDR + shell: bash + run: | + CIDR="${{ inputs.allowed_cidr_block }}" + if [ -z "$CIDR" ]; then + CIDR="${{ secrets.AWS_ALLOWED_CIDR_BLOCK }}" + fi + + if [ -z "$CIDR" ]; then + echo "::error::No allowed CIDR provided. Set input 'allowed_cidr_block' or secret 'AWS_ALLOWED_CIDR_BLOCK'." 
+ exit 1 + fi + + echo "TF_VAR_allowed_cidr_block=$CIDR" >> "$GITHUB_ENV" + + - name: OpenTofu init + run: tofu init -input=false + + - name: OpenTofu validate + run: tofu validate + + - name: OpenTofu plan + if: ${{ inputs.action == 'plan' || inputs.action == 'apply' }} + run: tofu plan -input=false -out=tfplan + + - name: OpenTofu apply + if: ${{ inputs.action == 'apply' }} + run: tofu apply -input=false -auto-approve tfplan + + - name: Show endpoint + if: ${{ inputs.action == 'apply' }} + run: | + echo "RDS endpoint: $(tofu output -raw endpoint)" + + - name: OpenTofu destroy + if: ${{ inputs.action == 'destroy' }} + run: tofu destroy -input=false -auto-approve diff --git a/.github/workflows/gcp-cloudsql-opentofu.yml b/.github/workflows/gcp-cloudsql-opentofu.yml new file mode 100644 index 0000000..72bec76 --- /dev/null +++ b/.github/workflows/gcp-cloudsql-opentofu.yml @@ -0,0 +1,111 @@ +name: GCP Cloud SQL OpenTofu + +on: + workflow_dispatch: + inputs: + action: + description: "OpenTofu action" + required: true + type: choice + default: plan + options: + - plan + - apply + - destroy + postgres_version: + description: "Target PostgreSQL version" + required: true + type: choice + default: pg18 + options: + - pg15 + - pg16 + - pg17 + - pg18 + personal_ip: + description: "IP or CIDR allowed to connect (example: 203.0.113.10/32). Leave blank to use GCP_PERSONAL_IP secret." 
+ required: false + type: string + +concurrency: + group: gcp-cloudsql-${{ inputs.postgres_version }} + cancel-in-progress: false + +jobs: + opentofu: + name: ${{ inputs.action }} ${{ inputs.postgres_version }} + runs-on: [self-hosted, linux, pgfirstaid-ci] + permissions: + contents: read + id-token: write + defaults: + run: + working-directory: testing/gcp/deploy/${{ inputs.postgres_version }} + + env: + TF_IN_AUTOMATION: "true" + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + clean: false + + - name: Authenticate to Google Cloud (OIDC) + if: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER != '' && secrets.GCP_SERVICE_ACCOUNT != '' }} + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + + - name: Authenticate to Google Cloud (service account key) + if: ${{ !(secrets.GCP_WORKLOAD_IDENTITY_PROVIDER != '' && secrets.GCP_SERVICE_ACCOUNT != '') }} + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.GCP_CREDENTIALS_JSON }} + + - name: Setup gcloud CLI + uses: google-github-actions/setup-gcloud@v2 + + - name: Setup OpenTofu + uses: opentofu/setup-opentofu@v1 + + - name: Resolve personal IP + shell: bash + run: | + PERSONAL_IP="${{ inputs.personal_ip }}" + if [ -z "$PERSONAL_IP" ]; then + PERSONAL_IP="${{ secrets.GCP_PERSONAL_IP }}" + fi + + if [ -z "$PERSONAL_IP" ]; then + echo "::error::No personal IP provided. Set input 'personal_ip' or secret 'GCP_PERSONAL_IP'." 
+ exit 1 + fi + + echo "TF_VAR_personal_ip=$PERSONAL_IP" >> "$GITHUB_ENV" + + - name: OpenTofu init + run: tofu init -input=false + + - name: OpenTofu validate + run: tofu validate + + - name: OpenTofu plan + if: ${{ inputs.action == 'plan' || inputs.action == 'apply' }} + run: tofu plan -input=false -out=tfplan + + - name: OpenTofu apply + if: ${{ inputs.action == 'apply' }} + run: tofu apply -input=false -auto-approve tfplan + + - name: Show connection details + if: ${{ inputs.action == 'apply' }} + run: | + echo "Instance: $(tofu output -raw instance_name)" + echo "Connection: $(tofu output -raw instance_connection_name)" + echo "Public IP: $(tofu output -raw public_ip_address)" + + - name: OpenTofu destroy + if: ${{ inputs.action == 'destroy' }} + run: tofu destroy -input=false -auto-approve diff --git a/pgFirstAid.sql b/pgFirstAid.sql index 05795a9..af9d439 100644 --- a/pgFirstAid.sql +++ b/pgFirstAid.sql @@ -421,6 +421,74 @@ select 3 as severity_order from bq; +-- MEDIUM: Deadlocks since stats reset +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Deadlocks Since Stats Reset' as check_name, + psd.datname as object_name, + 'Deadlocks have been detected since the last PostgreSQL statistics reset' as issue_description, + 'Deadlocks: ' || psd.deadlocks || ', Stats reset: ' || psd.stats_reset as current_value, + 'Capture deadlock details in logs and review transaction ordering to reduce lock cycles' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ + https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, + 3 as severity_order +from + pg_stat_database psd +where + psd.datname not in ('template0', 'template1') + and psd.deadlocks > 0; +-- MEDIUM: Deadlock rate per hour +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Deadlock Rate Per 
Hour' as check_name, + psd.datname as object_name, + 'Deadlocks are occurring often relative to time since stats reset' as issue_description, + 'Deadlocks/hour: ' || round( + psd.deadlocks / GREATEST(EXTRACT(EPOCH FROM (now() - psd.stats_reset)) / 3600.0, 1), + 3 + ) || ', Deadlocks: ' || psd.deadlocks as current_value, + 'Investigate recent lock contention and reduce transaction overlap on the same rows' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ + https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, + 3 as severity_order +from + pg_stat_database psd +where + psd.datname not in ('template0', 'template1') + and psd.deadlocks > 0; +-- MEDIUM: Deadlocks compared to transaction volume +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Deadlocks as Percent of Transactions' as check_name, + psd.datname as object_name, + 'Deadlocks are impacting a measurable share of transaction volume' as issue_description, + 'Deadlocks: ' || psd.deadlocks || ', Total transactions: ' || (psd.xact_commit + psd.xact_rollback) || + ', Deadlock percent: ' || round( + 100.0 * psd.deadlocks / NULLIF(psd.xact_commit + psd.xact_rollback, 0), + 4 + ) || '%' as current_value, + 'Review application retry logic and lock acquisition patterns for frequently conflicting transactions' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ + https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, + 3 as severity_order +from + pg_stat_database psd +where + psd.datname not in ('template0', 'template1') + and psd.deadlocks > 0 + and (psd.xact_commit + psd.xact_rollback) > 0; -- MEDIUM: Tables with outdated statistics insert into diff --git a/testing/pgTAP/03_high_tests.sql 
b/testing/pgTAP/03_high_tests.sql index 1893e30..27cd5cc 100644 --- a/testing/pgTAP/03_high_tests.sql +++ b/testing/pgTAP/03_high_tests.sql @@ -1,5 +1,5 @@ BEGIN; -SELECT plan(16); +SELECT plan(22); SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Current Blocked/Blocking Queries'), @@ -10,6 +10,33 @@ SELECT ok( 'View executes Current Blocked/Blocking Queries check' ); +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Deadlocks Since Stats Reset'), + 'Function executes Deadlocks Since Stats Reset check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Deadlocks Since Stats Reset'), + 'View executes Deadlocks Since Stats Reset check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Deadlock Rate Per Hour'), + 'Function executes Deadlock Rate Per Hour check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Deadlock Rate Per Hour'), + 'View executes Deadlock Rate Per Hour check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Deadlocks as Percent of Transactions'), + 'Function executes Deadlocks as Percent of Transactions check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Deadlocks as Percent of Transactions'), + 'View executes Deadlocks as Percent of Transactions check' +); + SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Outdated Statistics'), 'Function executes Outdated Statistics check' diff --git a/view_pgFirstAid.sql b/view_pgFirstAid.sql index 6f2f774..ce1450b 100644 --- a/view_pgFirstAid.sql +++ b/view_pgFirstAid.sql @@ -381,6 +381,68 @@ select from bq) union all +-- MEDIUM: Deadlocks since stats reset +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Deadlocks Since Stats Reset' as check_name, + psd.datname as object_name, + 'Deadlocks have been detected since the last PostgreSQL statistics reset' as issue_description, 
+ 'Deadlocks: ' || psd.deadlocks || ', Stats reset: ' || psd.stats_reset as current_value, + 'Capture deadlock details in logs and review transaction ordering to reduce lock cycles' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ + https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, + 3 as severity_order +from + pg_stat_database psd +where + psd.datname not in ('template0', 'template1') + and psd.deadlocks > 0 +union all +-- MEDIUM: Deadlock rate per hour +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Deadlock Rate Per Hour' as check_name, + psd.datname as object_name, + 'Deadlocks are occurring often relative to time since stats reset' as issue_description, + 'Deadlocks/hour: ' || round( + psd.deadlocks / GREATEST(EXTRACT(EPOCH FROM (now() - psd.stats_reset)) / 3600.0, 1), + 3 + ) || ', Deadlocks: ' || psd.deadlocks as current_value, + 'Investigate recent lock contention and reduce transaction overlap on the same rows' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ + https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, + 3 as severity_order +from + pg_stat_database psd +where + psd.datname not in ('template0', 'template1') + and psd.deadlocks > 0 +union all +-- MEDIUM: Deadlocks compared to transaction volume +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Deadlocks as Percent of Transactions' as check_name, + psd.datname as object_name, + 'Deadlocks are impacting a measurable share of transaction volume' as issue_description, + 'Deadlocks: ' || psd.deadlocks || ', Total transactions: ' || (psd.xact_commit + psd.xact_rollback) || + ', Deadlock percent: ' || round( + 100.0 * psd.deadlocks / NULLIF(psd.xact_commit + psd.xact_rollback, 0), + 4 + ) || '%' 
as current_value, + 'Review application retry logic and lock acquisition patterns for frequently conflicting transactions' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ + https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, + 3 as severity_order +from + pg_stat_database psd +where + psd.datname not in ('template0', 'template1') + and psd.deadlocks > 0 + and (psd.xact_commit + psd.xact_rollback) > 0 +union all -- MEDIUM: Tables with outdated statistics (with s as ( select diff --git a/view_pgFirstAid_managed.sql b/view_pgFirstAid_managed.sql index e60202c..8b00822 100644 --- a/view_pgFirstAid_managed.sql +++ b/view_pgFirstAid_managed.sql @@ -375,6 +375,68 @@ select from bq) union all +-- MEDIUM: Deadlocks since stats reset +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Deadlocks Since Stats Reset' as check_name, + psd.datname as object_name, + 'Deadlocks have been detected since the last PostgreSQL statistics reset' as issue_description, + 'Deadlocks: ' || psd.deadlocks || ', Stats reset: ' || psd.stats_reset as current_value, + 'Capture deadlock details in logs and review transaction ordering to reduce lock cycles' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ + https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, + 3 as severity_order +from + pg_stat_database psd +where + psd.datname not in ('template0', 'template1') + and psd.deadlocks > 0 +union all +-- MEDIUM: Deadlock rate per hour +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Deadlock Rate Per Hour' as check_name, + psd.datname as object_name, + 'Deadlocks are occurring often relative to time since stats reset' as issue_description, + 'Deadlocks/hour: ' || round( + psd.deadlocks / 
GREATEST(EXTRACT(EPOCH FROM (now() - psd.stats_reset)) / 3600.0, 1), + 3 + ) || ', Deadlocks: ' || psd.deadlocks as current_value, + 'Investigate recent lock contention and reduce transaction overlap on the same rows' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ + https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, + 3 as severity_order +from + pg_stat_database psd +where + psd.datname not in ('template0', 'template1') + and psd.deadlocks > 0 +union all +-- MEDIUM: Deadlocks compared to transaction volume +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Deadlocks as Percent of Transactions' as check_name, + psd.datname as object_name, + 'Deadlocks are impacting a measurable share of transaction volume' as issue_description, + 'Deadlocks: ' || psd.deadlocks || ', Total transactions: ' || (psd.xact_commit + psd.xact_rollback) || + ', Deadlock percent: ' || round( + 100.0 * psd.deadlocks / NULLIF(psd.xact_commit + psd.xact_rollback, 0), + 4 + ) || '%' as current_value, + 'Review application retry logic and lock acquisition patterns for frequently conflicting transactions' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ + https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, + 3 as severity_order +from + pg_stat_database psd +where + psd.datname not in ('template0', 'template1') + and psd.deadlocks > 0 + and (psd.xact_commit + psd.xact_rollback) > 0 +union all -- MEDIUM: Tables with outdated statistics (with s as ( select From bdfcac1342efee8ba6065a691bb5aa078d463ea0 Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 19:29:43 -0700 Subject: [PATCH 02/14] added documentation from pganalyze --- pgFirstAid.sql | 68 --------------------------------- 
testing/pgTAP/03_high_tests.sql | 29 +------------- view_pgFirstAid.sql | 62 ------------------------------ view_pgFirstAid_managed.sql | 62 ------------------------------ 4 files changed, 1 insertion(+), 220 deletions(-) diff --git a/pgFirstAid.sql b/pgFirstAid.sql index af9d439..05795a9 100644 --- a/pgFirstAid.sql +++ b/pgFirstAid.sql @@ -421,74 +421,6 @@ select 3 as severity_order from bq; --- MEDIUM: Deadlocks since stats reset -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Deadlocks Since Stats Reset' as check_name, - psd.datname as object_name, - 'Deadlocks have been detected since the last PostgreSQL statistics reset' as issue_description, - 'Deadlocks: ' || psd.deadlocks || ', Stats reset: ' || psd.stats_reset as current_value, - 'Capture deadlock details in logs and review transaction ordering to reduce lock cycles' as recommended_action, - 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ - https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, - 3 as severity_order -from - pg_stat_database psd -where - psd.datname not in ('template0', 'template1') - and psd.deadlocks > 0; --- MEDIUM: Deadlock rate per hour -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Deadlock Rate Per Hour' as check_name, - psd.datname as object_name, - 'Deadlocks are occurring often relative to time since stats reset' as issue_description, - 'Deadlocks/hour: ' || round( - psd.deadlocks / GREATEST(EXTRACT(EPOCH FROM (now() - psd.stats_reset)) / 3600.0, 1), - 3 - ) || ', Deadlocks: ' || psd.deadlocks as current_value, - 'Investigate recent lock contention and reduce transaction overlap on the same rows' as recommended_action, - 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ - 
https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, - 3 as severity_order -from - pg_stat_database psd -where - psd.datname not in ('template0', 'template1') - and psd.deadlocks > 0; --- MEDIUM: Deadlocks compared to transaction volume -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Deadlocks as Percent of Transactions' as check_name, - psd.datname as object_name, - 'Deadlocks are impacting a measurable share of transaction volume' as issue_description, - 'Deadlocks: ' || psd.deadlocks || ', Total transactions: ' || (psd.xact_commit + psd.xact_rollback) || - ', Deadlock percent: ' || round( - 100.0 * psd.deadlocks / NULLIF(psd.xact_commit + psd.xact_rollback, 0), - 4 - ) || '%' as current_value, - 'Review application retry logic and lock acquisition patterns for frequently conflicting transactions' as recommended_action, - 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ - https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, - 3 as severity_order -from - pg_stat_database psd -where - psd.datname not in ('template0', 'template1') - and psd.deadlocks > 0 - and (psd.xact_commit + psd.xact_rollback) > 0; -- MEDIUM: Tables with outdated statistics insert into diff --git a/testing/pgTAP/03_high_tests.sql b/testing/pgTAP/03_high_tests.sql index 27cd5cc..1893e30 100644 --- a/testing/pgTAP/03_high_tests.sql +++ b/testing/pgTAP/03_high_tests.sql @@ -1,5 +1,5 @@ BEGIN; -SELECT plan(22); +SELECT plan(16); SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Current Blocked/Blocking Queries'), @@ -10,33 +10,6 @@ SELECT ok( 'View executes Current Blocked/Blocking Queries check' ); -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Deadlocks Since Stats Reset'), - 'Function executes Deadlocks Since Stats Reset 
check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Deadlocks Since Stats Reset'), - 'View executes Deadlocks Since Stats Reset check' -); - -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Deadlock Rate Per Hour'), - 'Function executes Deadlock Rate Per Hour check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Deadlock Rate Per Hour'), - 'View executes Deadlock Rate Per Hour check' -); - -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Deadlocks as Percent of Transactions'), - 'Function executes Deadlocks as Percent of Transactions check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Deadlocks as Percent of Transactions'), - 'View executes Deadlocks as Percent of Transactions check' -); - SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Outdated Statistics'), 'Function executes Outdated Statistics check' diff --git a/view_pgFirstAid.sql b/view_pgFirstAid.sql index ce1450b..6f2f774 100644 --- a/view_pgFirstAid.sql +++ b/view_pgFirstAid.sql @@ -381,68 +381,6 @@ select from bq) union all --- MEDIUM: Deadlocks since stats reset -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Deadlocks Since Stats Reset' as check_name, - psd.datname as object_name, - 'Deadlocks have been detected since the last PostgreSQL statistics reset' as issue_description, - 'Deadlocks: ' || psd.deadlocks || ', Stats reset: ' || psd.stats_reset as current_value, - 'Capture deadlock details in logs and review transaction ordering to reduce lock cycles' as recommended_action, - 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ - https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, - 3 as severity_order -from - pg_stat_database psd -where - psd.datname not in ('template0', 'template1') - and 
psd.deadlocks > 0 -union all --- MEDIUM: Deadlock rate per hour -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Deadlock Rate Per Hour' as check_name, - psd.datname as object_name, - 'Deadlocks are occurring often relative to time since stats reset' as issue_description, - 'Deadlocks/hour: ' || round( - psd.deadlocks / GREATEST(EXTRACT(EPOCH FROM (now() - psd.stats_reset)) / 3600.0, 1), - 3 - ) || ', Deadlocks: ' || psd.deadlocks as current_value, - 'Investigate recent lock contention and reduce transaction overlap on the same rows' as recommended_action, - 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ - https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, - 3 as severity_order -from - pg_stat_database psd -where - psd.datname not in ('template0', 'template1') - and psd.deadlocks > 0 -union all --- MEDIUM: Deadlocks compared to transaction volume -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Deadlocks as Percent of Transactions' as check_name, - psd.datname as object_name, - 'Deadlocks are impacting a measurable share of transaction volume' as issue_description, - 'Deadlocks: ' || psd.deadlocks || ', Total transactions: ' || (psd.xact_commit + psd.xact_rollback) || - ', Deadlock percent: ' || round( - 100.0 * psd.deadlocks / NULLIF(psd.xact_commit + psd.xact_rollback, 0), - 4 - ) || '%' as current_value, - 'Review application retry logic and lock acquisition patterns for frequently conflicting transactions' as recommended_action, - 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ - https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, - 3 as severity_order -from - pg_stat_database psd -where - psd.datname not in ('template0', 'template1') - and psd.deadlocks > 0 - and (psd.xact_commit + 
psd.xact_rollback) > 0 -union all -- MEDIUM: Tables with outdated statistics (with s as ( select diff --git a/view_pgFirstAid_managed.sql b/view_pgFirstAid_managed.sql index 8b00822..e60202c 100644 --- a/view_pgFirstAid_managed.sql +++ b/view_pgFirstAid_managed.sql @@ -375,68 +375,6 @@ select from bq) union all --- MEDIUM: Deadlocks since stats reset -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Deadlocks Since Stats Reset' as check_name, - psd.datname as object_name, - 'Deadlocks have been detected since the last PostgreSQL statistics reset' as issue_description, - 'Deadlocks: ' || psd.deadlocks || ', Stats reset: ' || psd.stats_reset as current_value, - 'Capture deadlock details in logs and review transaction ordering to reduce lock cycles' as recommended_action, - 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ - https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, - 3 as severity_order -from - pg_stat_database psd -where - psd.datname not in ('template0', 'template1') - and psd.deadlocks > 0 -union all --- MEDIUM: Deadlock rate per hour -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Deadlock Rate Per Hour' as check_name, - psd.datname as object_name, - 'Deadlocks are occurring often relative to time since stats reset' as issue_description, - 'Deadlocks/hour: ' || round( - psd.deadlocks / GREATEST(EXTRACT(EPOCH FROM (now() - psd.stats_reset)) / 3600.0, 1), - 3 - ) || ', Deadlocks: ' || psd.deadlocks as current_value, - 'Investigate recent lock contention and reduce transaction overlap on the same rows' as recommended_action, - 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ - https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, - 3 as severity_order -from - pg_stat_database psd -where - psd.datname 
not in ('template0', 'template1') - and psd.deadlocks > 0 -union all --- MEDIUM: Deadlocks compared to transaction volume -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Deadlocks as Percent of Transactions' as check_name, - psd.datname as object_name, - 'Deadlocks are impacting a measurable share of transaction volume' as issue_description, - 'Deadlocks: ' || psd.deadlocks || ', Total transactions: ' || (psd.xact_commit + psd.xact_rollback) || - ', Deadlock percent: ' || round( - 100.0 * psd.deadlocks / NULLIF(psd.xact_commit + psd.xact_rollback, 0), - 4 - ) || '%' as current_value, - 'Review application retry logic and lock acquisition patterns for frequently conflicting transactions' as recommended_action, - 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW \ - https://pganalyze.com/blog/postgresql-log-monitoring-101-deadlocks-checkpoints-blocked-queries' as documentation_link, - 3 as severity_order -from - pg_stat_database psd -where - psd.datname not in ('template0', 'template1') - and psd.deadlocks > 0 - and (psd.xact_commit + psd.xact_rollback) > 0 -union all -- MEDIUM: Tables with outdated statistics (with s as ( select From 963ca38c5ea2de5c094b0dc4cedaf17076ab51e5 Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 19:37:25 -0700 Subject: [PATCH 03/14] adding top 10 expensive queries --- pgFirstAid.sql | 44 ++++++++++++++++++++++++++++++++- testing/pgTAP/03_high_tests.sql | 11 ++++++++- view_pgFirstAid.sql | 40 ++++++++++++++++++++++++++++++ view_pgFirstAid_managed.sql | 40 ++++++++++++++++++++++++++++++ 4 files changed, 133 insertions(+), 2 deletions(-) diff --git a/pgFirstAid.sql b/pgFirstAid.sql index 05795a9..ac140b4 100644 --- a/pgFirstAid.sql +++ b/pgFirstAid.sql @@ -674,8 +674,50 @@ where and now() - query_start > interval '5 minutes' order by (now() - query_start) desc; +-- MEDIUM: Top 10 expensive active queries by runtime +with eq as ( +select + pgs.pid, + pgs.usename, + 
pgs.datname, + pgs.client_addr, + now() - pgs.query_start as runtime, + pgs.query +from + pg_stat_activity pgs +where + pgs.state = 'active' + and pgs.query_start is not null + and pgs.pid <> pg_backend_pid() + and now() - pgs.query_start > interval '30 seconds' +order by + runtime desc +limit 10) +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Expensive Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || eq.pid::text, + 'usename: ' || eq.usename, + 'datname: ' || eq.datname, + 'client_address: ' || coalesce(eq.client_addr::text, 'local'), + 'runtime: ' || to_char(eq.runtime, 'HH24:MI:SS') + ) as object_name, + 'Top 10 active queries running longer than 30 seconds, ordered by runtime. Long-running active queries can signal lock waits, missing indexes, or inefficient plans' as issue_description, + left(regexp_replace(eq.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Review these queries with EXPLAIN (ANALYZE, BUFFERS) and reduce lock waits or full scans' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + eq; -- LOW: Missing indexes on foreign keys - insert + insert into health_results select diff --git a/testing/pgTAP/03_high_tests.sql b/testing/pgTAP/03_high_tests.sql index 1893e30..0e68453 100644 --- a/testing/pgTAP/03_high_tests.sql +++ b/testing/pgTAP/03_high_tests.sql @@ -1,5 +1,5 @@ BEGIN; -SELECT plan(16); +SELECT plan(18); SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Current Blocked/Blocking Queries'), @@ -46,6 +46,15 @@ SELECT ok( 'View executes Excessive Sequential Scans check' ); +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top 10 Expensive Active Queries'), + 
'Function executes Top 10 Expensive Active Queries check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top 10 Expensive Active Queries'), + 'View executes Top 10 Expensive Active Queries check' +); + SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Table with more than 50 columns'), 'Function executes Table with more than 50 columns check' diff --git a/view_pgFirstAid.sql b/view_pgFirstAid.sql index 6f2f774..865a953 100644 --- a/view_pgFirstAid.sql +++ b/view_pgFirstAid.sql @@ -614,6 +614,46 @@ where state = 'active' and now() - query_start > interval '5 minutes' union all +-- MEDIUM: Top 10 expensive active queries by runtime +(with eq as ( +select + pgs.pid, + pgs.usename, + pgs.datname, + pgs.client_addr, + now() - pgs.query_start as runtime, + pgs.query +from + pg_stat_activity pgs +where + pgs.state = 'active' + and pgs.query_start is not null + and pgs.pid <> pg_backend_pid() + and now() - pgs.query_start > interval '30 seconds' +order by + runtime desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Expensive Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || eq.pid::text, + 'usename: ' || eq.usename, + 'datname: ' || eq.datname, + 'client_address: ' || coalesce(eq.client_addr::text, 'local'), + 'runtime: ' || to_char(eq.runtime, 'HH24:MI:SS') + ) as object_name, + 'Top 10 active queries running longer than 30 seconds, ordered by runtime. 
Long-running active queries can signal lock waits, missing indexes, or inefficient plans' as issue_description, + left(regexp_replace(eq.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Review these queries with EXPLAIN (ANALYZE, BUFFERS) and reduce lock waits or full scans' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + eq) +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT diff --git a/view_pgFirstAid_managed.sql b/view_pgFirstAid_managed.sql index e60202c..404432e 100644 --- a/view_pgFirstAid_managed.sql +++ b/view_pgFirstAid_managed.sql @@ -608,6 +608,46 @@ where state = 'active' and now() - query_start > interval '5 minutes' union all +-- MEDIUM: Top 10 expensive active queries by runtime +(with eq as ( +select + pgs.pid, + pgs.usename, + pgs.datname, + pgs.client_addr, + now() - pgs.query_start as runtime, + pgs.query +from + pg_stat_activity pgs +where + pgs.state = 'active' + and pgs.query_start is not null + and pgs.pid <> pg_backend_pid() + and now() - pgs.query_start > interval '30 seconds' +order by + runtime desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Expensive Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || eq.pid::text, + 'usename: ' || eq.usename, + 'datname: ' || eq.datname, + 'client_address: ' || coalesce(eq.client_addr::text, 'local'), + 'runtime: ' || to_char(eq.runtime, 'HH24:MI:SS') + ) as object_name, + 'Top 10 active queries running longer than 30 seconds, ordered by runtime. 
Long-running active queries can signal lock waits, missing indexes, or inefficient plans' as issue_description, + left(regexp_replace(eq.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Review these queries with EXPLAIN (ANALYZE, BUFFERS) and reduce lock waits or full scans' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + eq) +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT From 4bab234a27ddf95090ff30ec0bd0272553592e11 Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 19:43:31 -0700 Subject: [PATCH 04/14] resolving premature workflow executions --- .github/workflows/integration-pg-matrix.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-pg-matrix.yml b/.github/workflows/integration-pg-matrix.yml index 8adc7eb..0238b78 100644 --- a/.github/workflows/integration-pg-matrix.yml +++ b/.github/workflows/integration-pg-matrix.yml @@ -2,6 +2,7 @@ name: Python + pgTAP Integration (PG15-PG18) on: pull_request: + types: [opened] paths: - pgFirstAid.sql - view_pgFirstAid.sql @@ -9,7 +10,6 @@ on: - testing/integration/** - testing/pgTAP/** - .github/workflows/integration-pg-matrix.yml - workflow_dispatch: concurrency: group: integration-${{ github.ref }} From 558d8756512aac83e2202e16a3e0a429a6184266 Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 20:04:39 -0700 Subject: [PATCH 05/14] high impact, quick win query health checks --- pgFirstAid.sql | 96 +++++++++++++++++++++++++++++++++ testing/pgTAP/03_high_tests.sql | 29 +++++++++- view_pgFirstAid.sql | 90 +++++++++++++++++++++++++++++++ view_pgFirstAid_managed.sql | 90 +++++++++++++++++++++++++++++++ 4 files changed, 304 insertions(+), 1
deletion(-) diff --git a/pgFirstAid.sql b/pgFirstAid.sql index ac140b4..38d2060 100644 --- a/pgFirstAid.sql +++ b/pgFirstAid.sql @@ -716,6 +716,102 @@ select 3 as severity_order from eq; +-- MEDIUM: Top 10 queries by total execution time +with pss as ( +select + queryid, + query, + calls, + total_exec_time, + mean_exec_time, + rows +from + pg_stat_statements +where + calls > 0 +order by + total_exec_time desc +limit 10) +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Total Execution Time' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description, + 'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pss; +-- MEDIUM: High mean execution time queries +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Mean Execution Time Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', 
query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and pss.mean_exec_time > 100 +order by + pss.mean_exec_time desc +limit 10; +-- MEDIUM: Top 10 queries with temp block spills +with pss as ( +select + queryid, + query, + calls, + temp_blks_read, + temp_blks_written, + total_exec_time +from + pg_stat_statements +where + (temp_blks_read + temp_blks_written) > 0 +order by + (temp_blks_read + temp_blks_written) desc +limit 10) +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Temp Block Spills' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description, + 'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read || + ', temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' || + round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pss; -- LOW: Missing indexes on foreign keys insert into diff --git 
a/testing/pgTAP/03_high_tests.sql b/testing/pgTAP/03_high_tests.sql index 0e68453..53ad70a 100644 --- a/testing/pgTAP/03_high_tests.sql +++ b/testing/pgTAP/03_high_tests.sql @@ -1,5 +1,5 @@ BEGIN; -SELECT plan(18); +SELECT plan(24); SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Current Blocked/Blocking Queries'), @@ -55,6 +55,33 @@ SELECT ok( 'View executes Top 10 Expensive Active Queries check' ); +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top 10 Queries by Total Execution Time'), + 'Function executes Top 10 Queries by Total Execution Time check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top 10 Queries by Total Execution Time'), + 'View executes Top 10 Queries by Total Execution Time check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Mean Execution Time Queries'), + 'Function executes High Mean Execution Time Queries check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Mean Execution Time Queries'), + 'View executes High Mean Execution Time Queries check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top 10 Queries by Temp Block Spills'), + 'Function executes Top 10 Queries by Temp Block Spills check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top 10 Queries by Temp Block Spills'), + 'View executes Top 10 Queries by Temp Block Spills check' +); + SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Table with more than 50 columns'), 'Function executes Table with more than 50 columns check' diff --git a/view_pgFirstAid.sql b/view_pgFirstAid.sql index 865a953..ed45a4c 100644 --- a/view_pgFirstAid.sql +++ b/view_pgFirstAid.sql @@ -654,6 +654,96 @@ select from eq) union all +-- MEDIUM: Top 10 queries by total execution time +(with pss as ( +select + queryid, + query, + calls, + total_exec_time, + 
mean_exec_time, + rows +from + pg_stat_statements +where + calls > 0 +order by + total_exec_time desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Total Execution Time' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description, + 'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pss) +union all +-- MEDIUM: High mean execution time queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Mean Execution Time Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + 
https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and pss.mean_exec_time > 100 +order by + pss.mean_exec_time desc +limit 10 +union all +-- MEDIUM: Top 10 queries with temp block spills +(with pss as ( +select + queryid, + query, + calls, + temp_blks_read, + temp_blks_written, + total_exec_time +from + pg_stat_statements +where + (temp_blks_read + temp_blks_written) > 0 +order by + (temp_blks_read + temp_blks_written) desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Temp Block Spills' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description, + 'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read || + ', temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' || + round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pss) +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT diff --git a/view_pgFirstAid_managed.sql b/view_pgFirstAid_managed.sql index 404432e..ac79967 100644 --- a/view_pgFirstAid_managed.sql +++ b/view_pgFirstAid_managed.sql @@ -648,6 +648,96 @@ select from eq) union all +-- MEDIUM: Top 10 queries by total execution time +(with pss as ( +select + queryid, + query, + calls, + 
total_exec_time, + mean_exec_time, + rows +from + pg_stat_statements +where + calls > 0 +order by + total_exec_time desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Total Execution Time' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description, + 'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pss) +union all +-- MEDIUM: High mean execution time queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Mean Execution Time Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + 
https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and pss.mean_exec_time > 100 +order by + pss.mean_exec_time desc +limit 10 +union all +-- MEDIUM: Top 10 queries with temp block spills +(with pss as ( +select + queryid, + query, + calls, + temp_blks_read, + temp_blks_written, + total_exec_time +from + pg_stat_statements +where + (temp_blks_read + temp_blks_written) > 0 +order by + (temp_blks_read + temp_blks_written) desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Temp Block Spills' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description, + 'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read || + ', temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' || + round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pss) +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT From 54e76a725c6b9b50ea61483f12fba1227d33bfdb Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 20:11:39 -0700 Subject: [PATCH 06/14] high impact, quick win query health checks part 2 --- pgFirstAid.sql | 101 ++++++++++++++++++++++++++++++++ testing/pgTAP/03_high_tests.sql | 29 ++++++++- view_pgFirstAid.sql | 95 
++++++++++++++++++++++++++++++ view_pgFirstAid_managed.sql | 95 ++++++++++++++++++++++++++++++ 4 files changed, 319 insertions(+), 1 deletion(-) diff --git a/pgFirstAid.sql b/pgFirstAid.sql index 38d2060..7761d76 100644 --- a/pgFirstAid.sql +++ b/pgFirstAid.sql @@ -812,6 +812,107 @@ select 3 as severity_order from pss; +-- MEDIUM: Low cache hit ratio queries +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Low Cache Hit Ratio Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description, + 'calls: ' || pss.calls || ', cache_hit_pct: ' || round( + 100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0), + 2 + ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and (pss.shared_blks_hit + pss.shared_blks_read) > 0 + and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90 +order by + (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc +limit 10; +-- MEDIUM: High runtime variance queries +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Runtime Variance Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High runtime variance can indicate plan 
instability, skewed data distribution, or parameter sensitivity' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' || + left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/routine-vacuuming.html \ + https://www.postgresql.org/docs/current/using-explain.html' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and pss.stddev_exec_time > pss.mean_exec_time +order by + pss.stddev_exec_time desc +limit 10; +-- MEDIUM: Lock-wait-heavy active queries +with lw as ( +select + pid, + usename, + datname, + client_addr, + wait_event, + query_start, + now() - query_start as runtime, + query +from + pg_stat_activity +where + state = 'active' + and wait_event_type = 'Lock' + and query_start is not null + and now() - query_start > interval '30 seconds' + and pid <> pg_backend_pid() +order by + runtime desc +limit 10) +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Lock-Wait-Heavy Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || lw.pid::text, + 'usename: ' || lw.usename, + 'datname: ' || lw.datname, + 'client_address: ' || coalesce(lw.client_addr::text, 'local'), + 'wait_event: ' || coalesce(lw.wait_event, 'unknown'), + 'runtime: ' || to_char(lw.runtime, 'HH24:MI:SS') + ) as object_name, + 'Active queries waiting on locks for extended time can block throughput and cause cascading latency' as issue_description, + left(regexp_replace(lw.query, E'[\n\r\t]+', ' ', 'g'), 500) as 
current_value, + 'Reduce transaction duration, enforce consistent lock ordering, and investigate blockers first' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/explicit-locking.html' as documentation_link, + 3 as severity_order +from + lw; -- LOW: Missing indexes on foreign keys insert into diff --git a/testing/pgTAP/03_high_tests.sql b/testing/pgTAP/03_high_tests.sql index 53ad70a..eea6928 100644 --- a/testing/pgTAP/03_high_tests.sql +++ b/testing/pgTAP/03_high_tests.sql @@ -1,5 +1,5 @@ BEGIN; -SELECT plan(24); +SELECT plan(30); SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Current Blocked/Blocking Queries'), @@ -82,6 +82,33 @@ SELECT ok( 'View executes Top 10 Queries by Temp Block Spills check' ); +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Low Cache Hit Ratio Queries'), + 'Function executes Low Cache Hit Ratio Queries check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Low Cache Hit Ratio Queries'), + 'View executes Low Cache Hit Ratio Queries check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Runtime Variance Queries'), + 'Function executes High Runtime Variance Queries check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Runtime Variance Queries'), + 'View executes High Runtime Variance Queries check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Lock-Wait-Heavy Active Queries'), + 'Function executes Lock-Wait-Heavy Active Queries check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Lock-Wait-Heavy Active Queries'), + 'View executes Lock-Wait-Heavy Active Queries check' +); + SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Table with more than 50 columns'), 'Function executes Table with 
more than 50 columns check' diff --git a/view_pgFirstAid.sql b/view_pgFirstAid.sql index ed45a4c..2de9485 100644 --- a/view_pgFirstAid.sql +++ b/view_pgFirstAid.sql @@ -744,6 +744,101 @@ select from pss) union all +-- MEDIUM: Low cache hit ratio queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Low Cache Hit Ratio Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description, + 'calls: ' || pss.calls || ', cache_hit_pct: ' || round( + 100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0), + 2 + ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and (pss.shared_blks_hit + pss.shared_blks_read) > 0 + and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90 +order by + (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc +limit 10 +union all +-- MEDIUM: High runtime variance queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Runtime Variance Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || 
round(pss.mean_exec_time::numeric, 2) || + ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' || + left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/routine-vacuuming.html \ + https://www.postgresql.org/docs/current/using-explain.html' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and pss.stddev_exec_time > pss.mean_exec_time +order by + pss.stddev_exec_time desc +limit 10 +union all +-- MEDIUM: Lock-wait-heavy active queries +(with lw as ( +select + pid, + usename, + datname, + client_addr, + wait_event, + query_start, + now() - query_start as runtime, + query +from + pg_stat_activity +where + state = 'active' + and wait_event_type = 'Lock' + and query_start is not null + and now() - query_start > interval '30 seconds' + and pid <> pg_backend_pid() +order by + runtime desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Lock-Wait-Heavy Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || lw.pid::text, + 'usename: ' || lw.usename, + 'datname: ' || lw.datname, + 'client_address: ' || coalesce(lw.client_addr::text, 'local'), + 'wait_event: ' || coalesce(lw.wait_event, 'unknown'), + 'runtime: ' || to_char(lw.runtime, 'HH24:MI:SS') + ) as object_name, + 'Active queries waiting on locks for extended time can block throughput and cause cascading latency' as issue_description, + left(regexp_replace(lw.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Reduce transaction duration, enforce consistent lock ordering, and investigate blockers first' as recommended_action, + 
'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/explicit-locking.html' as documentation_link, + 3 as severity_order +from + lw) +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT diff --git a/view_pgFirstAid_managed.sql b/view_pgFirstAid_managed.sql index ac79967..1f2fb98 100644 --- a/view_pgFirstAid_managed.sql +++ b/view_pgFirstAid_managed.sql @@ -738,6 +738,101 @@ select from pss) union all +-- MEDIUM: Low cache hit ratio queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Low Cache Hit Ratio Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description, + 'calls: ' || pss.calls || ', cache_hit_pct: ' || round( + 100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0), + 2 + ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and (pss.shared_blks_hit + pss.shared_blks_read) > 0 + and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90 +order by + (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc +limit 10 +union all +-- MEDIUM: High runtime variance queries +select + 'MEDIUM' as severity, + 'Query Health' 
as category, + 'High Runtime Variance Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' || + left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/routine-vacuuming.html \ + https://www.postgresql.org/docs/current/using-explain.html' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and pss.stddev_exec_time > pss.mean_exec_time +order by + pss.stddev_exec_time desc +limit 10 +union all +-- MEDIUM: Lock-wait-heavy active queries +(with lw as ( +select + pid, + usename, + datname, + client_addr, + wait_event, + query_start, + now() - query_start as runtime, + query +from + pg_stat_activity +where + state = 'active' + and wait_event_type = 'Lock' + and query_start is not null + and now() - query_start > interval '30 seconds' + and pid <> pg_backend_pid() +order by + runtime desc +limit 10) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Lock-Wait-Heavy Active Queries' as check_name, + concat_ws(' | ', + 'pid: ' || lw.pid::text, + 'usename: ' || lw.usename, + 'datname: ' || lw.datname, + 'client_address: ' || coalesce(lw.client_addr::text, 'local'), + 'wait_event: ' || coalesce(lw.wait_event, 'unknown'), + 'runtime: ' || to_char(lw.runtime, 'HH24:MI:SS') + ) as object_name, + 'Active queries waiting on locks for extended time can 
block throughput and cause cascading latency' as issue_description, + left(regexp_replace(lw.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Reduce transaction duration, enforce consistent lock ordering, and investigate blockers first' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/explicit-locking.html' as documentation_link, + 3 as severity_order +from + lw) +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT From 8790fe551047fc6c70b458e0c6d245160d235674 Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 21:15:09 -0700 Subject: [PATCH 07/14] more high level checks --- pgFirstAid.sql | 79 +++++++++++++++++++++++++++++++++ testing/pgTAP/03_high_tests.sql | 29 +++++++++++- view_pgFirstAid.sql | 73 ++++++++++++++++++++++++++++++ view_pgFirstAid_managed.sql | 73 ++++++++++++++++++++++++++++++ 4 files changed, 253 insertions(+), 1 deletion(-) diff --git a/pgFirstAid.sql b/pgFirstAid.sql index 7761d76..978fd86 100644 --- a/pgFirstAid.sql +++ b/pgFirstAid.sql @@ -913,6 +913,85 @@ select 3 as severity_order from lw; +-- MEDIUM: High calls low value queries +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Calls Low Value Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the 
application layer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 5000 + and pss.mean_exec_time <= 2 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2 +order by + pss.calls desc +limit 10; +-- MEDIUM: High rows per call queries +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Rows Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description, + 'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/queries-limit.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000 +order by + (pss.rows::numeric / NULLIF(pss.calls, 0)) desc +limit 10; +-- MEDIUM: High shared block reads per call queries +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Shared Block Reads Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High shared block reads per call usually points to heavy table or index scans and poor locality' as 
issue_description, + 'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) || + ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000 +order by + (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc +limit 10; -- LOW: Missing indexes on foreign keys insert into diff --git a/testing/pgTAP/03_high_tests.sql b/testing/pgTAP/03_high_tests.sql index eea6928..c966d00 100644 --- a/testing/pgTAP/03_high_tests.sql +++ b/testing/pgTAP/03_high_tests.sql @@ -1,5 +1,5 @@ BEGIN; -SELECT plan(30); +SELECT plan(36); SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Current Blocked/Blocking Queries'), @@ -109,6 +109,33 @@ SELECT ok( 'View executes Lock-Wait-Heavy Active Queries check' ); +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Calls Low Value Queries'), + 'Function executes High Calls Low Value Queries check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Calls Low Value Queries'), + 'View executes High Calls Low Value Queries check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Rows Per Call Queries'), + 'Function executes High Rows Per Call Queries check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE 
check_name = 'High Rows Per Call Queries'), + 'View executes High Rows Per Call Queries check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Shared Block Reads Per Call Queries'), + 'Function executes High Shared Block Reads Per Call Queries check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Shared Block Reads Per Call Queries'), + 'View executes High Shared Block Reads Per Call Queries check' +); + SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Table with more than 50 columns'), 'Function executes Table with more than 50 columns check' diff --git a/view_pgFirstAid.sql b/view_pgFirstAid.sql index 2de9485..6585d1d 100644 --- a/view_pgFirstAid.sql +++ b/view_pgFirstAid.sql @@ -839,6 +839,79 @@ select from lw) union all +-- MEDIUM: High calls low value queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Calls Low Value Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 5000 + and pss.mean_exec_time <= 2 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2 +order by + pss.calls desc 
+limit 10 +union all +-- MEDIUM: High rows per call queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Rows Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description, + 'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/queries-limit.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000 +order by + (pss.rows::numeric / NULLIF(pss.calls, 0)) desc +limit 10 +union all +-- MEDIUM: High shared block reads per call queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Shared Block Reads Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High shared block reads per call usually points to heavy table or index scans and poor locality' as issue_description, + 'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) || + ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per 
execution' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000 +order by + (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc +limit 10 +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT diff --git a/view_pgFirstAid_managed.sql b/view_pgFirstAid_managed.sql index 1f2fb98..f9fbb5d 100644 --- a/view_pgFirstAid_managed.sql +++ b/view_pgFirstAid_managed.sql @@ -833,6 +833,79 @@ select from lw) union all +-- MEDIUM: High calls low value queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Calls Low Value Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 5000 + and pss.mean_exec_time <= 2 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2 +order by + pss.calls desc +limit 10 +union all +-- 
MEDIUM: High rows per call queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Rows Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description, + 'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/queries-limit.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000 +order by + (pss.rows::numeric / NULLIF(pss.calls, 0)) desc +limit 10 +union all +-- MEDIUM: High shared block reads per call queries +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Shared Block Reads Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High shared block reads per call usually points to heavy table or index scans and poor locality' as issue_description, + 'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) || + ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as 
recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000 +order by + (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc +limit 10 +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT From b2bbe010494c5c8fef92dd9ac47b948168e283b8 Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 21:25:50 -0700 Subject: [PATCH 08/14] medium/low query health checks --- pgFirstAid.sql | 60 +++++++++++++++++++++++++++++++++ testing/pgTAP/03_high_tests.sql | 20 ++++++++++- view_pgFirstAid.sql | 56 ++++++++++++++++++++++++++++++ view_pgFirstAid_managed.sql | 56 ++++++++++++++++++++++++++++++ 4 files changed, 191 insertions(+), 1 deletion(-) diff --git a/pgFirstAid.sql b/pgFirstAid.sql index 978fd86..2e52cda 100644 --- a/pgFirstAid.sql +++ b/pgFirstAid.sql @@ -992,6 +992,66 @@ where order by (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc limit 10; +-- MEDIUM: Top queries by WAL bytes per call (PG16+) +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top Queries by WAL Bytes Per Call' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description, + 'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round( + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)), + 2 + ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as 
current_value, + 'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/wal-intro.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0 + and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576 +order by + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc +limit 10; +-- MEDIUM: Idle in transaction over 5 minutes +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Idle In Transaction Over 5 Minutes' as check_name, + concat_ws(' | ', + 'pid: ' || psa.pid::text, + 'usename: ' || psa.usename, + 'datname: ' || psa.datname, + 'client_address: ' || coalesce(psa.client_addr::text, 'local'), + 'idle_duration: ' || to_char(now() - psa.state_change, 'HH24:MI:SS') + ) as object_name, + 'Sessions left idle in transaction hold snapshots and locks longer than necessary, which can hurt query performance and vacuum progress' as issue_description, + left(regexp_replace(psa.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Commit or rollback promptly and move application processing outside transaction boundaries' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/routine-vacuuming.html' as documentation_link, + 3 as severity_order +from + pg_stat_activity psa +where + psa.state = 'idle in transaction' + and psa.state_change is not null + and now() - psa.state_change > interval '5 minutes' + and psa.pid <> pg_backend_pid() +order by + now() - psa.state_change desc; -- LOW: Missing indexes on foreign keys 
insert into diff --git a/testing/pgTAP/03_high_tests.sql b/testing/pgTAP/03_high_tests.sql index c966d00..129a465 100644 --- a/testing/pgTAP/03_high_tests.sql +++ b/testing/pgTAP/03_high_tests.sql @@ -1,5 +1,5 @@ BEGIN; -SELECT plan(36); +SELECT plan(40); SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Current Blocked/Blocking Queries'), @@ -136,6 +136,24 @@ SELECT ok( 'View executes High Shared Block Reads Per Call Queries check' ); +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top Queries by WAL Bytes Per Call'), + 'Function executes Top Queries by WAL Bytes Per Call check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top Queries by WAL Bytes Per Call'), + 'View executes Top Queries by WAL Bytes Per Call check' +); + +SELECT ok( + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Idle In Transaction Over 5 Minutes'), + 'Function executes Idle In Transaction Over 5 Minutes check' +); +SELECT ok( + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Idle In Transaction Over 5 Minutes'), + 'View executes Idle In Transaction Over 5 Minutes check' +); + SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Table with more than 50 columns'), 'Function executes Table with more than 50 columns check' diff --git a/view_pgFirstAid.sql b/view_pgFirstAid.sql index 6585d1d..6d21a5f 100644 --- a/view_pgFirstAid.sql +++ b/view_pgFirstAid.sql @@ -912,6 +912,62 @@ order by (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc limit 10 union all +-- MEDIUM: Top queries by WAL bytes per call (PG16+) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top Queries by WAL Bytes Per Call' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description, + 'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round( + 
((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)), + 2 + ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/wal-intro.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0 + and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576 +order by + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc +limit 10 +union all +-- MEDIUM: Idle in transaction over 5 minutes +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Idle In Transaction Over 5 Minutes' as check_name, + concat_ws(' | ', + 'pid: ' || psa.pid::text, + 'usename: ' || psa.usename, + 'datname: ' || psa.datname, + 'client_address: ' || coalesce(psa.client_addr::text, 'local'), + 'idle_duration: ' || to_char(now() - psa.state_change, 'HH24:MI:SS') + ) as object_name, + 'Sessions left idle in transaction hold snapshots and locks longer than necessary, which can hurt query performance and vacuum progress' as issue_description, + left(regexp_replace(psa.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Commit or rollback promptly and move application processing outside transaction boundaries' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/routine-vacuuming.html' as documentation_link, + 3 as severity_order 
+from + pg_stat_activity psa +where + psa.state = 'idle in transaction' + and psa.state_change is not null + and now() - psa.state_change > interval '5 minutes' + and psa.pid <> pg_backend_pid() +order by + now() - psa.state_change desc +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT diff --git a/view_pgFirstAid_managed.sql b/view_pgFirstAid_managed.sql index f9fbb5d..77383dd 100644 --- a/view_pgFirstAid_managed.sql +++ b/view_pgFirstAid_managed.sql @@ -906,6 +906,62 @@ order by (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc limit 10 union all +-- MEDIUM: Top queries by WAL bytes per call (PG16+) +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top Queries by WAL Bytes Per Call' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description, + 'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round( + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)), + 2 + ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/wal-intro.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order +from + pg_stat_statements pss +where + pss.calls >= 20 + and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0 + and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576 +order by + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) 
desc +limit 10 +union all +-- MEDIUM: Idle in transaction over 5 minutes +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Idle In Transaction Over 5 Minutes' as check_name, + concat_ws(' | ', + 'pid: ' || psa.pid::text, + 'usename: ' || psa.usename, + 'datname: ' || psa.datname, + 'client_address: ' || coalesce(psa.client_addr::text, 'local'), + 'idle_duration: ' || to_char(now() - psa.state_change, 'HH24:MI:SS') + ) as object_name, + 'Sessions left idle in transaction hold snapshots and locks longer than necessary, which can hurt query performance and vacuum progress' as issue_description, + left(regexp_replace(psa.query, E'[\n\r\t]+', ' ', 'g'), 500) as current_value, + 'Commit or rollback promptly and move application processing outside transaction boundaries' as recommended_action, + 'https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW \ + https://www.postgresql.org/docs/current/routine-vacuuming.html' as documentation_link, + 3 as severity_order +from + pg_stat_activity psa +where + psa.state = 'idle in transaction' + and psa.state_change is not null + and now() - psa.state_change > interval '5 minutes' + and psa.pid <> pg_backend_pid() +order by + now() - psa.state_change desc +union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( SELECT From 1870cc7f880e7b6f955f06e22100d688c53ea065 Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 22:07:40 -0700 Subject: [PATCH 09/14] revert changes, add pgstat check --- pgFirstAid.sql | 287 +++----------------------------- testing/pgTAP/03_high_tests.sql | 82 +-------- view_pgFirstAid.sql | 271 +++--------------------------- view_pgFirstAid_managed.sql | 271 +++--------------------------- 4 files changed, 83 insertions(+), 828 deletions(-) diff --git a/pgFirstAid.sql b/pgFirstAid.sql index 2e52cda..8e9b209 100644 --- a/pgFirstAid.sql +++ b/pgFirstAid.sql @@ -674,6 +674,32 @@ where and now() - query_start > interval '5 
minutes' order by (now() - query_start) desc; +-- MEDIUM: pg_stat_statements extension missing +insert + into + health_results +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'pg_stat_statements Extension Missing' as check_name, + 'pg_stat_statements' as object_name, + 'pg_stat_statements is not installed, so query fingerprint and workload-level performance checks are unavailable' as issue_description, + 'Extension not found in pg_extension' as current_value, + 'Self-hosted: add pg_stat_statements to shared_preload_libraries, restart PostgreSQL, then run CREATE EXTENSION pg_stat_statements; AWS RDS: add pg_stat_statements to the parameter group shared_preload_libraries, reboot, then CREATE EXTENSION; GCP Cloud SQL: enable cloudsql.enable_pg_stat_statements, restart if required, then CREATE EXTENSION; Azure Database for PostgreSQL: add pg_stat_statements to shared_preload_libraries, restart, then CREATE EXTENSION' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.PostgreSQL.CommonDBATasks.Extensions.html \ + https://cloud.google.com/sql/docs/postgres/flags \ + https://learn.microsoft.com/azure/postgresql/flexible-server/concepts-server-parameters' as documentation_link, + 3 as severity_order +where + not exists ( + select + 1 + from + pg_extension + where + extname = 'pg_stat_statements' + ); -- MEDIUM: Top 10 expensive active queries by runtime with eq as ( select @@ -716,158 +742,6 @@ select 3 as severity_order from eq; --- MEDIUM: Top 10 queries by total execution time -with pss as ( -select - queryid, - query, - calls, - total_exec_time, - mean_exec_time, - rows -from - pg_stat_statements -where - calls > 0 -order by - total_exec_time desc -limit 10) -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Top 10 Queries by Total Execution Time' as check_name, - 'queryid: ' || pss.queryid::text as 
object_name, - 'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description, - 'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || - ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pss; --- MEDIUM: High mean execution time queries -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Mean Execution Time Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description, - 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and pss.mean_exec_time > 100 -order by - pss.mean_exec_time desc -limit 10; --- MEDIUM: Top 10 queries 
with temp block spills -with pss as ( -select - queryid, - query, - calls, - temp_blks_read, - temp_blks_written, - total_exec_time -from - pg_stat_statements -where - (temp_blks_read + temp_blks_written) > 0 -order by - (temp_blks_read + temp_blks_written) desc -limit 10) -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Top 10 Queries by Temp Block Spills' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description, - 'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read || - ', temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' || - round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pss; --- MEDIUM: Low cache hit ratio queries -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Low Cache Hit Ratio Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description, - 'calls: ' || pss.calls || ', cache_hit_pct: ' || round( - 100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0), - 2 - ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 
'g'), 350) as current_value, - 'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and (pss.shared_blks_hit + pss.shared_blks_read) > 0 - and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90 -order by - (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc -limit 10; --- MEDIUM: High runtime variance queries -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Runtime Variance Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description, - 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) || - ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' || - left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/routine-vacuuming.html \ - https://www.postgresql.org/docs/current/using-explain.html' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and pss.stddev_exec_time > pss.mean_exec_time -order by - pss.stddev_exec_time desc -limit 10; -- MEDIUM: Lock-wait-heavy active queries 
with lw as ( select @@ -913,115 +787,6 @@ select 3 as severity_order from lw; --- MEDIUM: High calls low value queries -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Calls Low Value Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description, - 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) || - ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || - ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 5000 - and pss.mean_exec_time <= 2 - and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2 -order by - pss.calls desc -limit 10; --- MEDIUM: High rows per call queries -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Rows Per Call Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description, - 'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || - ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Add tighter predicates, pagination, and narrower SELECT 
lists to reduce unnecessary row transfer' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/queries-limit.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000 -order by - (pss.rows::numeric / NULLIF(pss.calls, 0)) desc -limit 10; --- MEDIUM: High shared block reads per call queries -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Shared Block Reads Per Call Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High shared block reads per call usually points to heavy table or index scans and poor locality' as issue_description, - 'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) || - ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000 -order by - (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc -limit 10; --- MEDIUM: Top queries by WAL bytes per call (PG16+) -insert - into - health_results -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Top Queries by WAL Bytes Per Call' as 
check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description, - 'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round( - ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)), - 2 - ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) || - ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/wal-intro.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0 - and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576 -order by - ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc -limit 10; -- MEDIUM: Idle in transaction over 5 minutes insert into diff --git a/testing/pgTAP/03_high_tests.sql b/testing/pgTAP/03_high_tests.sql index 129a465..5326b8d 100644 --- a/testing/pgTAP/03_high_tests.sql +++ b/testing/pgTAP/03_high_tests.sql @@ -1,5 +1,5 @@ BEGIN; -SELECT plan(40); +SELECT plan(24); SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Current Blocked/Blocking Queries'), @@ -56,48 +56,12 @@ SELECT ok( ); SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top 10 Queries by Total Execution Time'), - 'Function executes Top 10 Queries by Total Execution Time check' + (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'pg_stat_statements Extension Missing'), + 'Function executes 
pg_stat_statements Extension Missing check' ); SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top 10 Queries by Total Execution Time'), - 'View executes Top 10 Queries by Total Execution Time check' -); - -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Mean Execution Time Queries'), - 'Function executes High Mean Execution Time Queries check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Mean Execution Time Queries'), - 'View executes High Mean Execution Time Queries check' -); - -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top 10 Queries by Temp Block Spills'), - 'Function executes Top 10 Queries by Temp Block Spills check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top 10 Queries by Temp Block Spills'), - 'View executes Top 10 Queries by Temp Block Spills check' -); - -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Low Cache Hit Ratio Queries'), - 'Function executes Low Cache Hit Ratio Queries check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Low Cache Hit Ratio Queries'), - 'View executes Low Cache Hit Ratio Queries check' -); - -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Runtime Variance Queries'), - 'Function executes High Runtime Variance Queries check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Runtime Variance Queries'), - 'View executes High Runtime Variance Queries check' + (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'pg_stat_statements Extension Missing'), + 'View executes pg_stat_statements Extension Missing check' ); SELECT ok( @@ -109,42 +73,6 @@ SELECT ok( 'View executes Lock-Wait-Heavy Active Queries check' ); -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Calls Low Value Queries'), - 'Function executes High 
Calls Low Value Queries check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Calls Low Value Queries'), - 'View executes High Calls Low Value Queries check' -); - -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Rows Per Call Queries'), - 'Function executes High Rows Per Call Queries check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Rows Per Call Queries'), - 'View executes High Rows Per Call Queries check' -); - -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Shared Block Reads Per Call Queries'), - 'Function executes High Shared Block Reads Per Call Queries check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Shared Block Reads Per Call Queries'), - 'View executes High Shared Block Reads Per Call Queries check' -); - -SELECT ok( - (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top Queries by WAL Bytes Per Call'), - 'Function executes Top Queries by WAL Bytes Per Call check' -); -SELECT ok( - (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top Queries by WAL Bytes Per Call'), - 'View executes Top Queries by WAL Bytes Per Call check' -); - SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Idle In Transaction Over 5 Minutes'), 'Function executes Idle In Transaction Over 5 Minutes check' diff --git a/view_pgFirstAid.sql b/view_pgFirstAid.sql index 6d21a5f..06e6af1 100644 --- a/view_pgFirstAid.sql +++ b/view_pgFirstAid.sql @@ -614,6 +614,30 @@ where state = 'active' and now() - query_start > interval '5 minutes' union all +-- MEDIUM: pg_stat_statements extension missing +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'pg_stat_statements Extension Missing' as check_name, + 'pg_stat_statements' as object_name, + 'pg_stat_statements is not installed, so query fingerprint and workload-level performance checks are unavailable' as 
issue_description, + 'Extension not found in pg_extension' as current_value, + 'Self-hosted: add pg_stat_statements to shared_preload_libraries, restart PostgreSQL, then run CREATE EXTENSION pg_stat_statements; AWS RDS: add pg_stat_statements to the parameter group shared_preload_libraries, reboot, then CREATE EXTENSION; GCP Cloud SQL: enable cloudsql.enable_pg_stat_statements, restart if required, then CREATE EXTENSION; Azure Database for PostgreSQL: add pg_stat_statements to shared_preload_libraries, restart, then CREATE EXTENSION' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.PostgreSQL.CommonDBATasks.Extensions.html \ + https://cloud.google.com/sql/docs/postgres/flags \ + https://learn.microsoft.com/azure/postgresql/flexible-server/concepts-server-parameters' as documentation_link, + 3 as severity_order +where + not exists ( + select + 1 + from + pg_extension + where + extname = 'pg_stat_statements' + ) +union all -- MEDIUM: Top 10 expensive active queries by runtime (with eq as ( select @@ -654,148 +678,6 @@ select from eq) union all --- MEDIUM: Top 10 queries by total execution time -(with pss as ( -select - queryid, - query, - calls, - total_exec_time, - mean_exec_time, - rows -from - pg_stat_statements -where - calls > 0 -order by - total_exec_time desc -limit 10) -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Top 10 Queries by Total Execution Time' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description, - 'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || - ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as 
current_value, - 'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pss) -union all --- MEDIUM: High mean execution time queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Mean Execution Time Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description, - 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and pss.mean_exec_time > 100 -order by - pss.mean_exec_time desc -limit 10 -union all --- MEDIUM: Top 10 queries with temp block spills -(with pss as ( -select - queryid, - query, - calls, - temp_blks_read, - temp_blks_written, - total_exec_time -from - pg_stat_statements -where - (temp_blks_read + temp_blks_written) > 0 -order by - (temp_blks_read + temp_blks_written) desc -limit 10) -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Top 10 Queries by Temp Block Spills' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 
'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description, - 'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read || - ', temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' || - round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pss) -union all --- MEDIUM: Low cache hit ratio queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Low Cache Hit Ratio Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description, - 'calls: ' || pss.calls || ', cache_hit_pct: ' || round( - 100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0), - 2 - ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and (pss.shared_blks_hit + 
pss.shared_blks_read) > 0 - and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90 -order by - (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc -limit 10 -union all --- MEDIUM: High runtime variance queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Runtime Variance Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description, - 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) || - ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' || - left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/routine-vacuuming.html \ - https://www.postgresql.org/docs/current/using-explain.html' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and pss.stddev_exec_time > pss.mean_exec_time -order by - pss.stddev_exec_time desc -limit 10 -union all -- MEDIUM: Lock-wait-heavy active queries (with lw as ( select @@ -839,109 +721,8 @@ select from lw) union all --- MEDIUM: High calls low value queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Calls Low Value Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description, - 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) 
|| - ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || - ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 5000 - and pss.mean_exec_time <= 2 - and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2 -order by - pss.calls desc -limit 10 -union all --- MEDIUM: High rows per call queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Rows Per Call Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description, - 'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || - ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/queries-limit.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000 -order by - (pss.rows::numeric / NULLIF(pss.calls, 0)) desc -limit 10 -union all --- MEDIUM: High shared block reads per call 
queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Shared Block Reads Per Call Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High shared block reads per call usually points to heavy table or index scans and poor locality' as issue_description, - 'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) || - ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000 -order by - (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc -limit 10 -union all --- MEDIUM: Top queries by WAL bytes per call (PG16+) -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Top Queries by WAL Bytes Per Call' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description, - 'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round( - ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)), - 2 - ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) || - ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Reduce row churn, batch 
writes where possible, and review index maintenance cost for heavy write queries' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/wal-intro.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0 - and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576 -order by - ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc -limit 10 -union all -- MEDIUM: Idle in transaction over 5 minutes -select +(select 'MEDIUM' as severity, 'Query Health' as category, 'Idle In Transaction Over 5 Minutes' as check_name, @@ -966,7 +747,7 @@ where and now() - psa.state_change > interval '5 minutes' and psa.pid <> pg_backend_pid() order by - now() - psa.state_change desc + now() - psa.state_change desc) union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( diff --git a/view_pgFirstAid_managed.sql b/view_pgFirstAid_managed.sql index 77383dd..8aee3e4 100644 --- a/view_pgFirstAid_managed.sql +++ b/view_pgFirstAid_managed.sql @@ -608,6 +608,30 @@ where state = 'active' and now() - query_start > interval '5 minutes' union all +-- MEDIUM: pg_stat_statements extension missing +select + 'MEDIUM' as severity, + 'Query Health' as category, + 'pg_stat_statements Extension Missing' as check_name, + 'pg_stat_statements' as object_name, + 'pg_stat_statements is not installed, so query fingerprint and workload-level performance checks are unavailable' as issue_description, + 'Extension not found in pg_extension' as current_value, + 'Self-hosted: add pg_stat_statements to shared_preload_libraries, restart PostgreSQL, then run CREATE EXTENSION pg_stat_statements; AWS RDS: add pg_stat_statements to the parameter group shared_preload_libraries, reboot, then CREATE 
EXTENSION; GCP Cloud SQL: enable cloudsql.enable_pg_stat_statements, restart if required, then CREATE EXTENSION; Azure Database for PostgreSQL: add pg_stat_statements to shared_preload_libraries, restart, then CREATE EXTENSION' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.PostgreSQL.CommonDBATasks.Extensions.html \ + https://cloud.google.com/sql/docs/postgres/flags \ + https://learn.microsoft.com/azure/postgresql/flexible-server/concepts-server-parameters' as documentation_link, + 3 as severity_order +where + not exists ( + select + 1 + from + pg_extension + where + extname = 'pg_stat_statements' + ) +union all -- MEDIUM: Top 10 expensive active queries by runtime (with eq as ( select @@ -648,148 +672,6 @@ select from eq) union all --- MEDIUM: Top 10 queries by total execution time -(with pss as ( -select - queryid, - query, - calls, - total_exec_time, - mean_exec_time, - rows -from - pg_stat_statements -where - calls > 0 -order by - total_exec_time desc -limit 10) -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Top 10 Queries by Total Execution Time' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description, - 'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || - ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - 
https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pss) -union all --- MEDIUM: High mean execution time queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Mean Execution Time Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description, - 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and pss.mean_exec_time > 100 -order by - pss.mean_exec_time desc -limit 10 -union all --- MEDIUM: Top 10 queries with temp block spills -(with pss as ( -select - queryid, - query, - calls, - temp_blks_read, - temp_blks_written, - total_exec_time -from - pg_stat_statements -where - (temp_blks_read + temp_blks_written) > 0 -order by - (temp_blks_read + temp_blks_written) desc -limit 10) -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Top 10 Queries by Temp Block Spills' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description, - 'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read || - ', temp_blks_written: ' || pss.temp_blks_written || ', 
total_exec_time_ms: ' || - round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pss) -union all --- MEDIUM: Low cache hit ratio queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Low Cache Hit Ratio Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description, - 'calls: ' || pss.calls || ', cache_hit_pct: ' || round( - 100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0), - 2 - ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and (pss.shared_blks_hit + pss.shared_blks_read) > 0 - and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90 -order by - (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc -limit 10 -union all --- MEDIUM: High runtime variance 
queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Runtime Variance Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description, - 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) || - ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' || - left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/routine-vacuuming.html \ - https://www.postgresql.org/docs/current/using-explain.html' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and pss.stddev_exec_time > pss.mean_exec_time -order by - pss.stddev_exec_time desc -limit 10 -union all -- MEDIUM: Lock-wait-heavy active queries (with lw as ( select @@ -833,109 +715,8 @@ select from lw) union all --- MEDIUM: High calls low value queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Calls Low Value Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description, - 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) || - ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || - ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Batch repeated requests, 
cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 5000 - and pss.mean_exec_time <= 2 - and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2 -order by - pss.calls desc -limit 10 -union all --- MEDIUM: High rows per call queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Rows Per Call Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description, - 'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || - ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/queries-limit.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000 -order by - (pss.rows::numeric / NULLIF(pss.calls, 0)) desc -limit 10 -union all --- MEDIUM: High shared block reads per call queries -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'High Shared Block Reads Per Call Queries' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High shared block reads per call usually points to heavy table or index scans and poor 
locality' as issue_description, - 'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) || - ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/using-explain.html \ - https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000 -order by - (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc -limit 10 -union all --- MEDIUM: Top queries by WAL bytes per call (PG16+) -select - 'MEDIUM' as severity, - 'Query Health' as category, - 'Top Queries by WAL Bytes Per Call' as check_name, - 'queryid: ' || pss.queryid::text as object_name, - 'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description, - 'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round( - ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)), - 2 - ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) || - ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || - ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, - 'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action, - 'https://www.postgresql.org/docs/current/pgstatstatements.html \ - https://www.postgresql.org/docs/current/wal-intro.html \ - 
https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, - 3 as severity_order -from - pg_stat_statements pss -where - pss.calls >= 20 - and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0 - and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576 -order by - ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc -limit 10 -union all -- MEDIUM: Idle in transaction over 5 minutes -select +(select 'MEDIUM' as severity, 'Query Health' as category, 'Idle In Transaction Over 5 Minutes' as check_name, @@ -960,7 +741,7 @@ where and now() - psa.state_change > interval '5 minutes' and psa.pid <> pg_backend_pid() order by - now() - psa.state_change desc + now() - psa.state_change desc) union all -- LOW: Roles that have never logged in (with LOGIN rights) (WITH ur AS ( From 5b04be44e2cd1ba65b1cdb64e7c8adc1e096cd62 Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 22:10:26 -0700 Subject: [PATCH 10/14] revert changes, add pgstat check --- .github/workflows/aws-rds-opentofu.yml | 114 -------------------- .github/workflows/gcp-cloudsql-opentofu.yml | 111 ------------------- 2 files changed, 225 deletions(-) delete mode 100644 .github/workflows/aws-rds-opentofu.yml delete mode 100644 .github/workflows/gcp-cloudsql-opentofu.yml diff --git a/.github/workflows/aws-rds-opentofu.yml b/.github/workflows/aws-rds-opentofu.yml deleted file mode 100644 index f6b1f13..0000000 --- a/.github/workflows/aws-rds-opentofu.yml +++ /dev/null @@ -1,114 +0,0 @@ -name: AWS RDS OpenTofu - -on: - workflow_dispatch: - inputs: - action: - description: "OpenTofu action" - required: true - type: choice - default: plan - options: - - plan - - apply - - destroy - postgres_version: - description: "Target PostgreSQL version" - required: true - type: choice - default: pg18 - options: - - pg15 - - pg16 - - pg17 - - pg18 - allowed_cidr_block: - description: "CIDR allowed to connect (example: 203.0.113.10/32). 
Leave blank to use AWS_ALLOWED_CIDR_BLOCK secret." - required: false - type: string - aws_region: - description: "AWS region" - required: true - type: string - default: us-west-2 - -concurrency: - group: aws-rds-${{ inputs.postgres_version }} - cancel-in-progress: false - -jobs: - opentofu: - name: ${{ inputs.action }} ${{ inputs.postgres_version }} - runs-on: [self-hosted, linux, pgfirstaid-ci] - permissions: - contents: read - id-token: write - defaults: - run: - working-directory: testing/aws/deploy/${{ inputs.postgres_version }} - - env: - TF_IN_AUTOMATION: "true" - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - clean: false - - - name: Setup OpenTofu - uses: opentofu/setup-opentofu@v1 - - - name: Configure AWS credentials (OIDC role) - if: ${{ secrets.AWS_ROLE_TO_ASSUME != '' }} - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} - aws-region: ${{ inputs.aws_region }} - - - name: Configure AWS credentials (access keys) - if: ${{ secrets.AWS_ROLE_TO_ASSUME == '' }} - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-session-token: ${{ secrets.AWS_SESSION_TOKEN }} - aws-region: ${{ inputs.aws_region }} - - - name: Resolve allowed CIDR - shell: bash - run: | - CIDR="${{ inputs.allowed_cidr_block }}" - if [ -z "$CIDR" ]; then - CIDR="${{ secrets.AWS_ALLOWED_CIDR_BLOCK }}" - fi - - if [ -z "$CIDR" ]; then - echo "::error::No allowed CIDR provided. Set input 'allowed_cidr_block' or secret 'AWS_ALLOWED_CIDR_BLOCK'." 
- exit 1 - fi - - echo "TF_VAR_allowed_cidr_block=$CIDR" >> "$GITHUB_ENV" - - - name: OpenTofu init - run: tofu init -input=false - - - name: OpenTofu validate - run: tofu validate - - - name: OpenTofu plan - if: ${{ inputs.action == 'plan' || inputs.action == 'apply' }} - run: tofu plan -input=false -out=tfplan - - - name: OpenTofu apply - if: ${{ inputs.action == 'apply' }} - run: tofu apply -input=false -auto-approve tfplan - - - name: Show endpoint - if: ${{ inputs.action == 'apply' }} - run: | - echo "RDS endpoint: $(tofu output -raw endpoint)" - - - name: OpenTofu destroy - if: ${{ inputs.action == 'destroy' }} - run: tofu destroy -input=false -auto-approve diff --git a/.github/workflows/gcp-cloudsql-opentofu.yml b/.github/workflows/gcp-cloudsql-opentofu.yml deleted file mode 100644 index 72bec76..0000000 --- a/.github/workflows/gcp-cloudsql-opentofu.yml +++ /dev/null @@ -1,111 +0,0 @@ -name: GCP Cloud SQL OpenTofu - -on: - workflow_dispatch: - inputs: - action: - description: "OpenTofu action" - required: true - type: choice - default: plan - options: - - plan - - apply - - destroy - postgres_version: - description: "Target PostgreSQL version" - required: true - type: choice - default: pg18 - options: - - pg15 - - pg16 - - pg17 - - pg18 - personal_ip: - description: "IP or CIDR allowed to connect (example: 203.0.113.10/32). Leave blank to use GCP_PERSONAL_IP secret." 
- required: false - type: string - -concurrency: - group: gcp-cloudsql-${{ inputs.postgres_version }} - cancel-in-progress: false - -jobs: - opentofu: - name: ${{ inputs.action }} ${{ inputs.postgres_version }} - runs-on: [self-hosted, linux, pgfirstaid-ci] - permissions: - contents: read - id-token: write - defaults: - run: - working-directory: testing/gcp/deploy/${{ inputs.postgres_version }} - - env: - TF_IN_AUTOMATION: "true" - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - clean: false - - - name: Authenticate to Google Cloud (OIDC) - if: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER != '' && secrets.GCP_SERVICE_ACCOUNT != '' }} - uses: google-github-actions/auth@v2 - with: - workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} - service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} - - - name: Authenticate to Google Cloud (service account key) - if: ${{ !(secrets.GCP_WORKLOAD_IDENTITY_PROVIDER != '' && secrets.GCP_SERVICE_ACCOUNT != '') }} - uses: google-github-actions/auth@v2 - with: - credentials_json: ${{ secrets.GCP_CREDENTIALS_JSON }} - - - name: Setup gcloud CLI - uses: google-github-actions/setup-gcloud@v2 - - - name: Setup OpenTofu - uses: opentofu/setup-opentofu@v1 - - - name: Resolve personal IP - shell: bash - run: | - PERSONAL_IP="${{ inputs.personal_ip }}" - if [ -z "$PERSONAL_IP" ]; then - PERSONAL_IP="${{ secrets.GCP_PERSONAL_IP }}" - fi - - if [ -z "$PERSONAL_IP" ]; then - echo "::error::No personal IP provided. Set input 'personal_ip' or secret 'GCP_PERSONAL_IP'." 
- exit 1 - fi - - echo "TF_VAR_personal_ip=$PERSONAL_IP" >> "$GITHUB_ENV" - - - name: OpenTofu init - run: tofu init -input=false - - - name: OpenTofu validate - run: tofu validate - - - name: OpenTofu plan - if: ${{ inputs.action == 'plan' || inputs.action == 'apply' }} - run: tofu plan -input=false -out=tfplan - - - name: OpenTofu apply - if: ${{ inputs.action == 'apply' }} - run: tofu apply -input=false -auto-approve tfplan - - - name: Show connection details - if: ${{ inputs.action == 'apply' }} - run: | - echo "Instance: $(tofu output -raw instance_name)" - echo "Connection: $(tofu output -raw instance_connection_name)" - echo "Public IP: $(tofu output -raw public_ip_address)" - - - name: OpenTofu destroy - if: ${{ inputs.action == 'destroy' }} - run: tofu destroy -input=false -auto-approve From 9feda3ec930449537bff2d0984519b9233266dbe Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 22:22:31 -0700 Subject: [PATCH 11/14] version 2.0.0 pre SCALE 23x --- pgFirstAid.sql | 277 ++++++++++++++++++++++++++++++++ testing/pgTAP/03_high_tests.sql | 83 +++++++++- view_pgFirstAid.sql | 275 +++++++++++++++++++++++++++++++ view_pgFirstAid_managed.sql | 275 +++++++++++++++++++++++++++++++ 4 files changed, 909 insertions(+), 1 deletion(-) diff --git a/pgFirstAid.sql b/pgFirstAid.sql index 8e9b209..833a72e 100644 --- a/pgFirstAid.sql +++ b/pgFirstAid.sql @@ -1,3 +1,272 @@ +create or replace +function pgfirstaid_pg_stat_statements_checks() +returns table ( + severity TEXT, + category TEXT, + check_name TEXT, + object_name TEXT, + issue_description TEXT, + current_value TEXT, + recommended_action TEXT, + documentation_link TEXT, + severity_order INTEGER +) as $$ +begin + if not exists ( + select + 1 + from + pg_extension + where + extname = 'pg_stat_statements') then + return; + end if; + + return query +with pss as ( + select + queryid, + query, + calls, + total_exec_time, + mean_exec_time, + rows + from + pg_stat_statements + where + calls > 0 + order by + 
total_exec_time desc + limit 10) + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Total Execution Time' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description, + 'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pss; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Mean Execution Time Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements 
pss + where + pss.calls >= 20 + and pss.mean_exec_time > 100 + order by + pss.mean_exec_time desc + limit 10; + + return query +with pss as ( + select + queryid, + query, + calls, + temp_blks_read, + temp_blks_written, + total_exec_time + from + pg_stat_statements + where + (temp_blks_read + temp_blks_written) > 0 + order by + (temp_blks_read + temp_blks_written) desc + limit 10) + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Temp Block Spills' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description, + 'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read || + ', temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' || + round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pss; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Low Cache Hit Ratio Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description, + 'calls: ' || pss.calls || ', cache_hit_pct: ' || round( + 100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0), + 2 + ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit || + ', query: ' || 
left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.shared_blks_hit + pss.shared_blks_read) > 0 + and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90 + order by + (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Runtime Variance Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' || + left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/routine-vacuuming.html \ + https://www.postgresql.org/docs/current/using-explain.html' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and pss.stddev_exec_time > pss.mean_exec_time + order by + pss.stddev_exec_time desc + limit 10; + + return query + select + 'MEDIUM' as 
severity, + 'Query Health' as category, + 'High Calls Low Value Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 5000 + and pss.mean_exec_time <= 2 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2 + order by + pss.calls desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Rows Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description, + 'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/queries-limit.html \ + 
https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000 + order by + (pss.rows::numeric / NULLIF(pss.calls, 0)) desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Shared Block Reads Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High shared block reads per call usually points to heavy table or index scans and poor locality' as issue_description, + 'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) || + ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000 + order by + (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top Queries by WAL Bytes Per Call' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description, + 'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round( + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)), + 
2 + ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/wal-intro.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0 + and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576 + order by + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc + limit 10; +end; +$$ language plpgsql; + create or replace function pg_firstAid() returns table ( @@ -700,6 +969,14 @@ where where extname = 'pg_stat_statements' ); +-- MEDIUM: pg_stat_statements dependent checks +insert + into + health_results +select + * +from + pgfirstaid_pg_stat_statements_checks(); -- MEDIUM: Top 10 expensive active queries by runtime with eq as ( select diff --git a/testing/pgTAP/03_high_tests.sql b/testing/pgTAP/03_high_tests.sql index 5326b8d..5664c42 100644 --- a/testing/pgTAP/03_high_tests.sql +++ b/testing/pgTAP/03_high_tests.sql @@ -1,5 +1,5 @@ BEGIN; -SELECT plan(24); +SELECT plan(42); SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Current Blocked/Blocking Queries'), @@ -64,6 +64,87 @@ SELECT ok( 'View executes pg_stat_statements Extension Missing check' ); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top 10 Queries by Total Execution Time') ELSE true END), + 'Function executes Top 10 
Queries by Total Execution Time check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top 10 Queries by Total Execution Time') ELSE true END), + 'View executes Top 10 Queries by Total Execution Time check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Mean Execution Time Queries') ELSE true END), + 'Function executes High Mean Execution Time Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Mean Execution Time Queries') ELSE true END), + 'View executes High Mean Execution Time Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top 10 Queries by Temp Block Spills') ELSE true END), + 'Function executes Top 10 Queries by Temp Block Spills check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top 10 Queries by Temp Block Spills') ELSE true END), + 'View executes Top 10 Queries by Temp Block Spills check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Low Cache Hit Ratio Queries') ELSE true END), + 'Function executes Low Cache Hit Ratio Queries check when 
pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Low Cache Hit Ratio Queries') ELSE true END), + 'View executes Low Cache Hit Ratio Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Runtime Variance Queries') ELSE true END), + 'Function executes High Runtime Variance Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Runtime Variance Queries') ELSE true END), + 'View executes High Runtime Variance Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Calls Low Value Queries') ELSE true END), + 'Function executes High Calls Low Value Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Calls Low Value Queries') ELSE true END), + 'View executes High Calls Low Value Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Rows Per Call Queries') ELSE true END), + 'Function executes High Rows Per Call Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE 
extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Rows Per Call Queries') ELSE true END), + 'View executes High Rows Per Call Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'High Shared Block Reads Per Call Queries') ELSE true END), + 'Function executes High Shared Block Reads Per Call Queries check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'High Shared Block Reads Per Call Queries') ELSE true END), + 'View executes High Shared Block Reads Per Call Queries check when pg_stat_statements is installed' +); + +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Top Queries by WAL Bytes Per Call') ELSE true END), + 'Function executes Top Queries by WAL Bytes Per Call check when pg_stat_statements is installed' +); +SELECT ok( + (SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements') THEN (SELECT count(*) >= 0 FROM v_pgfirstaid WHERE check_name = 'Top Queries by WAL Bytes Per Call') ELSE true END), + 'View executes Top Queries by WAL Bytes Per Call check when pg_stat_statements is installed' +); + SELECT ok( (SELECT count(*) >= 0 FROM pg_firstAid() WHERE check_name = 'Lock-Wait-Heavy Active Queries'), 'Function executes Lock-Wait-Heavy Active Queries check' diff --git a/view_pgFirstAid.sql b/view_pgFirstAid.sql index 06e6af1..c102845 100644 --- a/view_pgFirstAid.sql +++ b/view_pgFirstAid.sql @@ -1,3 +1,272 @@ +create or replace +function pgfirstaid_pg_stat_statements_checks() +returns table ( + severity TEXT, + category TEXT, 
+ check_name TEXT, + object_name TEXT, + issue_description TEXT, + current_value TEXT, + recommended_action TEXT, + documentation_link TEXT, + severity_order INTEGER +) as $$ +begin + if not exists ( + select + 1 + from + pg_extension + where + extname = 'pg_stat_statements') then + return; + end if; + + return query +with pss as ( + select + queryid, + query, + calls, + total_exec_time, + mean_exec_time, + rows + from + pg_stat_statements + where + calls > 0 + order by + total_exec_time desc + limit 10) + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Total Execution Time' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description, + 'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pss; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Mean Execution Time Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', query: 
' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and pss.mean_exec_time > 100 + order by + pss.mean_exec_time desc + limit 10; + + return query +with pss as ( + select + queryid, + query, + calls, + temp_blks_read, + temp_blks_written, + total_exec_time + from + pg_stat_statements + where + (temp_blks_read + temp_blks_written) > 0 + order by + (temp_blks_read + temp_blks_written) desc + limit 10) + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Temp Block Spills' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description, + 'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read || + ', temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' || + round(pss.total_exec_time::numeric, 2) || ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pss; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Low Cache Hit Ratio Queries' as check_name, + 
'queryid: ' || pss.queryid::text as object_name, + 'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description, + 'calls: ' || pss.calls || ', cache_hit_pct: ' || round( + 100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0), + 2 + ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.shared_blks_hit + pss.shared_blks_read) > 0 + and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90 + order by + (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Runtime Variance Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' || + left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as 
recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/routine-vacuuming.html \ + https://www.postgresql.org/docs/current/using-explain.html' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and pss.stddev_exec_time > pss.mean_exec_time + order by + pss.stddev_exec_time desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Calls Low Value Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 5000 + and pss.mean_exec_time <= 2 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2 + order by + pss.calls desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Rows Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description, + 'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', total_rows: ' 
|| pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/queries-limit.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000 + order by + (pss.rows::numeric / NULLIF(pss.calls, 0)) desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Shared Block Reads Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High shared block reads per call usually points to heavy table or index scans and poor locality' as issue_description, + 'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) || + ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000 + order by + (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc + limit 10; + + return 
query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top Queries by WAL Bytes Per Call' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description, + 'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round( + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)), + 2 + ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/wal-intro.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0 + and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576 + order by + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc + limit 10; +end; +$$ language plpgsql; + -- Adding dropping of the view instead of replace because of conversion issues with new health checks. -- This way we start with a fresh view. 
drop view if exists v_pgfirstAid; @@ -638,6 +907,12 @@ where extname = 'pg_stat_statements' ) union all +-- MEDIUM: pg_stat_statements dependent checks +(select + * +from + pgfirstaid_pg_stat_statements_checks()) +union all -- MEDIUM: Top 10 expensive active queries by runtime (with eq as ( select diff --git a/view_pgFirstAid_managed.sql b/view_pgFirstAid_managed.sql index 8aee3e4..7807d7f 100644 --- a/view_pgFirstAid_managed.sql +++ b/view_pgFirstAid_managed.sql @@ -1,3 +1,272 @@ +create or replace +function pgfirstaid_pg_stat_statements_checks() +returns table ( + severity TEXT, + category TEXT, + check_name TEXT, + object_name TEXT, + issue_description TEXT, + current_value TEXT, + recommended_action TEXT, + documentation_link TEXT, + severity_order INTEGER +) as $$ +begin + if not exists ( + select + 1 + from + pg_extension + where + extname = 'pg_stat_statements') then + return; + end if; + + return query +with pss as ( + select + queryid, + query, + calls, + total_exec_time, + mean_exec_time, + rows + from + pg_stat_statements + where + calls > 0 + order by + total_exec_time desc + limit 10) + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Total Execution Time' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with the highest total execution time are usually the best optimization targets for overall workload improvement' as issue_description, + 'calls: ' || pss.calls || ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || ', rows: ' || pss.rows || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Run EXPLAIN (ANALYZE, BUFFERS) and focus on reducing total runtime for these fingerprints first' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + 
https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pss; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Mean Execution Time Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Queries with high average runtime and enough call volume are underperforming and likely user-visible' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add or improve indexes and rewrite query predicates to reduce per-execution latency' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and pss.mean_exec_time > 100 + order by + pss.mean_exec_time desc + limit 10; + + return query +with pss as ( + select + queryid, + query, + calls, + temp_blks_read, + temp_blks_written, + total_exec_time + from + pg_stat_statements + where + (temp_blks_read + temp_blks_written) > 0 + order by + (temp_blks_read + temp_blks_written) desc + limit 10) + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top 10 Queries by Temp Block Spills' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Frequent temp block usage points to sort or hash operations spilling to disk and causing avoidable latency' as issue_description, + 'calls: ' || pss.calls || ', temp_blks_read: ' || pss.temp_blks_read || + ', temp_blks_written: ' || pss.temp_blks_written || ', total_exec_time_ms: ' || + round(pss.total_exec_time::numeric, 2) || ', query: ' 
|| left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row width, improve index support for sort or group patterns, and tune work_mem cautiously' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/runtime-config-resource.html#GUC-WORK-MEM \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pss; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Low Cache Hit Ratio Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Low buffer cache hit ratio indicates heavy physical reads and likely missing indexes or poor filtering' as issue_description, + 'calls: ' || pss.calls || ', cache_hit_pct: ' || round( + 100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0), + 2 + ) || ', shared_blks_read: ' || pss.shared_blks_read || ', shared_blks_hit: ' || pss.shared_blks_hit || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Prioritize index tuning and query filtering to reduce disk reads for these statements' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.shared_blks_hit + pss.shared_blks_read) > 0 + and (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) < 90 + order by + (100.0 * pss.shared_blks_hit / NULLIF(pss.shared_blks_hit + pss.shared_blks_read, 0)) asc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Runtime Variance Queries' as check_name, + 'queryid: ' || 
pss.queryid::text as object_name, + 'High runtime variance can indicate plan instability, skewed data distribution, or parameter sensitivity' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', stddev_exec_time_ms: ' || round(pss.stddev_exec_time::numeric, 2) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || ', query: ' || + left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Check plan stability with EXPLAIN (ANALYZE, BUFFERS), update statistics, and review parameterized execution paths' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/routine-vacuuming.html \ + https://www.postgresql.org/docs/current/using-explain.html' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and pss.stddev_exec_time > pss.mean_exec_time + order by + pss.stddev_exec_time desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Calls Low Value Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'Very high call volume with low per-call value can create avoidable overhead and crowd out expensive work' as issue_description, + 'calls: ' || pss.calls || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 3) || + ', total_exec_time_ms: ' || round(pss.total_exec_time::numeric, 2) || + ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Batch repeated requests, cache stable lookups, and reduce N+1 query patterns in the application layer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 
as severity_order + from + pg_stat_statements pss + where + pss.calls >= 5000 + and pss.mean_exec_time <= 2 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) <= 2 + order by + pss.calls desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Rows Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High rows returned per execution often indicates over-fetching or missing selective filters' as issue_description, + 'calls: ' || pss.calls || ', rows_per_call: ' || round((pss.rows::numeric / NULLIF(pss.calls, 0)), 2) || + ', total_rows: ' || pss.rows || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Add tighter predicates, pagination, and narrower SELECT lists to reduce unnecessary row transfer' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/queries-limit.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.rows::numeric / NULLIF(pss.calls, 0)) > 10000 + order by + (pss.rows::numeric / NULLIF(pss.calls, 0)) desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'High Shared Block Reads Per Call Queries' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High shared block reads per call usually points to heavy table or index scans and poor locality' as issue_description, + 'calls: ' || pss.calls || ', shared_blks_read_per_call: ' || round((pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)), 2) || + ', shared_blks_read: ' || pss.shared_blks_read || ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 
'g'), 350) as current_value, + 'Use EXPLAIN (ANALYZE, BUFFERS) to add selective indexes and reduce pages read per execution' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/using-explain.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) > 1000 + order by + (pss.shared_blks_read::numeric / NULLIF(pss.calls, 0)) desc + limit 10; + + return query + select + 'MEDIUM' as severity, + 'Query Health' as category, + 'Top Queries by WAL Bytes Per Call' as check_name, + 'queryid: ' || pss.queryid::text as object_name, + 'High WAL generation per execution can indicate heavy write amplification and expensive update patterns' as issue_description, + 'calls: ' || pss.calls || ', wal_bytes_per_call: ' || round( + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)), + 2 + ) || ', wal_bytes_total: ' || round((to_jsonb(pss)->>'wal_bytes')::numeric, 2) || + ', mean_exec_time_ms: ' || round(pss.mean_exec_time::numeric, 2) || + ', query: ' || left(regexp_replace(pss.query, E'[\n\r\t]+', ' ', 'g'), 350) as current_value, + 'Reduce row churn, batch writes where possible, and review index maintenance cost for heavy write queries' as recommended_action, + 'https://www.postgresql.org/docs/current/pgstatstatements.html \ + https://www.postgresql.org/docs/current/wal-intro.html \ + https://www.tigerdata.com/blog/using-pg-stat-statements-to-optimize-queries' as documentation_link, + 3 as severity_order + from + pg_stat_statements pss + where + pss.calls >= 20 + and coalesce((to_jsonb(pss)->>'wal_bytes')::numeric, 0) > 0 + and ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) > 1048576 + order by + ((to_jsonb(pss)->>'wal_bytes')::numeric / NULLIF(pss.calls, 0)) desc + limit 10; +end; +$$ 
language plpgsql; + -- Adding dropping of the view instead of replace because of conversion issues with new health checks. -- This way we start with a fresh view. drop view if exists v_pgfirstAid; @@ -632,6 +901,12 @@ where extname = 'pg_stat_statements' ) union all +-- MEDIUM: pg_stat_statements dependent checks +(select + * +from + pgfirstaid_pg_stat_statements_checks()) +union all -- MEDIUM: Top 10 expensive active queries by runtime (with eq as ( select From f3c6dc87be95739536da04ab6c6ce2e2c4cf33af Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 25 Feb 2026 22:32:05 -0700 Subject: [PATCH 12/14] Broke the workflow! --- .github/workflows/integration-pg-matrix.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-pg-matrix.yml b/.github/workflows/integration-pg-matrix.yml index 0238b78..23c7e2d 100644 --- a/.github/workflows/integration-pg-matrix.yml +++ b/.github/workflows/integration-pg-matrix.yml @@ -2,7 +2,7 @@ name: Python + pgTAP Integration (PG15-PG18) on: pull_request: - types: [opened] + types: [opened, synchronize] paths: - pgFirstAid.sql - view_pgFirstAid.sql From 0b736547e862cdac6b4c39adf4c02d5090ddf7cc Mon Sep 17 00:00:00 2001 From: justin Date: Sat, 28 Feb 2026 13:59:13 -0700 Subject: [PATCH 13/14] init: workflows for testing big 3 --- .github/workflows/deploy-aws-rds.yml | 215 +++++++++++++++ .github/workflows/deploy-gcp-postgres.yml | 249 ++++++++++++++++++ testing/aws/deploy/pg15/locals.tf | 2 +- testing/aws/deploy/pg15/main.tf | 1 - testing/aws/deploy/pg15/variables.tf | 5 +- testing/aws/deploy/pg16/main.tf | 1 - testing/aws/deploy/pg16/variables.tf | 4 - testing/aws/deploy/pg17/main.tf | 1 - testing/aws/deploy/pg17/variables.tf | 4 - testing/aws/deploy/pg18/main.tf | 1 - testing/aws/deploy/pg18/variables.tf | 4 - .../aws/opentofu/modules/nonaurora/main.tf | 31 +-- .../opentofu/modules/nonaurora/variables.tf | 6 - testing/gcp/deploy/pg15/main.tf | 2 +- testing/gcp/deploy/pg15/vars.tf | 9 +- 
testing/gcp/deploy/pg16/main.tf | 2 +- testing/gcp/deploy/pg16/vars.tf | 9 +- testing/gcp/deploy/pg17/main.tf | 2 +- testing/gcp/deploy/pg17/vars.tf | 9 +- testing/gcp/deploy/pg18/main.tf | 2 +- testing/gcp/deploy/pg18/vars.tf | 9 +- testing/gcp/opentofu/models/postgres/main.tf | 19 +- .../gcp/opentofu/models/postgres/variables.tf | 9 + 23 files changed, 520 insertions(+), 76 deletions(-) create mode 100644 .github/workflows/deploy-aws-rds.yml create mode 100644 .github/workflows/deploy-gcp-postgres.yml diff --git a/.github/workflows/deploy-aws-rds.yml b/.github/workflows/deploy-aws-rds.yml new file mode 100644 index 0000000..3ffceff --- /dev/null +++ b/.github/workflows/deploy-aws-rds.yml @@ -0,0 +1,215 @@ +name: Deploy AWS RDS PostgreSQL + +on: + workflow_dispatch: + inputs: + target: + description: "Deploy target under testing/aws/deploy" + required: true + type: choice + options: + - all + - pg15 + - pg16 + - pg17 + - pg18 + default: all + command: + description: "OpenTofu command" + required: true + type: choice + options: + - plan + - apply + - destroy + default: plan + issue_comment: + types: [created] + +permissions: + contents: read + pull-requests: read + +jobs: + resolve: + if: | + ( + github.event_name == 'workflow_dispatch' && + github.actor == vars.DEPLOY_TRIGGER_USER + ) || + ( + github.event_name == 'issue_comment' && + github.event.issue.pull_request != null && + startsWith(github.event.comment.body, '/deploy-aws-rds') && + github.event.comment.user.login == vars.DEPLOY_TRIGGER_USER && + ( + github.event.comment.author_association == 'OWNER' || + github.event.comment.author_association == 'MEMBER' || + github.event.comment.author_association == 'COLLABORATOR' + ) + ) + name: Resolve deploy arguments + runs-on: ubuntu-latest + + outputs: + command: ${{ steps.args.outputs.command }} + versions_json: ${{ steps.args.outputs.versions_json }} + checkout_ref: ${{ steps.pr.outputs.head_sha || github.sha }} + + steps: + - name: Validate trigger user is 
configured + run: | + if [ -z "${{ vars.DEPLOY_TRIGGER_USER }}" ]; then + echo "::error::Repository variable DEPLOY_TRIGGER_USER is not set." + exit 1 + fi + + - name: Resolve deploy arguments + id: args + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + target="${{ inputs.target }}" + command="${{ inputs.command }}" + else + comment="${{ github.event.comment.body }}" + read -r trigger arg1 arg2 _ <<<"${comment}" + + if [ "${trigger}" != "/deploy-aws-rds" ]; then + echo "::error::Comment must start with '/deploy-aws-rds'" + exit 1 + fi + + if [ -z "${arg1}" ]; then + target="all" + command="plan" + elif [ "${arg1}" = "plan" ] || [ "${arg1}" = "apply" ] || [ "${arg1}" = "destroy" ]; then + target="all" + command="${arg1}" + else + target="${arg1}" + if [ -z "${arg2}" ]; then + command="plan" + else + command="${arg2}" + fi + fi + fi + + case "${target}" in + all|pg15|pg16|pg17|pg18) ;; + *) + echo "::error::Invalid target: ${target}" + exit 1 + ;; + esac + + case "${command}" in + plan|apply|destroy) ;; + *) + echo "::error::Invalid command: ${command}" + exit 1 + ;; + esac + + if [ "${target}" = "all" ]; then + versions_json='["pg15","pg16","pg17","pg18"]' + else + versions_json="[\"${target}\"]" + fi + + echo "target=${target}" >> "$GITHUB_OUTPUT" + echo "command=${command}" >> "$GITHUB_OUTPUT" + echo "versions_json=${versions_json}" >> "$GITHUB_OUTPUT" + + - name: Resolve pull request head + id: pr + if: github.event_name == 'issue_comment' + uses: actions/github-script@v7 + with: + script: | + const pull_number = context.payload.issue.number; + const { data: pr } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number, + }); + + core.setOutput('head_sha', pr.head.sha); + core.setOutput('is_fork', String(pr.head.repo.fork)); + + - name: Block fork pull requests + if: github.event_name == 'issue_comment' && steps.pr.outputs.is_fork == 'true' + run: | + echo "::error::Comment-triggered deploy is 
blocked for fork PRs." + exit 1 + + - name: Show resolved inputs + run: | + echo "Target: ${{ steps.args.outputs.target }}" + echo "Versions: ${{ steps.args.outputs.versions_json }}" + echo "Command: ${{ steps.args.outputs.command }}" + + deploy: + name: ${{ needs.resolve.outputs.command }} ${{ matrix.postgres_version }} + runs-on: ubuntu-latest + needs: resolve + strategy: + fail-fast: false + matrix: + postgres_version: ${{ fromJSON(needs.resolve.outputs.versions_json) }} + + env: + AWS_REGION: us-west-2 + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ needs.resolve.outputs.checkout_ref }} + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ env.AWS_REGION }} + + - name: Setup OpenTofu + uses: opentofu/setup-opentofu@v1 + with: + tofu_version: latest + + - name: OpenTofu init + working-directory: testing/aws/deploy/${{ matrix.postgres_version }} + run: tofu init -input=false + + - name: OpenTofu validate + working-directory: testing/aws/deploy/${{ matrix.postgres_version }} + run: tofu validate + + - name: OpenTofu plan + if: needs.resolve.outputs.command == 'plan' || needs.resolve.outputs.command == 'apply' + working-directory: testing/aws/deploy/${{ matrix.postgres_version }} + run: tofu plan -input=false + + - name: OpenTofu apply + if: needs.resolve.outputs.command == 'apply' + working-directory: testing/aws/deploy/${{ matrix.postgres_version }} + run: tofu apply -auto-approve -input=false + + - name: OpenTofu destroy + if: needs.resolve.outputs.command == 'destroy' + working-directory: testing/aws/deploy/${{ matrix.postgres_version }} + run: tofu destroy -auto-approve -input=false + + - name: Show endpoint output + if: needs.resolve.outputs.command == 'apply' + working-directory: testing/aws/deploy/${{ matrix.postgres_version }} + run: | + endpoint="$(tofu 
output -raw endpoint)" + { + echo "### Deployment Output" + echo + echo "- Version: ${{ matrix.postgres_version }}" + echo "- Endpoint: ${endpoint}" + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/deploy-gcp-postgres.yml b/.github/workflows/deploy-gcp-postgres.yml new file mode 100644 index 0000000..439895a --- /dev/null +++ b/.github/workflows/deploy-gcp-postgres.yml @@ -0,0 +1,249 @@ +name: Deploy GCP Cloud SQL PostgreSQL + +on: + workflow_dispatch: + inputs: + target: + description: "Deploy target under testing/gcp/deploy" + required: true + type: choice + options: + - all + - pg15 + - pg16 + - pg17 + - pg18 + default: all + command: + description: "OpenTofu command" + required: true + type: choice + options: + - plan + - apply + - destroy + default: plan + personal_ip_cidr: + description: "Your CIDR for Cloud SQL access (optional). Leave blank to use secret DEPLOY_PERSONAL_IP_CIDR" + required: false + type: string + default: "" + issue_comment: + types: [created] + +permissions: + contents: read + pull-requests: read + +jobs: + resolve: + if: | + ( + github.event_name == 'workflow_dispatch' && + github.actor == vars.DEPLOY_TRIGGER_USER + ) || + ( + github.event_name == 'issue_comment' && + github.event.issue.pull_request != null && + startsWith(github.event.comment.body, '/deploy-gcp-pg') && + github.event.comment.user.login == vars.DEPLOY_TRIGGER_USER && + ( + github.event.comment.author_association == 'OWNER' || + github.event.comment.author_association == 'MEMBER' || + github.event.comment.author_association == 'COLLABORATOR' + ) + ) + name: Resolve deploy arguments + runs-on: ubuntu-latest + + outputs: + command: ${{ steps.args.outputs.command }} + versions_json: ${{ steps.args.outputs.versions_json }} + runner_ip_cidr: ${{ steps.args.outputs.runner_ip_cidr }} + personal_ip_cidr: ${{ steps.args.outputs.personal_ip_cidr }} + checkout_ref: ${{ steps.pr.outputs.head_sha || github.sha }} + + steps: + - name: Validate trigger user is configured + 
run: | + if [ -z "${{ vars.DEPLOY_TRIGGER_USER }}" ]; then + echo "::error::Repository variable DEPLOY_TRIGGER_USER is not set." + exit 1 + fi + + - name: Resolve deploy arguments + id: args + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + target="${{ inputs.target }}" + command="${{ inputs.command }}" + personal_ip_input="${{ inputs.personal_ip_cidr }}" + else + comment="${{ github.event.comment.body }}" + read -r trigger arg1 arg2 _ <<<"${comment}" + + if [ "${trigger}" != "/deploy-gcp-pg" ]; then + echo "::error::Comment must start with '/deploy-gcp-pg'" + exit 1 + fi + + if [ -z "${arg1}" ]; then + target="all" + command="plan" + elif [ "${arg1}" = "plan" ] || [ "${arg1}" = "apply" ] || [ "${arg1}" = "destroy" ]; then + target="all" + command="${arg1}" + else + target="${arg1}" + if [ -z "${arg2}" ]; then + command="plan" + else + command="${arg2}" + fi + fi + + personal_ip_input="" + fi + + case "${target}" in + all|pg15|pg16|pg17|pg18) ;; + *) + echo "::error::Invalid target: ${target}" + exit 1 + ;; + esac + + case "${command}" in + plan|apply|destroy) ;; + *) + echo "::error::Invalid command: ${command}" + exit 1 + ;; + esac + + if [ "${target}" = "all" ]; then + versions_json='["pg15","pg16","pg17","pg18"]' + else + versions_json="[\"${target}\"]" + fi + + runner_ip="$(curl -fsSL https://checkip.amazonaws.com | tr -d '\n')" + runner_ip_cidr="${runner_ip}/32" + + if [ -n "${personal_ip_input}" ]; then + personal_ip_cidr="${personal_ip_input}" + elif [ -n "${{ secrets.DEPLOY_PERSONAL_IP_CIDR }}" ]; then + personal_ip_cidr="${{ secrets.DEPLOY_PERSONAL_IP_CIDR }}" + else + echo "::error::Set input personal_ip_cidr or repository secret DEPLOY_PERSONAL_IP_CIDR" + exit 1 + fi + + echo "target=${target}" >> "$GITHUB_OUTPUT" + echo "command=${command}" >> "$GITHUB_OUTPUT" + echo "versions_json=${versions_json}" >> "$GITHUB_OUTPUT" + echo "runner_ip_cidr=${runner_ip_cidr}" >> "$GITHUB_OUTPUT" + echo "personal_ip_cidr=${personal_ip_cidr}" >> 
"$GITHUB_OUTPUT" + + - name: Resolve pull request head + id: pr + if: github.event_name == 'issue_comment' + uses: actions/github-script@v7 + with: + script: | + const pull_number = context.payload.issue.number; + const { data: pr } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number, + }); + + core.setOutput('head_sha', pr.head.sha); + core.setOutput('is_fork', String(pr.head.repo.fork)); + + - name: Block fork pull requests + if: github.event_name == 'issue_comment' && steps.pr.outputs.is_fork == 'true' + run: | + echo "::error::Comment-triggered deploy is blocked for fork PRs." + exit 1 + + - name: Show resolved inputs + run: | + echo "Target: ${{ steps.args.outputs.target }}" + echo "Versions: ${{ steps.args.outputs.versions_json }}" + echo "Command: ${{ steps.args.outputs.command }}" + echo "Runner IP CIDR: ${{ steps.args.outputs.runner_ip_cidr }}" + + deploy: + name: ${{ needs.resolve.outputs.command }} ${{ matrix.postgres_version }} + runs-on: ubuntu-latest + needs: resolve + strategy: + fail-fast: false + matrix: + postgres_version: ${{ fromJSON(needs.resolve.outputs.versions_json) }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ needs.resolve.outputs.checkout_ref }} + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + create_credentials_file: true + export_environment_variables: true + + - name: Setup OpenTofu + uses: opentofu/setup-opentofu@v1 + with: + tofu_version: latest + + - name: OpenTofu init + working-directory: testing/gcp/deploy/${{ matrix.postgres_version }} + run: tofu init -input=false + + - name: Write CI tfvars + working-directory: testing/gcp/deploy/${{ matrix.postgres_version }} + run: | + cat > ci.auto.tfvars <> "$GITHUB_STEP_SUMMARY" diff --git a/testing/aws/deploy/pg15/locals.tf b/testing/aws/deploy/pg15/locals.tf index 891a331..eb96df2 100644 --- 
a/testing/aws/deploy/pg15/locals.tf +++ b/testing/aws/deploy/pg15/locals.tf @@ -2,7 +2,7 @@ locals { service = "pg15" database_name = "pgFirstAid" engine = "postgres" - engine_version = "" + engine_version = "15.17" engine_family = "postgres15" db_parameter_group = [ { diff --git a/testing/aws/deploy/pg15/main.tf b/testing/aws/deploy/pg15/main.tf index 6c9a1d7..001a6a3 100644 --- a/testing/aws/deploy/pg15/main.tf +++ b/testing/aws/deploy/pg15/main.tf @@ -10,7 +10,6 @@ module "nonaurora" { engine_version = local.engine_version family = local.engine_family db_parameter_group = local.db_parameter_group - allowed_cidr_block = var.allowed_cidr_block } output "endpoint" { diff --git a/testing/aws/deploy/pg15/variables.tf b/testing/aws/deploy/pg15/variables.tf index 478bbb6..8b13789 100644 --- a/testing/aws/deploy/pg15/variables.tf +++ b/testing/aws/deploy/pg15/variables.tf @@ -1,4 +1 @@ -variable "allowed_cidr_block" { - description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" - type = string -} + diff --git a/testing/aws/deploy/pg16/main.tf b/testing/aws/deploy/pg16/main.tf index 6c9a1d7..001a6a3 100644 --- a/testing/aws/deploy/pg16/main.tf +++ b/testing/aws/deploy/pg16/main.tf @@ -10,7 +10,6 @@ module "nonaurora" { engine_version = local.engine_version family = local.engine_family db_parameter_group = local.db_parameter_group - allowed_cidr_block = var.allowed_cidr_block } output "endpoint" { diff --git a/testing/aws/deploy/pg16/variables.tf b/testing/aws/deploy/pg16/variables.tf index 478bbb6..e69de29 100644 --- a/testing/aws/deploy/pg16/variables.tf +++ b/testing/aws/deploy/pg16/variables.tf @@ -1,4 +0,0 @@ -variable "allowed_cidr_block" { - description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" - type = string -} diff --git a/testing/aws/deploy/pg17/main.tf b/testing/aws/deploy/pg17/main.tf index 6c9a1d7..001a6a3 100644 --- a/testing/aws/deploy/pg17/main.tf +++ b/testing/aws/deploy/pg17/main.tf @@ -10,7 +10,6 @@ 
module "nonaurora" { engine_version = local.engine_version family = local.engine_family db_parameter_group = local.db_parameter_group - allowed_cidr_block = var.allowed_cidr_block } output "endpoint" { diff --git a/testing/aws/deploy/pg17/variables.tf b/testing/aws/deploy/pg17/variables.tf index 478bbb6..e69de29 100644 --- a/testing/aws/deploy/pg17/variables.tf +++ b/testing/aws/deploy/pg17/variables.tf @@ -1,4 +0,0 @@ -variable "allowed_cidr_block" { - description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" - type = string -} diff --git a/testing/aws/deploy/pg18/main.tf b/testing/aws/deploy/pg18/main.tf index 6c9a1d7..001a6a3 100644 --- a/testing/aws/deploy/pg18/main.tf +++ b/testing/aws/deploy/pg18/main.tf @@ -10,7 +10,6 @@ module "nonaurora" { engine_version = local.engine_version family = local.engine_family db_parameter_group = local.db_parameter_group - allowed_cidr_block = var.allowed_cidr_block } output "endpoint" { diff --git a/testing/aws/deploy/pg18/variables.tf b/testing/aws/deploy/pg18/variables.tf index 478bbb6..e69de29 100644 --- a/testing/aws/deploy/pg18/variables.tf +++ b/testing/aws/deploy/pg18/variables.tf @@ -1,4 +0,0 @@ -variable "allowed_cidr_block" { - description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" - type = string -} diff --git a/testing/aws/opentofu/modules/nonaurora/main.tf b/testing/aws/opentofu/modules/nonaurora/main.tf index 490a6d4..7aa97d7 100644 --- a/testing/aws/opentofu/modules/nonaurora/main.tf +++ b/testing/aws/opentofu/modules/nonaurora/main.tf @@ -12,35 +12,6 @@ data "aws_vpc" "default" { default = true } -resource "aws_security_group" "rds_sg" { - name = "${var.service}-rds-sg" - description = "Security group for ${var.service} RDS instance" - vpc_id = data.aws_vpc.default.id - - ingress { - description = "PostgreSQL access from public IP" - from_port = var.port - to_port = var.port - protocol = "tcp" - cidr_blocks = [var.allowed_cidr_block] - } - - egress { - 
description = "Allow all outbound traffic" - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } - - tags = merge( - var.required_tags, - { - Name = "${var.service}-rds-sg" - } - ) -} - resource "random_password" "password" { length = 20 special = false @@ -56,7 +27,7 @@ resource "aws_db_instance" "rds_instance" { instance_class = local.instance_class parameter_group_name = aws_db_parameter_group.param_group.name publicly_accessible = true - vpc_security_group_ids = [aws_security_group.rds_sg.id] + vpc_security_group_ids = ["sg-0333981e44680b34b"] allocated_storage = local.allocated_storage apply_immediately = var.apply_immediately skip_final_snapshot = true diff --git a/testing/aws/opentofu/modules/nonaurora/variables.tf b/testing/aws/opentofu/modules/nonaurora/variables.tf index f39c6e8..84bf9a6 100644 --- a/testing/aws/opentofu/modules/nonaurora/variables.tf +++ b/testing/aws/opentofu/modules/nonaurora/variables.tf @@ -23,12 +23,6 @@ variable "vpc_security_group_ids" { default = [] } -variable "allowed_cidr_block" { - description = "CIDR block allowed to access the RDS instance" - type = string - default = "" -} - variable "instance_class" { description = "Instance type to use" type = string diff --git a/testing/gcp/deploy/pg15/main.tf b/testing/gcp/deploy/pg15/main.tf index 59bf02e..587f25d 100644 --- a/testing/gcp/deploy/pg15/main.tf +++ b/testing/gcp/deploy/pg15/main.tf @@ -11,7 +11,7 @@ module "postgres" { region = local.region database_name = local.database_name db_user = local.db_user - personal_ip = var.personal_ip + authorized_networks = var.authorized_networks } output "instance_name" { diff --git a/testing/gcp/deploy/pg15/vars.tf b/testing/gcp/deploy/pg15/vars.tf index 0ea6a42..467cd69 100644 --- a/testing/gcp/deploy/pg15/vars.tf +++ b/testing/gcp/deploy/pg15/vars.tf @@ -1,4 +1,7 @@ -variable "personal_ip" { - description ="Personal IP to allow access to the instance" - type = string +variable "authorized_networks" { + 
description = "Authorized networks for Cloud SQL" + type = list(object({ + name = string + value = string + })) } diff --git a/testing/gcp/deploy/pg16/main.tf b/testing/gcp/deploy/pg16/main.tf index 59bf02e..587f25d 100644 --- a/testing/gcp/deploy/pg16/main.tf +++ b/testing/gcp/deploy/pg16/main.tf @@ -11,7 +11,7 @@ module "postgres" { region = local.region database_name = local.database_name db_user = local.db_user - personal_ip = var.personal_ip + authorized_networks = var.authorized_networks } output "instance_name" { diff --git a/testing/gcp/deploy/pg16/vars.tf b/testing/gcp/deploy/pg16/vars.tf index 0ea6a42..467cd69 100644 --- a/testing/gcp/deploy/pg16/vars.tf +++ b/testing/gcp/deploy/pg16/vars.tf @@ -1,4 +1,7 @@ -variable "personal_ip" { - description ="Personal IP to allow access to the instance" - type = string +variable "authorized_networks" { + description = "Authorized networks for Cloud SQL" + type = list(object({ + name = string + value = string + })) } diff --git a/testing/gcp/deploy/pg17/main.tf b/testing/gcp/deploy/pg17/main.tf index 59bf02e..587f25d 100644 --- a/testing/gcp/deploy/pg17/main.tf +++ b/testing/gcp/deploy/pg17/main.tf @@ -11,7 +11,7 @@ module "postgres" { region = local.region database_name = local.database_name db_user = local.db_user - personal_ip = var.personal_ip + authorized_networks = var.authorized_networks } output "instance_name" { diff --git a/testing/gcp/deploy/pg17/vars.tf b/testing/gcp/deploy/pg17/vars.tf index 0ea6a42..467cd69 100644 --- a/testing/gcp/deploy/pg17/vars.tf +++ b/testing/gcp/deploy/pg17/vars.tf @@ -1,4 +1,7 @@ -variable "personal_ip" { - description ="Personal IP to allow access to the instance" - type = string +variable "authorized_networks" { + description = "Authorized networks for Cloud SQL" + type = list(object({ + name = string + value = string + })) } diff --git a/testing/gcp/deploy/pg18/main.tf b/testing/gcp/deploy/pg18/main.tf index 59bf02e..587f25d 100644 --- a/testing/gcp/deploy/pg18/main.tf +++ 
b/testing/gcp/deploy/pg18/main.tf @@ -11,7 +11,7 @@ module "postgres" { region = local.region database_name = local.database_name db_user = local.db_user - personal_ip = var.personal_ip + authorized_networks = var.authorized_networks } output "instance_name" { diff --git a/testing/gcp/deploy/pg18/vars.tf b/testing/gcp/deploy/pg18/vars.tf index ab706b4..467cd69 100644 --- a/testing/gcp/deploy/pg18/vars.tf +++ b/testing/gcp/deploy/pg18/vars.tf @@ -1,4 +1,7 @@ -variable "personal_ip" { - description = "Personal IP to allow access to the instance" - type = string +variable "authorized_networks" { + description = "Authorized networks for Cloud SQL" + type = list(object({ + name = string + value = string + })) } diff --git a/testing/gcp/opentofu/models/postgres/main.tf b/testing/gcp/opentofu/models/postgres/main.tf index 36e1583..00676d2 100644 --- a/testing/gcp/opentofu/models/postgres/main.tf +++ b/testing/gcp/opentofu/models/postgres/main.tf @@ -3,6 +3,15 @@ resource "random_password" "password" { special = false } +locals { + resolved_authorized_networks = length(var.authorized_networks) > 0 ? 
var.authorized_networks : [ + { + name = "allow-personal" + value = var.personal_ip + } + ] +} + resource "google_sql_database_instance" "postgres" { name = var.instance_name database_version = var.postgres_version @@ -18,9 +27,13 @@ resource "google_sql_database_instance" "postgres" { ip_configuration { ipv4_enabled = true - authorized_networks { - name = "allow-personal" - value = var.personal_ip + dynamic "authorized_networks" { + for_each = local.resolved_authorized_networks + + content { + name = authorized_networks.value.name + value = authorized_networks.value.value + } } } diff --git a/testing/gcp/opentofu/models/postgres/variables.tf b/testing/gcp/opentofu/models/postgres/variables.tf index 794b064..35a0f64 100644 --- a/testing/gcp/opentofu/models/postgres/variables.tf +++ b/testing/gcp/opentofu/models/postgres/variables.tf @@ -36,3 +36,12 @@ variable "personal_ip" { type = string default = "0.0.0.0" } + +variable "authorized_networks" { + description = "List of authorized networks for Cloud SQL" + type = list(object({ + name = string + value = string + })) + default = [] +} From b3c86f87df5324483a24c5118e2a220e42972383 Mon Sep 17 00:00:00 2001 From: justin Date: Sun, 1 Mar 2026 21:24:54 -0700 Subject: [PATCH 14/14] new workflows, edits to deploys, integration test --- .github/workflows/README.md | 81 +++++++++ .github/workflows/azure-postgres-opentofu.yml | 154 ++++++++++++++++ .github/workflows/deploy-aws-rds.yml | 42 +++++ .github/workflows/deploy-gcp-postgres.yml | 40 +++- .github/workflows/managed-db-validate.yml | 171 ++++++++++++++++++ testing/aws/deploy/pg15/main.tf | 10 + testing/aws/deploy/pg15/variables.tf | 9 + testing/aws/deploy/pg16/main.tf | 10 + testing/aws/deploy/pg16/variables.tf | 10 + testing/aws/deploy/pg17/main.tf | 10 + testing/aws/deploy/pg17/variables.tf | 10 + testing/aws/deploy/pg18/main.tf | 10 + testing/aws/deploy/pg18/variables.tf | 10 + .../aws/opentofu/modules/nonaurora/main.tf | 2 +- 
.../opentofu/modules/nonaurora/variables.tf | 7 + testing/azure/deploy/pg15/locals.tf | 2 +- testing/azure/deploy/pg15/main.tf | 1 + testing/azure/deploy/pg15/variables.tf | 6 + testing/azure/deploy/pg16/locals.tf | 2 +- testing/azure/deploy/pg16/main.tf | 1 + testing/azure/deploy/pg16/variables.tf | 6 + testing/azure/deploy/pg17/locals.tf | 2 +- testing/azure/deploy/pg17/main.tf | 1 + testing/azure/deploy/pg17/variables.tf | 6 + testing/azure/deploy/pg18/locals.tf | 2 +- testing/azure/deploy/pg18/main.tf | 1 + testing/azure/deploy/pg18/variables.tf | 6 + .../azure/opentofu/modules/postgres/main.tf | 4 +- .../opentofu/modules/postgres/variables.tf | 7 + testing/gcp/deploy/pg15/main.tf | 1 + testing/gcp/deploy/pg15/vars.tf | 6 + testing/gcp/deploy/pg16/main.tf | 1 + testing/gcp/deploy/pg16/vars.tf | 6 + testing/gcp/deploy/pg17/main.tf | 1 + testing/gcp/deploy/pg17/vars.tf | 6 + testing/gcp/deploy/pg18/main.tf | 1 + testing/gcp/deploy/pg18/vars.tf | 6 + testing/gcp/opentofu/models/postgres/main.tf | 2 +- .../gcp/opentofu/models/postgres/variables.tf | 7 + 39 files changed, 649 insertions(+), 11 deletions(-) create mode 100644 .github/workflows/README.md create mode 100644 .github/workflows/azure-postgres-opentofu.yml create mode 100644 .github/workflows/managed-db-validate.yml diff --git a/.github/workflows/README.md b/.github/workflows/README.md new file mode 100644 index 0000000..6168356 --- /dev/null +++ b/.github/workflows/README.md @@ -0,0 +1,81 @@ +# Manual Cloud Deploy Workflows + +This repo uses three manual deployment workflows and one reusable validation workflow: + +- `deploy-aws-rds.yml` +- `deploy-gcp-postgres.yml` +- `azure-postgres-opentofu.yml` +- `managed-db-validate.yml` (reusable via `workflow_call`) + +Deploy workflows are run manually from the Actions tab. 
+ +AWS and GCP also support trusted PR comment triggers: + +- AWS: `/deploy-aws-rds [target|command] [command]` +- GCP: `/deploy-gcp-pg [target|command] [command]` + +## Deploy Inputs + +AWS and GCP workflows support `target` (`pg15`-`pg18` or `all`) and `command` (`plan`, `apply`, `destroy`). + +Azure workflow supports: + +- `action`: `plan`, `apply`, `destroy` +- `postgres_version`: `pg15`, `pg16`, `pg17`, `pg18` +- `personal_ip`: optional (falls back to secret) + +## Secrets + +### AWS + +- `AWS_ACCESS_KEY_ID` +- `AWS_SECRET_ACCESS_KEY` +- `AWS_ALLOWED_CIDR_BLOCK` +- `AWS_DB_PASSWORD` + +### GCP + +- `GCP_SA_KEY` +- `DEPLOY_PERSONAL_IP_CIDR` (unless provided as workflow input) +- `GCP_DB_PASSWORD` + +### Azure + +- OIDC: `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_SUBSCRIPTION_ID` + - or service principal JSON: `AZURE_CREDENTIALS` +- `AZURE_PERSONAL_IP` (unless provided as workflow input) +- `AZURE_DB_PASSWORD` + +### Shared deploy controls + +- `DEPLOY_TRIGGER_USER` (used by AWS/GCP manual and comment-triggered deploy checks) + +## Validation Workflow + +`managed-db-validate.yml` installs `pgFirstAid.sql`, recreates `view_pgFirstAid_managed.sql`, and runs integration tests (including pgTAP coverage through the integration test harness). + +It supports three connection modes: + +- `direct`: caller passes `pg_host` +- `aws`: resolves host from `aws_db_identifier` +- `gcp`: resolves host from `gcp_project_id` + `gcp_instance_name` + +Current wiring: + +- Azure apply calls `managed-db-validate.yml` automatically after deploy. +- AWS apply calls `managed-db-validate.yml` for each selected version after deploy. +- GCP apply calls `managed-db-validate.yml` for each selected version after deploy. + +## Secret Handling + +- DB passwords are passed to OpenTofu as `TF_VAR_db_password`. +- Password variables in the OpenTofu stacks are marked `sensitive = true`. +- Workflows use step-level environment variables and masking for secret values used in shell steps. 
+- Avoid printing secret values in custom debug statements. + +## Recommended Run Order + +1. Run `plan` +2. Run `apply` +3. Confirm validation results +4. Run `destroy` when done with test resources diff --git a/.github/workflows/azure-postgres-opentofu.yml b/.github/workflows/azure-postgres-opentofu.yml new file mode 100644 index 0000000..32f519d --- /dev/null +++ b/.github/workflows/azure-postgres-opentofu.yml @@ -0,0 +1,154 @@ +name: Azure PostgreSQL OpenTofu + +on: + workflow_dispatch: + inputs: + action: + description: "OpenTofu action" + required: true + type: choice + default: plan + options: + - plan + - apply + - destroy + postgres_version: + description: "Target PostgreSQL version" + required: true + type: choice + default: pg18 + options: + - pg15 + - pg16 + - pg17 + - pg18 + personal_ip: + description: "IP allowed to connect (example: 203.0.113.10). Leave blank to use AZURE_PERSONAL_IP secret." + required: false + type: string + +concurrency: + group: azure-postgres-${{ inputs.postgres_version }} + cancel-in-progress: false + +jobs: + opentofu: + name: ${{ inputs.action }} ${{ inputs.postgres_version }} + runs-on: [self-hosted, linux, pgfirstaid-ci] + outputs: + pg_host: ${{ steps.capture_connection.outputs.pg_host }} + pg_port: ${{ steps.capture_connection.outputs.pg_port }} + pg_user: ${{ steps.capture_connection.outputs.pg_user }} + pg_database: ${{ steps.capture_connection.outputs.pg_database }} + permissions: + contents: read + id-token: write + defaults: + run: + working-directory: testing/azure/deploy/${{ inputs.postgres_version }} + + env: + TF_IN_AUTOMATION: "true" + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + clean: false + + - name: Azure login (OIDC) + if: ${{ secrets.AZURE_CLIENT_ID != '' && secrets.AZURE_TENANT_ID != '' && secrets.AZURE_SUBSCRIPTION_ID != '' }} + uses: azure/login@v2 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ 
secrets.AZURE_SUBSCRIPTION_ID }} + + - name: Azure login (service principal JSON) + if: ${{ !(secrets.AZURE_CLIENT_ID != '' && secrets.AZURE_TENANT_ID != '' && secrets.AZURE_SUBSCRIPTION_ID != '') }} + uses: azure/login@v2 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + + - name: Setup OpenTofu + uses: opentofu/setup-opentofu@v1 + + - name: Resolve personal IP + shell: bash + env: + AZURE_PERSONAL_IP: ${{ secrets.AZURE_PERSONAL_IP }} + run: | + PERSONAL_IP="${{ inputs.personal_ip }}" + if [ -z "$PERSONAL_IP" ]; then + PERSONAL_IP="$AZURE_PERSONAL_IP" + fi + + if [ -z "$PERSONAL_IP" ]; then + echo "::error::No personal IP provided. Set input 'personal_ip' or secret 'AZURE_PERSONAL_IP'." + exit 1 + fi + + echo "TF_VAR_personal_ip=$PERSONAL_IP" >> "$GITHUB_ENV" + + - name: Resolve DB password + shell: bash + env: + AZURE_DB_PASSWORD: ${{ secrets.AZURE_DB_PASSWORD }} + run: | + DB_PASSWORD="$AZURE_DB_PASSWORD" + + if [ -z "$DB_PASSWORD" ]; then + echo "::error::Missing secret 'AZURE_DB_PASSWORD'." 
+ exit 1 + fi + + echo "::add-mask::$DB_PASSWORD" + echo "TF_VAR_db_password=$DB_PASSWORD" >> "$GITHUB_ENV" + + - name: OpenTofu init + run: tofu init -input=false + + - name: OpenTofu validate + run: tofu validate + + - name: OpenTofu plan + if: ${{ inputs.action == 'plan' || inputs.action == 'apply' }} + run: tofu plan -input=false -out=tfplan + + - name: OpenTofu apply + if: ${{ inputs.action == 'apply' }} + run: tofu apply -input=false -auto-approve tfplan + + - name: Show connection details + if: ${{ inputs.action == 'apply' }} + run: | + echo "Server: $(tofu output -raw server_name)" + echo "FQDN: $(tofu output -raw server_fqdn)" + echo "Database: $(tofu output -raw database_name)" + + - name: Capture connection outputs + id: capture_connection + if: ${{ inputs.action == 'apply' }} + run: | + echo "pg_host=$(tofu output -raw server_fqdn)" >> "$GITHUB_OUTPUT" + echo "pg_port=5432" >> "$GITHUB_OUTPUT" + echo "pg_user=$(tofu output -raw db_user)" >> "$GITHUB_OUTPUT" + echo "pg_database=$(tofu output -raw database_name)" >> "$GITHUB_OUTPUT" + + - name: OpenTofu destroy + if: ${{ inputs.action == 'destroy' }} + run: tofu destroy -input=false -auto-approve + + validate: + if: ${{ inputs.action == 'apply' }} + needs: opentofu + uses: ./.github/workflows/managed-db-validate.yml + with: + pg_host: ${{ needs.opentofu.outputs.pg_host }} + pg_port: ${{ needs.opentofu.outputs.pg_port }} + pg_user: ${{ needs.opentofu.outputs.pg_user }} + pg_database: ${{ needs.opentofu.outputs.pg_database }} + pg_sslmode: require + test_view_mode: managed + secrets: + pg_password: ${{ secrets.AZURE_DB_PASSWORD }} diff --git a/.github/workflows/deploy-aws-rds.yml b/.github/workflows/deploy-aws-rds.yml index 3ffceff..7ea6824 100644 --- a/.github/workflows/deploy-aws-rds.yml +++ b/.github/workflows/deploy-aws-rds.yml @@ -179,6 +179,25 @@ jobs: with: tofu_version: latest + - name: Resolve OpenTofu variables + env: + AWS_ALLOWED_CIDR_BLOCK: ${{ secrets.AWS_ALLOWED_CIDR_BLOCK }} + 
AWS_DB_PASSWORD: ${{ secrets.AWS_DB_PASSWORD }} + run: | + if [ -z "$AWS_ALLOWED_CIDR_BLOCK" ]; then + echo "::error::Missing secret AWS_ALLOWED_CIDR_BLOCK" + exit 1 + fi + + if [ -z "$AWS_DB_PASSWORD" ]; then + echo "::error::Missing secret AWS_DB_PASSWORD" + exit 1 + fi + + echo "::add-mask::$AWS_DB_PASSWORD" + echo "TF_VAR_allowed_cidr_block=$AWS_ALLOWED_CIDR_BLOCK" >> "$GITHUB_ENV" + echo "TF_VAR_db_password=$AWS_DB_PASSWORD" >> "$GITHUB_ENV" + - name: OpenTofu init working-directory: testing/aws/deploy/${{ matrix.postgres_version }} run: tofu init -input=false @@ -213,3 +232,26 @@ jobs: echo "- Version: ${{ matrix.postgres_version }}" echo "- Endpoint: ${endpoint}" } >> "$GITHUB_STEP_SUMMARY" + + validate: + if: needs.resolve.outputs.command == 'apply' + name: validate ${{ matrix.postgres_version }} + needs: [resolve, deploy] + strategy: + fail-fast: false + matrix: + postgres_version: ${{ fromJSON(needs.resolve.outputs.versions_json) }} + uses: ./.github/workflows/managed-db-validate.yml + with: + cloud_provider: aws + aws_region: us-west-2 + aws_db_identifier: ${{ matrix.postgres_version }} + pg_port: "5432" + pg_user: randoneering + pg_database: pgFirstAid + pg_sslmode: require + test_view_mode: managed + secrets: + pg_password: ${{ secrets.AWS_DB_PASSWORD }} + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/deploy-gcp-postgres.yml b/.github/workflows/deploy-gcp-postgres.yml index 439895a..ee72675 100644 --- a/.github/workflows/deploy-gcp-postgres.yml +++ b/.github/workflows/deploy-gcp-postgres.yml @@ -73,6 +73,8 @@ jobs: - name: Resolve deploy arguments id: args + env: + DEPLOY_PERSONAL_IP_CIDR: ${{ secrets.DEPLOY_PERSONAL_IP_CIDR }} run: | if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then target="${{ inputs.target }}" @@ -132,8 +134,8 @@ jobs: if [ -n "${personal_ip_input}" ]; then personal_ip_cidr="${personal_ip_input}" - elif [ -n "${{ 
secrets.DEPLOY_PERSONAL_IP_CIDR }}" ]; then - personal_ip_cidr="${{ secrets.DEPLOY_PERSONAL_IP_CIDR }}" + elif [ -n "$DEPLOY_PERSONAL_IP_CIDR" ]; then + personal_ip_cidr="$DEPLOY_PERSONAL_IP_CIDR" else echo "::error::Set input personal_ip_cidr or repository secret DEPLOY_PERSONAL_IP_CIDR" exit 1 @@ -201,6 +203,18 @@ jobs: with: tofu_version: latest + - name: Resolve OpenTofu variables + env: + GCP_DB_PASSWORD: ${{ secrets.GCP_DB_PASSWORD }} + run: | + if [ -z "$GCP_DB_PASSWORD" ]; then + echo "::error::Missing secret GCP_DB_PASSWORD" + exit 1 + fi + + echo "::add-mask::$GCP_DB_PASSWORD" + echo "TF_VAR_db_password=$GCP_DB_PASSWORD" >> "$GITHUB_ENV" + - name: OpenTofu init working-directory: testing/gcp/deploy/${{ matrix.postgres_version }} run: tofu init -input=false @@ -247,3 +261,25 @@ jobs: echo "- Instance: ${instance_name}" echo "- Public IP: ${public_ip_address}" } >> "$GITHUB_STEP_SUMMARY" + + validate: + if: needs.resolve.outputs.command == 'apply' + name: validate ${{ matrix.postgres_version }} + needs: [resolve, deploy] + strategy: + fail-fast: false + matrix: + postgres_version: ${{ fromJSON(needs.resolve.outputs.versions_json) }} + uses: ./.github/workflows/managed-db-validate.yml + with: + cloud_provider: gcp + gcp_project_id: pgfirstaid + gcp_instance_name: ${{ format('pgfirstaid-{0}', matrix.postgres_version) }} + pg_port: "5432" + pg_user: randoneering + pg_database: pgFirstAid + pg_sslmode: require + test_view_mode: managed + secrets: + pg_password: ${{ secrets.GCP_DB_PASSWORD }} + gcp_sa_key: ${{ secrets.GCP_SA_KEY }} diff --git a/.github/workflows/managed-db-validate.yml b/.github/workflows/managed-db-validate.yml new file mode 100644 index 0000000..eb85e73 --- /dev/null +++ b/.github/workflows/managed-db-validate.yml @@ -0,0 +1,171 @@ +name: Managed DB Validate + +on: + workflow_call: + inputs: + cloud_provider: + required: false + type: string + default: direct + pg_host: + required: false + type: string + default: "" + pg_port: + required: 
false + type: string + default: "5432" + pg_user: + required: true + type: string + pg_database: + required: true + type: string + pg_sslmode: + required: false + type: string + default: require + test_view_mode: + required: false + type: string + default: managed + aws_region: + required: false + type: string + default: us-west-2 + aws_db_identifier: + required: false + type: string + default: "" + gcp_project_id: + required: false + type: string + default: "" + gcp_instance_name: + required: false + type: string + default: "" + secrets: + pg_password: + required: true + aws_access_key_id: + required: false + aws_secret_access_key: + required: false + gcp_sa_key: + required: false + +jobs: + validate: + runs-on: [self-hosted, linux, pgfirstaid-ci] + permissions: + contents: read + defaults: + run: + working-directory: testing/integration + env: + PGHOST: ${{ inputs.pg_host }} + PGPORT: ${{ inputs.pg_port }} + PGUSER: ${{ inputs.pg_user }} + PGPASSWORD: ${{ secrets.pg_password }} + PGDATABASE: ${{ inputs.pg_database }} + PGSSLMODE: ${{ inputs.pg_sslmode }} + PGFA_TEST_VIEW_MODE: ${{ inputs.test_view_mode }} + PGFA_TEST_ACTIVE_CONN_TARGET: "52" + PGFA_TEST_ACTIVE_CONN_SLEEP_SECONDS: "20" + PGFA_TEST_WAIT_TIMEOUT_SECONDS: "45" + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Configure AWS credentials + if: ${{ inputs.cloud_provider == 'aws' }} + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.aws_access_key_id }} + aws-secret-access-key: ${{ secrets.aws_secret_access_key }} + aws-region: ${{ inputs.aws_region }} + + - name: Authenticate to Google Cloud + if: ${{ inputs.cloud_provider == 'gcp' }} + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.gcp_sa_key }} + + - name: Setup gcloud + if: ${{ inputs.cloud_provider == 'gcp' }} + uses: google-github-actions/setup-gcloud@v2 + + - name: Resolve PostgreSQL host + shell: bash + run: | + host="${PGHOST}" + + if [ -z "$host" ] && [ "${{ 
inputs.cloud_provider }}" = "aws" ]; then + if [ -z "${{ inputs.aws_db_identifier }}" ]; then + echo "::error::aws_db_identifier is required when cloud_provider=aws and pg_host is not set" + exit 1 + fi + + host="$(aws rds describe-db-instances --db-instance-identifier "${{ inputs.aws_db_identifier }}" --query 'DBInstances[0].Endpoint.Address' --output text)" + fi + + if [ -z "$host" ] && [ "${{ inputs.cloud_provider }}" = "gcp" ]; then + if [ -z "${{ inputs.gcp_project_id }}" ] || [ -z "${{ inputs.gcp_instance_name }}" ]; then + echo "::error::gcp_project_id and gcp_instance_name are required when cloud_provider=gcp and pg_host is not set" + exit 1 + fi + + host="$(gcloud sql instances describe "${{ inputs.gcp_instance_name }}" --project "${{ inputs.gcp_project_id }}" --format='value(ipAddresses[0].ipAddress)')" + fi + + if [ -z "$host" ]; then + echo "::error::Could not determine PGHOST. Provide pg_host or cloud discovery inputs." + exit 1 + fi + + echo "PGHOST=$host" >> "$GITHUB_ENV" + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.14" + + - name: Install uv + uses: astral-sh/setup-uv@v4 + + - name: Validate required PG env vars + run: | + missing=0 + for var in PGHOST PGPORT PGUSER PGPASSWORD PGDATABASE; do + if [ -z "${!var}" ]; then + echo "::error::Missing required secret/env: ${var}" + missing=1 + fi + done + if [ "$missing" -ne 0 ]; then + exit 1 + fi + + - name: Verify PostgreSQL client is installed + run: | + if ! command -v psql >/dev/null 2>&1; then + echo "::error::psql not found on runner. Install postgresql-client on the self-hosted VM." 
+ exit 1 + fi + psql --version + + - name: Sync dependencies + run: uv sync + + - name: Install pgFirstAid function + run: psql -v ON_ERROR_STOP=1 -f ../../pgFirstAid.sql + + - name: Recreate managed view only + run: | + psql -v ON_ERROR_STOP=1 -c "DROP VIEW IF EXISTS v_pgfirstaid" + psql -v ON_ERROR_STOP=1 -f ../../view_pgFirstAid_managed.sql + + - name: Run integration tests (includes pgTAP suite) + run: uv run python -m pytest tests/integration -m integration diff --git a/testing/aws/deploy/pg15/main.tf b/testing/aws/deploy/pg15/main.tf index 001a6a3..801cea2 100644 --- a/testing/aws/deploy/pg15/main.tf +++ b/testing/aws/deploy/pg15/main.tf @@ -10,8 +10,18 @@ module "nonaurora" { engine_version = local.engine_version family = local.engine_family db_parameter_group = local.db_parameter_group + allowed_cidr_block = var.allowed_cidr_block + db_password = var.db_password } output "endpoint" { value = module.nonaurora.rds_instance_address } + +output "database_name" { + value = local.database_name +} + +output "db_user" { + value = "randoneering" +} diff --git a/testing/aws/deploy/pg15/variables.tf b/testing/aws/deploy/pg15/variables.tf index 8b13789..3b35d84 100644 --- a/testing/aws/deploy/pg15/variables.tf +++ b/testing/aws/deploy/pg15/variables.tf @@ -1 +1,10 @@ +variable "allowed_cidr_block" { + description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" + type = string +} +variable "db_password" { + description = "Master DB password" + type = string + sensitive = true +} diff --git a/testing/aws/deploy/pg16/main.tf b/testing/aws/deploy/pg16/main.tf index 001a6a3..801cea2 100644 --- a/testing/aws/deploy/pg16/main.tf +++ b/testing/aws/deploy/pg16/main.tf @@ -10,8 +10,18 @@ module "nonaurora" { engine_version = local.engine_version family = local.engine_family db_parameter_group = local.db_parameter_group + allowed_cidr_block = var.allowed_cidr_block + db_password = var.db_password } output "endpoint" { value = 
module.nonaurora.rds_instance_address } + +output "database_name" { + value = local.database_name +} + +output "db_user" { + value = "randoneering" +} diff --git a/testing/aws/deploy/pg16/variables.tf b/testing/aws/deploy/pg16/variables.tf index e69de29..3b35d84 100644 --- a/testing/aws/deploy/pg16/variables.tf +++ b/testing/aws/deploy/pg16/variables.tf @@ -0,0 +1,10 @@ +variable "allowed_cidr_block" { + description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" + type = string +} + +variable "db_password" { + description = "Master DB password" + type = string + sensitive = true +} diff --git a/testing/aws/deploy/pg17/main.tf b/testing/aws/deploy/pg17/main.tf index 001a6a3..801cea2 100644 --- a/testing/aws/deploy/pg17/main.tf +++ b/testing/aws/deploy/pg17/main.tf @@ -10,8 +10,18 @@ module "nonaurora" { engine_version = local.engine_version family = local.engine_family db_parameter_group = local.db_parameter_group + allowed_cidr_block = var.allowed_cidr_block + db_password = var.db_password } output "endpoint" { value = module.nonaurora.rds_instance_address } + +output "database_name" { + value = local.database_name +} + +output "db_user" { + value = "randoneering" +} diff --git a/testing/aws/deploy/pg17/variables.tf b/testing/aws/deploy/pg17/variables.tf index e69de29..3b35d84 100644 --- a/testing/aws/deploy/pg17/variables.tf +++ b/testing/aws/deploy/pg17/variables.tf @@ -0,0 +1,10 @@ +variable "allowed_cidr_block" { + description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" + type = string +} + +variable "db_password" { + description = "Master DB password" + type = string + sensitive = true +} diff --git a/testing/aws/deploy/pg18/main.tf b/testing/aws/deploy/pg18/main.tf index 001a6a3..801cea2 100644 --- a/testing/aws/deploy/pg18/main.tf +++ b/testing/aws/deploy/pg18/main.tf @@ -10,8 +10,18 @@ module "nonaurora" { engine_version = local.engine_version family = local.engine_family db_parameter_group = 
local.db_parameter_group + allowed_cidr_block = var.allowed_cidr_block + db_password = var.db_password } output "endpoint" { value = module.nonaurora.rds_instance_address } + +output "database_name" { + value = local.database_name +} + +output "db_user" { + value = "randoneering" +} diff --git a/testing/aws/deploy/pg18/variables.tf b/testing/aws/deploy/pg18/variables.tf index e69de29..3b35d84 100644 --- a/testing/aws/deploy/pg18/variables.tf +++ b/testing/aws/deploy/pg18/variables.tf @@ -0,0 +1,10 @@ +variable "allowed_cidr_block" { + description = "CIDR block allowed to access the RDS instance (e.g., 1.2.3.4/32)" + type = string +} + +variable "db_password" { + description = "Master DB password" + type = string + sensitive = true +} diff --git a/testing/aws/opentofu/modules/nonaurora/main.tf b/testing/aws/opentofu/modules/nonaurora/main.tf index 7aa97d7..a25824b 100644 --- a/testing/aws/opentofu/modules/nonaurora/main.tf +++ b/testing/aws/opentofu/modules/nonaurora/main.tf @@ -23,7 +23,7 @@ resource "aws_db_instance" "rds_instance" { engine_version = var.engine_version db_name = var.database_name username = local.username - password = random_password.password.result + password = var.db_password != "" ? var.db_password : random_password.password.result instance_class = local.instance_class parameter_group_name = aws_db_parameter_group.param_group.name publicly_accessible = true diff --git a/testing/aws/opentofu/modules/nonaurora/variables.tf b/testing/aws/opentofu/modules/nonaurora/variables.tf index 84bf9a6..033844f 100644 --- a/testing/aws/opentofu/modules/nonaurora/variables.tf +++ b/testing/aws/opentofu/modules/nonaurora/variables.tf @@ -41,6 +41,13 @@ variable "username" { default = "randoneering" } +variable "db_password" { + description = "Master DB password. 
If empty, a random password is generated" + type = string + default = "" + sensitive = true +} + variable "skip_final_snapshot" { description = "Should a final snapshot be created on instance destroy" type = bool diff --git a/testing/azure/deploy/pg15/locals.tf b/testing/azure/deploy/pg15/locals.tf index 48da82a..92307ef 100644 --- a/testing/azure/deploy/pg15/locals.tf +++ b/testing/azure/deploy/pg15/locals.tf @@ -1,5 +1,5 @@ locals { - location = "eastus" + location = "westus" server_name = "pgfirstaid-pg15" postgres_version = "15" database_name = "pgFirstAid" diff --git a/testing/azure/deploy/pg15/main.tf b/testing/azure/deploy/pg15/main.tf index c3fdcca..3dd441b 100644 --- a/testing/azure/deploy/pg15/main.tf +++ b/testing/azure/deploy/pg15/main.tf @@ -10,6 +10,7 @@ module "postgres" { location = local.location database_name = local.database_name db_user = local.db_user + db_password = var.db_password personal_ip = var.personal_ip } diff --git a/testing/azure/deploy/pg15/variables.tf b/testing/azure/deploy/pg15/variables.tf index a7f01df..c914c51 100644 --- a/testing/azure/deploy/pg15/variables.tf +++ b/testing/azure/deploy/pg15/variables.tf @@ -2,3 +2,9 @@ variable "personal_ip" { description = "Personal IP to allow access to the server (format: x.x.x.x)" type = string } + +variable "db_password" { + description = "Database admin password" + type = string + sensitive = true +} diff --git a/testing/azure/deploy/pg16/locals.tf b/testing/azure/deploy/pg16/locals.tf index 882a558..f7d69a6 100644 --- a/testing/azure/deploy/pg16/locals.tf +++ b/testing/azure/deploy/pg16/locals.tf @@ -1,5 +1,5 @@ locals { - location = "eastus" + location = "westus" server_name = "pgfirstaid-pg16" postgres_version = "16" database_name = "pgFirstAid" diff --git a/testing/azure/deploy/pg16/main.tf b/testing/azure/deploy/pg16/main.tf index c3fdcca..3dd441b 100644 --- a/testing/azure/deploy/pg16/main.tf +++ b/testing/azure/deploy/pg16/main.tf @@ -10,6 +10,7 @@ module "postgres" { location = 
local.location database_name = local.database_name db_user = local.db_user + db_password = var.db_password personal_ip = var.personal_ip } diff --git a/testing/azure/deploy/pg16/variables.tf b/testing/azure/deploy/pg16/variables.tf index a7f01df..c914c51 100644 --- a/testing/azure/deploy/pg16/variables.tf +++ b/testing/azure/deploy/pg16/variables.tf @@ -2,3 +2,9 @@ variable "personal_ip" { description = "Personal IP to allow access to the server (format: x.x.x.x)" type = string } + +variable "db_password" { + description = "Database admin password" + type = string + sensitive = true +} diff --git a/testing/azure/deploy/pg17/locals.tf b/testing/azure/deploy/pg17/locals.tf index c6eb5bd..87421d6 100644 --- a/testing/azure/deploy/pg17/locals.tf +++ b/testing/azure/deploy/pg17/locals.tf @@ -1,5 +1,5 @@ locals { - location = "eastus" + location = "westus" server_name = "pgfirstaid-pg17" postgres_version = "17" database_name = "pgFirstAid" diff --git a/testing/azure/deploy/pg17/main.tf b/testing/azure/deploy/pg17/main.tf index c3fdcca..3dd441b 100644 --- a/testing/azure/deploy/pg17/main.tf +++ b/testing/azure/deploy/pg17/main.tf @@ -10,6 +10,7 @@ module "postgres" { location = local.location database_name = local.database_name db_user = local.db_user + db_password = var.db_password personal_ip = var.personal_ip } diff --git a/testing/azure/deploy/pg17/variables.tf b/testing/azure/deploy/pg17/variables.tf index a7f01df..c914c51 100644 --- a/testing/azure/deploy/pg17/variables.tf +++ b/testing/azure/deploy/pg17/variables.tf @@ -2,3 +2,9 @@ variable "personal_ip" { description = "Personal IP to allow access to the server (format: x.x.x.x)" type = string } + +variable "db_password" { + description = "Database admin password" + type = string + sensitive = true +} diff --git a/testing/azure/deploy/pg18/locals.tf b/testing/azure/deploy/pg18/locals.tf index e8d9561..9a110a4 100644 --- a/testing/azure/deploy/pg18/locals.tf +++ b/testing/azure/deploy/pg18/locals.tf @@ -1,5 +1,5 @@ 
locals { - location = "eastus" + location = "westus" server_name = "pgfirstaid-pg18" postgres_version = "18" database_name = "pgFirstAid" diff --git a/testing/azure/deploy/pg18/main.tf b/testing/azure/deploy/pg18/main.tf index c3fdcca..3dd441b 100644 --- a/testing/azure/deploy/pg18/main.tf +++ b/testing/azure/deploy/pg18/main.tf @@ -10,6 +10,7 @@ module "postgres" { location = local.location database_name = local.database_name db_user = local.db_user + db_password = var.db_password personal_ip = var.personal_ip } diff --git a/testing/azure/deploy/pg18/variables.tf b/testing/azure/deploy/pg18/variables.tf index a7f01df..c914c51 100644 --- a/testing/azure/deploy/pg18/variables.tf +++ b/testing/azure/deploy/pg18/variables.tf @@ -2,3 +2,9 @@ variable "personal_ip" { description = "Personal IP to allow access to the server (format: x.x.x.x)" type = string } + +variable "db_password" { + description = "Database admin password" + type = string + sensitive = true +} diff --git a/testing/azure/opentofu/modules/postgres/main.tf b/testing/azure/opentofu/modules/postgres/main.tf index b5726ba..375e36d 100644 --- a/testing/azure/opentofu/modules/postgres/main.tf +++ b/testing/azure/opentofu/modules/postgres/main.tf @@ -15,7 +15,7 @@ resource "azurerm_postgresql_flexible_server" "postgres" { version = var.postgres_version administrator_login = var.db_user - administrator_password = random_password.password.result + administrator_password = var.db_password != "" ? 
var.db_password : random_password.password.result sku_name = "B_Standard_B1ms" storage_mb = 32768 @@ -23,8 +23,6 @@ resource "azurerm_postgresql_flexible_server" "postgres" { backup_retention_days = 7 geo_redundant_backup_enabled = false - zone = "1" - lifecycle { ignore_changes = [ zone, diff --git a/testing/azure/opentofu/modules/postgres/variables.tf b/testing/azure/opentofu/modules/postgres/variables.tf index a171962..3ae2171 100644 --- a/testing/azure/opentofu/modules/postgres/variables.tf +++ b/testing/azure/opentofu/modules/postgres/variables.tf @@ -31,6 +31,13 @@ variable "db_user" { default = "randoneering" } +variable "db_password" { + description = "Database admin password. If empty, a random password is generated" + type = string + default = "" + sensitive = true +} + variable "personal_ip" { description = "Personal IP to allow connections from (format: x.x.x.x)" type = string diff --git a/testing/gcp/deploy/pg15/main.tf b/testing/gcp/deploy/pg15/main.tf index 587f25d..e765a22 100644 --- a/testing/gcp/deploy/pg15/main.tf +++ b/testing/gcp/deploy/pg15/main.tf @@ -12,6 +12,7 @@ module "postgres" { database_name = local.database_name db_user = local.db_user authorized_networks = var.authorized_networks + db_password = var.db_password } output "instance_name" { diff --git a/testing/gcp/deploy/pg15/vars.tf b/testing/gcp/deploy/pg15/vars.tf index 467cd69..1c4a1c4 100644 --- a/testing/gcp/deploy/pg15/vars.tf +++ b/testing/gcp/deploy/pg15/vars.tf @@ -5,3 +5,9 @@ variable "authorized_networks" { value = string })) } + +variable "db_password" { + description = "Database user password" + type = string + sensitive = true +} diff --git a/testing/gcp/deploy/pg16/main.tf b/testing/gcp/deploy/pg16/main.tf index 587f25d..e765a22 100644 --- a/testing/gcp/deploy/pg16/main.tf +++ b/testing/gcp/deploy/pg16/main.tf @@ -12,6 +12,7 @@ module "postgres" { database_name = local.database_name db_user = local.db_user authorized_networks = var.authorized_networks + db_password = 
var.db_password } output "instance_name" { diff --git a/testing/gcp/deploy/pg16/vars.tf b/testing/gcp/deploy/pg16/vars.tf index 467cd69..1c4a1c4 100644 --- a/testing/gcp/deploy/pg16/vars.tf +++ b/testing/gcp/deploy/pg16/vars.tf @@ -5,3 +5,9 @@ variable "authorized_networks" { value = string })) } + +variable "db_password" { + description = "Database user password" + type = string + sensitive = true +} diff --git a/testing/gcp/deploy/pg17/main.tf b/testing/gcp/deploy/pg17/main.tf index 587f25d..e765a22 100644 --- a/testing/gcp/deploy/pg17/main.tf +++ b/testing/gcp/deploy/pg17/main.tf @@ -12,6 +12,7 @@ module "postgres" { database_name = local.database_name db_user = local.db_user authorized_networks = var.authorized_networks + db_password = var.db_password } output "instance_name" { diff --git a/testing/gcp/deploy/pg17/vars.tf b/testing/gcp/deploy/pg17/vars.tf index 467cd69..1c4a1c4 100644 --- a/testing/gcp/deploy/pg17/vars.tf +++ b/testing/gcp/deploy/pg17/vars.tf @@ -5,3 +5,9 @@ variable "authorized_networks" { value = string })) } + +variable "db_password" { + description = "Database user password" + type = string + sensitive = true +} diff --git a/testing/gcp/deploy/pg18/main.tf b/testing/gcp/deploy/pg18/main.tf index 587f25d..e765a22 100644 --- a/testing/gcp/deploy/pg18/main.tf +++ b/testing/gcp/deploy/pg18/main.tf @@ -12,6 +12,7 @@ module "postgres" { database_name = local.database_name db_user = local.db_user authorized_networks = var.authorized_networks + db_password = var.db_password } output "instance_name" { diff --git a/testing/gcp/deploy/pg18/vars.tf b/testing/gcp/deploy/pg18/vars.tf index 467cd69..1c4a1c4 100644 --- a/testing/gcp/deploy/pg18/vars.tf +++ b/testing/gcp/deploy/pg18/vars.tf @@ -5,3 +5,9 @@ variable "authorized_networks" { value = string })) } + +variable "db_password" { + description = "Database user password" + type = string + sensitive = true +} diff --git a/testing/gcp/opentofu/models/postgres/main.tf 
b/testing/gcp/opentofu/models/postgres/main.tf index 00676d2..9146940 100644 --- a/testing/gcp/opentofu/models/postgres/main.tf +++ b/testing/gcp/opentofu/models/postgres/main.tf @@ -53,5 +53,5 @@ resource "google_sql_database" "database" { resource "google_sql_user" "user" { name = var.db_user instance = google_sql_database_instance.postgres.name - password = random_password.password.result + password = var.db_password != "" ? var.db_password : random_password.password.result } diff --git a/testing/gcp/opentofu/models/postgres/variables.tf b/testing/gcp/opentofu/models/postgres/variables.tf index 35a0f64..45dccd1 100644 --- a/testing/gcp/opentofu/models/postgres/variables.tf +++ b/testing/gcp/opentofu/models/postgres/variables.tf @@ -31,6 +31,13 @@ variable "db_user" { default = "randoneering" } +variable "db_password" { + description = "Database user password. If empty, a random password is generated" + type = string + default = "" + sensitive = true +} + variable "personal_ip" { description = "Personal IP to allow connections from" type = string