From 0301165e8a08300c2fe458c7f8884886bd61dfd9 Mon Sep 17 00:00:00 2001 From: "rw-codebundle-agent[bot]" Date: Tue, 14 Apr 2026 14:00:33 +0000 Subject: [PATCH] Add gcp-dms-migration-health CodeBundle for DMS job and CDC lag monitoring. Implements design spec for issue #86: gcloud database-migration list/describe, operations review, Cloud Monitoring replication lag metrics, optional logging correlation, in-repo SLI with 0-1 score, generation rules for gcp_dms_migration_jobs, and minimal .test validation scaffolding. Made-with: Cursor --- .../gcp-dms-migration-health.yaml | 22 ++ .../gcp-dms-migration-health-sli.yaml | 46 +++ .../gcp-dms-migration-health-slx.yaml | 31 ++ .../gcp-dms-migration-health-taskset.yaml | 47 +++ .../.test/Taskfile.yaml | 24 ++ .../.test/validate-bundle-structure.sh | 11 + .../gcp-dms-migration-health/README.md | 58 ++++ .../describe-migration-jobs.sh | 82 +++++ .../fetch-dms-error-logs.sh | 68 ++++ .../fetch-dms-replication-lag-metrics.sh | 131 ++++++++ .../list-dms-operations.sh | 101 ++++++ .../list-migration-jobs.sh | 168 ++++++++++ .../gcp-dms-migration-health/runbook.robot | 293 ++++++++++++++++++ .../sli-dms-health.sh | 75 +++++ .../gcp-dms-migration-health/sli.robot | 85 +++++ 15 files changed, 1242 insertions(+) create mode 100644 codebundles/gcp-dms-migration-health/.runwhen/generation-rules/gcp-dms-migration-health.yaml create mode 100644 codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-sli.yaml create mode 100644 codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-slx.yaml create mode 100644 codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-taskset.yaml create mode 100644 codebundles/gcp-dms-migration-health/.test/Taskfile.yaml create mode 100755 codebundles/gcp-dms-migration-health/.test/validate-bundle-structure.sh create mode 100644 codebundles/gcp-dms-migration-health/README.md create mode 100755 
codebundles/gcp-dms-migration-health/describe-migration-jobs.sh create mode 100755 codebundles/gcp-dms-migration-health/fetch-dms-error-logs.sh create mode 100755 codebundles/gcp-dms-migration-health/fetch-dms-replication-lag-metrics.sh create mode 100755 codebundles/gcp-dms-migration-health/list-dms-operations.sh create mode 100755 codebundles/gcp-dms-migration-health/list-migration-jobs.sh create mode 100644 codebundles/gcp-dms-migration-health/runbook.robot create mode 100755 codebundles/gcp-dms-migration-health/sli-dms-health.sh create mode 100644 codebundles/gcp-dms-migration-health/sli.robot diff --git a/codebundles/gcp-dms-migration-health/.runwhen/generation-rules/gcp-dms-migration-health.yaml b/codebundles/gcp-dms-migration-health/.runwhen/generation-rules/gcp-dms-migration-health.yaml new file mode 100644 index 00000000..1e237f2d --- /dev/null +++ b/codebundles/gcp-dms-migration-health/.runwhen/generation-rules/gcp-dms-migration-health.yaml @@ -0,0 +1,22 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + platform: gcp + generationRules: + - resourceTypes: + - gcp_dms_migration_jobs + matchRules: + - type: pattern + pattern: ".+" + properties: ["project_id", "dms_location"] + mode: substring + slxs: + - baseName: gcp-dms-migration-health + qualifiers: ["project", "dms_location"] + baseTemplateName: gcp-dms-migration-health + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: gcp-dms-migration-health-taskset.yaml diff --git a/codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-sli.yaml b/codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-sli.yaml new file mode 100644 index 00000000..a5cf5042 --- /dev/null +++ b/codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-sli.yaml @@ -0,0 +1,46 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include 
"common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + displayUnitsLong: Health Score + displayUnitsShort: score + locations: + - {{default_location}} + description: Measures DMS migration health from job list state, recent operations, and CDC replication lag (0 unhealthy to 1 healthy). + codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/gcp-dms-migration-health/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 180 + configProvided: + - name: GCP_PROJECT_ID + value: "{{ match_resource.resource.project_id }}" + - name: GCP_DMS_LOCATION + value: "{{ match_resource.resource.dms_location | default(custom.gcp_dms_location | default('us-central1')) }}" + - name: REPLICATION_LAG_SEC_THRESHOLD + value: "{{ custom.replication_lag_sec_threshold | default('300') }}" + secretsProvided: + {% if wb_version %} + {% include "gcp-auth.yaml" ignore missing %} + {% else %} + - name: gcp_credentials + workspaceKey: AUTH DETAILS NOT FOUND + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m diff --git a/codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-slx.yaml b/codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-slx.yaml new file mode 100644 index 00000000..98528293 --- /dev/null +++ b/codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-slx.yaml @@ -0,0 +1,31 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/gcp/cloud_sql/cloud_sql.svg + alias: GCP DMS Migration Health for Project 
{{match_resource.resource.project_id}} + asMeasuredBy: Aggregate of DMS job health, recent operations, and CDC replication lag for the configured project and DMS region. + configProvided: + - name: SLX_PLACEHOLDER + value: SLX_PLACEHOLDER + owners: + - {{workspace.owner_email}} + statement: DMS migration jobs should progress without failures, operations should complete cleanly, and CDC lag should stay within thresholds before cutover. + additionalContext: + {% include "gcp-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + tags: + {% include "gcp-tags.yaml" ignore missing %} + - name: cloud + value: gcp + - name: service + value: database_migration_service + - name: scope + value: project + - name: access + value: read-only diff --git a/codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-taskset.yaml b/codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-taskset.yaml new file mode 100644 index 00000000..add498c3 --- /dev/null +++ b/codebundles/gcp-dms-migration-health/.runwhen/templates/gcp-dms-migration-health-taskset.yaml @@ -0,0 +1,47 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Monitors Google Cloud Database Migration Service jobs for failures, stuck states, operation errors, and CDC replication lag. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/gcp-dms-migration-health/runbook.robot + intervalStrategy: intermezzo + intervalSeconds: 300 + configProvided: + - name: GCP_PROJECT_ID + value: "{{ match_resource.resource.project_id }}" + - name: GCP_DMS_LOCATION + value: "{{ match_resource.resource.dms_location | default(custom.gcp_dms_location | default('us-central1')) }}" + - name: DMS_JOB_NAMES + value: "{{ custom.dms_job_names | default('All') }}" + - name: REPLICATION_LAG_SEC_THRESHOLD + value: "{{ custom.replication_lag_sec_threshold | default('300') }}" + - name: REPLICATION_LAG_BYTES_THRESHOLD + value: "{{ custom.replication_lag_bytes_threshold | default('0') }}" + - name: DMS_STUCK_MINUTES + value: "{{ custom.dms_stuck_minutes | default('120') }}" + - name: DMS_OPERATION_STUCK_MINUTES + value: "{{ custom.dms_operation_stuck_minutes | default('45') }}" + - name: DMS_OPERATION_LIMIT + value: "{{ custom.dms_operation_limit | default('50') }}" + - name: DMS_LOG_LOOKBACK + value: "{{ custom.dms_log_lookback | default('1h') }}" + secretsProvided: + - name: gcp_credentials + workspaceKey: {{custom.gcp_ops_suite_sa}} diff --git a/codebundles/gcp-dms-migration-health/.test/Taskfile.yaml b/codebundles/gcp-dms-migration-health/.test/Taskfile.yaml new file mode 100644 index 00000000..ebbfa2fe --- /dev/null +++ b/codebundles/gcp-dms-migration-health/.test/Taskfile.yaml @@ -0,0 +1,24 @@ +version: "3" + +tasks: + default: + desc: "Validate CodeBundle structure" + cmds: + - task: validate-structure + + validate-structure: + desc: "Verify runbook, SLI, and RunWhen templates exist" + cmds: + - bash validate-bundle-structure.sh + dir: . 
+ silent: false + + clean: + desc: "No cloud resources to tear down for this bundle" + cmds: + - 'echo "No-op: GCP DMS bundle uses client-side validation only."' + + build-infra: + desc: "No Terraform for this bundle; use a real GCP project for integration tests" + cmds: + - 'echo "Skipped: provision DMS test jobs manually in GCP if needed."' diff --git a/codebundles/gcp-dms-migration-health/.test/validate-bundle-structure.sh b/codebundles/gcp-dms-migration-health/.test/validate-bundle-structure.sh new file mode 100755 index 00000000..3f283542 --- /dev/null +++ b/codebundles/gcp-dms-migration-health/.test/validate-bundle-structure.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Validates required CodeBundle files for gcp-dms-migration-health (CI / local). +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +test -f "$ROOT/runbook.robot" +test -f "$ROOT/sli.robot" +test -f "$ROOT/.runwhen/generation-rules/gcp-dms-migration-health.yaml" +test -f "$ROOT/.runwhen/templates/gcp-dms-migration-health-slx.yaml" +test -f "$ROOT/.runwhen/templates/gcp-dms-migration-health-taskset.yaml" +test -f "$ROOT/.runwhen/templates/gcp-dms-migration-health-sli.yaml" +echo "gcp-dms-migration-health bundle structure OK" diff --git a/codebundles/gcp-dms-migration-health/README.md b/codebundles/gcp-dms-migration-health/README.md new file mode 100644 index 00000000..21687eec --- /dev/null +++ b/codebundles/gcp-dms-migration-health/README.md @@ -0,0 +1,58 @@ +# GCP Database Migration Service (DMS) Health + +This CodeBundle monitors Google Cloud Database Migration Service (DMS) migration jobs for failed or stuck states, surfaces recent asynchronous operation failures, and evaluates CDC replication lag using Cloud Monitoring metrics (`migration_job/max_replica_sec_lag` and optionally `migration_job/max_replica_bytes_lag` on resource type `datamigration.googleapis.com/MigrationJob`). It helps confirm migrations are progressing and cutover-ready. 
+ +## Overview + +- **Migration jobs**: Lists jobs with `gcloud database-migration migration-jobs list` using `--region` set from `GCP_DMS_LOCATION` (see [gcloud reference](https://cloud.google.com/sdk/gcloud/reference/database-migration/)). Flags terminal failures, long-lived transitional states, paused/cancelled jobs, and RUNNING jobs that remain outside CDC beyond a time threshold when continuous replication is expected. +- **Operations**: Lists recent DMS operations and raises issues on operation-level errors and long-running incomplete operations. +- **Replication lag**: For jobs in CDC phase, reads Cloud Monitoring time series and compares lag to `REPLICATION_LAG_SEC_THRESHOLD` and optional byte lag. Google documents that samples can appear in Monitoring up to about **180 seconds** after the observation window. +- **Describe**: Runs `gcloud database-migration migration-jobs describe` for jobs you name explicitly or that prior tasks flagged. +- **Logs**: Optionally correlates Cloud Logging entries for `datamigration.googleapis.com` when unhealthy jobs were flagged. + +## Configuration + +### Required Variables + +- `GCP_PROJECT_ID`: GCP project ID that contains the DMS migration jobs. +- `GCP_DMS_LOCATION`: DMS location ID passed to `gcloud database-migration ... --region` (for example `us-central1`). + +### Optional Variables + +- `DMS_JOB_NAMES`: Comma-separated migration job IDs to scope listing and describe logic, or `All` for every job in the region (default: `All`). +- `REPLICATION_LAG_SEC_THRESHOLD`: Seconds; alert when `max_replica_sec_lag` exceeds this value during CDC (default: `300`). +- `REPLICATION_LAG_BYTES_THRESHOLD`: Bytes; set to `0` to disable byte-lag issues (default: `0`). +- `DMS_STUCK_MINUTES`: Minutes a job may remain in a transitional state (or RUNNING outside CDC) before stuck-style issues (default: `120`). +- `DMS_OPERATION_STUCK_MINUTES`: Minutes an incomplete operation may run before it is treated as stuck (default: `45`). 
+- `DMS_OPERATION_LIMIT`: Maximum operations returned by `gcloud database-migration operations list` (default: `50`). +- `DMS_LOG_LOOKBACK`: Freshness window for optional error log correlation (for example `1h` or `30m`) (default: `1h`). + +### Secrets + +- `gcp_credentials`: Service account JSON key. Typical roles include `roles/datamigration.viewer`, `roles/monitoring.viewer`, and `roles/logging.viewer` for list/describe, time series, and log read access. + +## Tasks Overview + +### List DMS Migration Jobs and Flag Unhealthy States for `${GCP_PROJECT_ID}` + +Builds a summary table, writes structured issues for failed/cancelled/paused jobs, stuck transitional states, and delayed progression to CDC, and records flagged job IDs for follow-on tasks. + +### List Recent DMS Operations and Flag Failures for `${GCP_PROJECT_ID}` + +Surfaces failed operations and operations that stay incomplete beyond `DMS_OPERATION_STUCK_MINUTES`, and appends related job IDs to the shared flag list when identifiers appear in operation metadata. + +### Report DMS Replication Lag from Cloud Monitoring for `${GCP_PROJECT_ID}` + +Evaluates CDC jobs only for lag alerting. Skips lag evaluation when no jobs are in CDC (for example during full dump), which is expected per Google guidance. + +### Summarize DMS Migration Job Details for Flagged Jobs in `${GCP_PROJECT_ID}` + +Describes targets from `DMS_JOB_NAMES` when not `All`, otherwise describes jobs accumulated in the flag file from earlier tasks. + +### Optional Error Log Correlation for DMS in `${GCP_PROJECT_ID}` + +Runs a bounded Cloud Logging query when the flag file is non-empty; otherwise no-ops. + +## SLI + +`sli.robot` publishes a 0–1 score as the mean of binary dimensions: healthy job list (no FAILED/CANCELLED), operations without errors, and replication lag under threshold when CDC jobs exist. 
diff --git a/codebundles/gcp-dms-migration-health/describe-migration-jobs.sh b/codebundles/gcp-dms-migration-health/describe-migration-jobs.sh new file mode 100755 index 00000000..f783a03f --- /dev/null +++ b/codebundles/gcp-dms-migration-health/describe-migration-jobs.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Describes selected migration jobs for diagnostics. Targets come from DMS_JOB_NAMES +# or dms_flagged_jobs.txt (when DMS_JOB_NAMES is All). +# Output: describe_migration_jobs_issues.json, human summary on stdout +# ----------------------------------------------------------------------------- + +: "${GCP_PROJECT_ID:?Must set GCP_PROJECT_ID}" +: "${GCP_DMS_LOCATION:?Must set GCP_DMS_LOCATION}" + +OUTPUT_FILE="describe_migration_jobs_issues.json" +FLAG_FILE="dms_flagged_jobs.txt" +DMS_JOB_NAMES="${DMS_JOB_NAMES:-All}" + +issues_json='[]' + +gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}" + +declare -a TARGETS=() + +if [ "${DMS_JOB_NAMES}" != "All" ]; then + IFS=',' read -ra parts <<<"${DMS_JOB_NAMES}" + for p in "${parts[@]}"; do + p="${p#"${p%%[![:space:]]*}"}" + p="${p%"${p##*[![:space:]]}"}" + [ -n "$p" ] && TARGETS+=("$p") + done +else + if [ -f "$FLAG_FILE" ]; then + while IFS= read -r line; do + line="${line#"${line%%[![:space:]]*}"}" + line="${line%"${line##*[![:space:]]}"}" + [ -z "$line" ] && continue + TARGETS+=("$line") + done < <(sort -u "$FLAG_FILE") + fi +fi + +if [ ${#TARGETS[@]} -eq 0 ]; then + echo "No migration job IDs to describe (set DMS_JOB_NAMES or run prior health tasks to populate ${FLAG_FILE})." + echo '[]' >"$OUTPUT_FILE" + exit 0 +fi + +summary="=== DMS migration job describe (${GCP_DMS_LOCATION}) ==="$'\n' + +for jid in "${TARGETS[@]}"; do + if ! 
desc=$(gcloud database-migration migration-jobs describe "${jid}" \ + --project="${GCP_PROJECT_ID}" \ + --region="${GCP_DMS_LOCATION}" \ + --format=json 2>err.log); then + err_msg=$(cat err.log || true) + rm -f err.log + issues_json=$(echo "$issues_json" | jq \ + --arg title "Cannot describe DMS migration job \`${jid}\`" \ + --arg details "describe failed: ${err_msg}" \ + --arg severity "3" \ + --arg next_steps "Verify job ID, region, and IAM datamigration.migrationJobs.get." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + continue + fi + rm -f err.log + + summary+=$'---\n'"${jid}"$'\n' + summary+=$(echo "$desc" | jq -r '"state: \(.state // "n/a") phase: \(.phase // "n/a")"' 2>/dev/null || echo "$desc")$'\n' + + err_block=$(echo "$desc" | jq -c '.error // empty' 2>/dev/null || echo "{}") + if [ "$err_block" != "{}" ] && [ -n "$err_block" ] && [ "$err_block" != "null" ]; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "DMS migration job \`${jid}\` describe shows an error block" \ + --arg details "$(echo "$desc" | jq -c .)" \ + --arg severity "4" \ + --arg next_steps "Resolve the reported error: check connectivity, credentials, and engine-specific prerequisites." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + fi +done + +echo "$issues_json" >"$OUTPUT_FILE" +echo "${summary}" +echo "Wrote ${OUTPUT_FILE}" diff --git a/codebundles/gcp-dms-migration-health/fetch-dms-error-logs.sh b/codebundles/gcp-dms-migration-health/fetch-dms-error-logs.sh new file mode 100755 index 00000000..2c8dd516 --- /dev/null +++ b/codebundles/gcp-dms-migration-health/fetch-dms-error-logs.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Bounded Cloud Logging query for DMS / datamigration when unhealthy jobs exist. 
+# No-op when dms_flagged_jobs.txt is empty. +# Env: GCP_PROJECT_ID, GCP_DMS_LOCATION, DMS_LOG_LOOKBACK +# Output: fetch_dms_error_logs_issues.json (usually empty; issues if critical errors found) +# ----------------------------------------------------------------------------- + +: "${GCP_PROJECT_ID:?Must set GCP_PROJECT_ID}" +: "${GCP_DMS_LOCATION:?Must set GCP_DMS_LOCATION}" + +OUTPUT_FILE="fetch_dms_error_logs_issues.json" +FLAG_FILE="dms_flagged_jobs.txt" +DMS_LOG_LOOKBACK="${DMS_LOG_LOOKBACK:-1h}" + +issues_json='[]' + +gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}" + +if [ ! -s "$FLAG_FILE" ]; then + echo "No flagged DMS jobs; skipping error log correlation (healthy or not yet evaluated)." + echo '[]' >"$OUTPUT_FILE" + exit 0 +fi + +# Broad DMS-related errors in project (read-only) +filter='(protoPayload.serviceName="datamigration.googleapis.com" OR resource.type="datamigration.googleapis.com/MigrationJob") AND severity>=ERROR' + +if ! logs_out=$(gcloud logging read "${filter}" \ + --project="${GCP_PROJECT_ID}" \ + --freshness="${DMS_LOG_LOOKBACK}" \ + --limit=50 \ + --format=json 2>err.log); then + err_msg=$(cat err.log || true) + rm -f err.log + issues_json=$(echo "$issues_json" | jq \ + --arg title "Cannot query Cloud Logging for DMS errors" \ + --arg details "gcloud logging read failed: ${err_msg}" \ + --arg severity "2" \ + --arg next_steps "Grant logging.logEntries.list (roles/logging.viewer) and retry." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi +rm -f err.log + +count=$(echo "$logs_out" | jq 'length' 2>/dev/null || echo "0") +if [ "${count}" -gt 0 ] 2>/dev/null; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Recent DMS-related ERROR logs found in \`${GCP_PROJECT_ID}\`" \ + --arg details "Count=${count} (lookback ${DMS_LOG_LOOKBACK}). 
Sample entries: $(echo "$logs_out" | jq -c '.[0:3]')" \ + --arg severity "2" \ + --arg next_steps "Triage entries below; correlate with flagged migration jobs and operations." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') +fi + +echo "$issues_json" >"$OUTPUT_FILE" + +echo "=== DMS-related error logs (freshness ${DMS_LOG_LOOKBACK}, limit 50) ===" +gcloud logging read "${filter}" \ + --project="${GCP_PROJECT_ID}" \ + --freshness="${DMS_LOG_LOOKBACK}" \ + --limit=20 \ + --format="table[box](timestamp,severity,logName)" || true + +echo "Wrote ${OUTPUT_FILE}" diff --git a/codebundles/gcp-dms-migration-health/fetch-dms-replication-lag-metrics.sh b/codebundles/gcp-dms-migration-health/fetch-dms-replication-lag-metrics.sh new file mode 100755 index 00000000..f351799d --- /dev/null +++ b/codebundles/gcp-dms-migration-health/fetch-dms-replication-lag-metrics.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Reads migration_job/max_replica_sec_lag and optionally max_replica_bytes_lag from Monitoring. +# Only evaluates jobs in CDC phase when present in migration_jobs_list.json. +# Env: GCP_PROJECT_ID, GCP_DMS_LOCATION, REPLICATION_LAG_SEC_THRESHOLD, REPLICATION_LAG_BYTES_THRESHOLD +# Output: fetch_dms_replication_lag_issues.json, appends lag-hot jobs to dms_flagged_jobs.txt +# Note: Cloud Monitoring samples may lag observation by up to ~180s per Google documentation. 
+# ----------------------------------------------------------------------------- + +: "${GCP_PROJECT_ID:?Must set GCP_PROJECT_ID}" +: "${GCP_DMS_LOCATION:?Must set GCP_DMS_LOCATION}" + +OUTPUT_FILE="fetch_dms_replication_lag_issues.json" +JOBS_FILE="migration_jobs_list.json" +FLAG_FILE="dms_flagged_jobs.txt" + +REPLICATION_LAG_SEC_THRESHOLD="${REPLICATION_LAG_SEC_THRESHOLD:-300}" +REPLICATION_LAG_BYTES_THRESHOLD="${REPLICATION_LAG_BYTES_THRESHOLD:-0}" + +issues_json='[]' +touch "$FLAG_FILE" + +append_flag() { + local id="$1" + grep -qxF "$id" "$FLAG_FILE" 2>/dev/null || echo "$id" >>"$FLAG_FILE" +} + +gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}" + +if [ ! -f "$JOBS_FILE" ]; then + echo "[]" >"$JOBS_FILE" +fi + +# Job IDs in CDC only (lag metrics meaningful; exclude dump / non-CDC phases) +cdc_jobs=$(jq -r ' + [.[]? | select(.state == "RUNNING") | select((.phase // "") == "CDC")] + | map(.name | split("/") | .[-1]) + | unique | .[] +' "$JOBS_FILE" 2>/dev/null || true) + +if [ -z "$(echo "$cdc_jobs" | tr -d '[:space:]')" ]; then + echo "No RUNNING jobs in CDC phase; skipping replication lag checks (normal during full dump / non-CDC work)." + echo '[]' >"$OUTPUT_FILE" + exit 0 +fi + +END=$(date -u +%Y-%m-%dT%H:%M:%SZ) +START=$(date -u -d '2 hours ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v-2H +%Y-%m-%dT%H:%M:%SZ) + +if ! 
sec_series=$(gcloud monitoring time-series list \ + --project="${GCP_PROJECT_ID}" \ + --filter="metric.type=\"datamigration.googleapis.com/migration_job/max_replica_sec_lag\" AND resource.labels.location=\"${GCP_DMS_LOCATION}\"" \ + --interval-start-time="${START}" \ + --interval-end-time="${END}" \ + --format=json 2>err.log); then + err_msg=$(cat err.log || true) + rm -f err.log + issues_json=$(echo "$issues_json" | jq \ + --arg title "Cannot read DMS replication lag (seconds) from Cloud Monitoring" \ + --arg details "gcloud monitoring time-series list failed: ${err_msg}" \ + --arg severity "3" \ + --arg next_steps "Grant monitoring.timeSeries.list (roles/monitoring.viewer) and confirm metric types for your engine." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi +rm -f err.log + +# Parse latest point per migration_job_id label +while IFS= read -r row; do + [ -z "$row" ] && continue + jid=$(echo "$row" | jq -r '.resource.labels.migration_job_id // empty') + [ -z "$jid" ] && continue + # Only evaluate jobs we care about (subset of project jobs) + if ! echo "$cdc_jobs" | grep -qxF "$jid" 2>/dev/null; then + continue + fi + val=$(echo "$row" | jq -r ' + [ .points[]? | .value.doubleValue // .value.int64Value // empty ] | last // empty + ') + if [ -z "$val" ] || [ "$val" = "null" ]; then + continue + fi + # Compare — use awk for float + if awk -v v="$val" -v t="$REPLICATION_LAG_SEC_THRESHOLD" 'BEGIN{exit !(v>t)}'; then + append_flag "$jid" + issues_json=$(echo "$issues_json" | jq \ + --arg title "High DMS replication lag (seconds) for job \`${jid}\`" \ + --arg details "max_replica_sec_lag=${val}s (threshold ${REPLICATION_LAG_SEC_THRESHOLD}s). Monitoring samples may trail real time by up to ~180s." \ + --arg severity "3" \ + --arg next_steps "Before cutover, reduce lag; check source load, network, and DMS CDC health. 
See migration job metrics documentation." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + fi +done < <(echo "$sec_series" | jq -c '.[]') + +if [ "${REPLICATION_LAG_BYTES_THRESHOLD}" != "0" ] && [ -n "${REPLICATION_LAG_BYTES_THRESHOLD}" ]; then + if byte_series=$(gcloud monitoring time-series list \ + --project="${GCP_PROJECT_ID}" \ + --filter="metric.type=\"datamigration.googleapis.com/migration_job/max_replica_bytes_lag\" AND resource.labels.location=\"${GCP_DMS_LOCATION}\"" \ + --interval-start-time="${START}" \ + --interval-end-time="${END}" \ + --format=json 2>/dev/null); then + while IFS= read -r row; do + [ -z "$row" ] && continue + jid=$(echo "$row" | jq -r '.resource.labels.migration_job_id // empty') + [ -z "$jid" ] && continue + if ! echo "$cdc_jobs" | grep -qxF "$jid" 2>/dev/null; then + continue + fi + val=$(echo "$row" | jq -r '[ .points[]? | .value.doubleValue // .value.int64Value // empty ] | last // empty') + if [ -z "$val" ]; then + continue + fi + if awk -v v="$val" -v t="$REPLICATION_LAG_BYTES_THRESHOLD" 'BEGIN{exit !(v>t)}'; then + append_flag "$jid" + issues_json=$(echo "$issues_json" | jq \ + --arg title "High DMS replication lag (bytes) for job \`${jid}\`" \ + --arg details "max_replica_bytes_lag=${val} (threshold ${REPLICATION_LAG_BYTES_THRESHOLD})." \ + --arg severity "2" \ + --arg next_steps "Investigate backlog size and destination apply rate before promotion." \ + '. 
= [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
+    fi
+  done < <(echo "$byte_series" | jq -c '.[]')
+  fi
+fi
+
+echo "$issues_json" >"$OUTPUT_FILE"
+echo "=== Replication lag check complete (sec threshold=${REPLICATION_LAG_SEC_THRESHOLD}) ==="
+echo "Wrote ${OUTPUT_FILE}"
diff --git a/codebundles/gcp-dms-migration-health/list-dms-operations.sh b/codebundles/gcp-dms-migration-health/list-dms-operations.sh
new file mode 100755
index 00000000..63b65e21
--- /dev/null
+++ b/codebundles/gcp-dms-migration-health/list-dms-operations.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# Lists recent DMS operations; flags failures, cancellations, and long-running ops.
+# Env: GCP_PROJECT_ID, GCP_DMS_LOCATION, DMS_OPERATION_STUCK_MINUTES, DMS_OPERATION_LIMIT
+# Appends related job IDs to dms_flagged_jobs.txt
+# Output: list_dms_operations_issues.json
+# -----------------------------------------------------------------------------
+
+: "${GCP_PROJECT_ID:?Must set GCP_PROJECT_ID}"
+: "${GCP_DMS_LOCATION:?Must set GCP_DMS_LOCATION}"
+
+OUTPUT_FILE="list_dms_operations_issues.json"
+FLAG_FILE="dms_flagged_jobs.txt"
+DMS_OPERATION_STUCK_MINUTES="${DMS_OPERATION_STUCK_MINUTES:-45}"
+DMS_OPERATION_LIMIT="${DMS_OPERATION_LIMIT:-50}"
+
+issues_json='[]'
+touch "$FLAG_FILE"
+
+# Record a migration job ID for later describe/triage tasks; skips empty/null ids
+# and de-duplicates against lines already in the flag file.
+append_flag() {
+  local id="$1"
+  [ -z "$id" ] || [ "$id" = "null" ] && return
+  grep -qxF "$id" "$FLAG_FILE" 2>/dev/null || echo "$id" >>"$FLAG_FILE"
+}
+
+# Pull the first migrationJobs/<id> reference out of an operation payload, if any.
+extract_job_from_metadata() {
+  echo "$1" | jq -c . 2>/dev/null | grep -oE 'migrationJobs/[a-zA-Z0-9][a-zA-Z0-9_-]*' | head -1 | cut -d/ -f2 || true
+}
+
+gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
+
+if !
+# Env: GCP_PROJECT_ID, GCP_DMS_LOCATION, DMS_JOB_NAMES, DMS_STUCK_MINUTES
+# Outputs: list_migration_jobs_issues.json, migration_jobs_list.json, dms_flagged_jobs.txt
+# -----------------------------------------------------------------------------
+
+: "${GCP_PROJECT_ID:?Must set GCP_PROJECT_ID}"
+: "${GCP_DMS_LOCATION:?Must set GCP_DMS_LOCATION}"
+
+OUTPUT_FILE="list_migration_jobs_issues.json"
+JOBS_FILE="migration_jobs_list.json"
+FLAG_FILE="dms_flagged_jobs.txt"
+
+DMS_JOB_NAMES="${DMS_JOB_NAMES:-All}"
+DMS_STUCK_MINUTES="${DMS_STUCK_MINUTES:-120}"
+
+issues_json='[]'
+
+auth_gcloud() {
+  gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
+}
+
+# Strip the resource path prefix, leaving the short job ID.
+short_name() {
+  local full="$1"
+  echo "${full##*/}"
+}
+
+# Convert an RFC3339 timestamp to epoch seconds; empty output on failure.
+iso_to_epoch() {
+  local iso="$1"
+  if [ -z "$iso" ] || [ "$iso" = "null" ]; then
+    echo ""
+    return
+  fi
+  date -d "$iso" +%s 2>/dev/null || date -d "${iso/Z/+0000}" +%s 2>/dev/null || echo ""
+}
+
+# De-duplicating flag-file append; guards empty/null ids (matches list-dms-operations.sh).
+append_flag() {
+  local id="$1"; [ -n "$id" ] && [ "$id" != "null" ] || return 0
+  grep -qxF "$id" "$FLAG_FILE" 2>/dev/null || echo "$id" >>"$FLAG_FILE"
+}
+
+rm -f "$FLAG_FILE"
+touch "$FLAG_FILE"
+
+if ! auth_gcloud; then
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Cannot authenticate to GCP for DMS list" \
+    --arg details "gcloud auth activate-service-account failed. Verify gcp_credentials secret." \
+    --arg severity "4" \
+    --arg next_steps "Confirm the service account JSON is valid and has datamigration.viewer (or equivalent)." \
+    '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
+  echo "$issues_json" >"$OUTPUT_FILE"
+  echo '[]' >"$JOBS_FILE"
+  exit 0
+fi
+
+if ! jobs_raw=$(gcloud database-migration migration-jobs list \
+  --project="${GCP_PROJECT_ID}" \
+  --region="${GCP_DMS_LOCATION}" \
+  --format=json 2>err.log); then
+  err_msg=$(cat err.log || true)
+  rm -f err.log
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Cannot list DMS migration jobs in \`${GCP_PROJECT_ID}\`" \
+    --arg details "gcloud database-migration migration-jobs list failed: ${err_msg}" \
+    --arg severity "4" \
+    --arg next_steps "Verify Database Migration API is enabled, region is correct, and IAM allows datamigration.migrationJobs.list." \
+    '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
+  echo "$issues_json" >"$OUTPUT_FILE"
+  echo '[]' >"$JOBS_FILE"
+  exit 0
+fi
+rm -f err.log
+
+echo "$jobs_raw" >"$JOBS_FILE"
+
+# Filter by DMS_JOB_NAMES when not All (comma-separated job IDs)
+if [ "${DMS_JOB_NAMES}" != "All" ]; then
+  jobs_filtered=$(echo "$jobs_raw" | jq -c --arg csv "${DMS_JOB_NAMES}" '
+    ($csv | split(",") | map(gsub("^ +| +$";""))) as $want |
+    [ .[] | select(.name != null)
+      | select((.name | split("/") | .[-1]) as $id | ($want | index($id) != null)) ]
+  ')
+else
+  jobs_filtered="$jobs_raw"
+fi
+
+now_epoch=$(date -u +%s)
+stuck_sec=$((DMS_STUCK_MINUTES * 60))
+
+while IFS= read -r job_json; do
+  [ -z "$job_json" ] && continue
+  state=$(echo "$job_json" | jq -r '.state // "UNKNOWN"')
+  phase=$(echo "$job_json" | jq -r '.phase // empty')
+  full_name=$(echo "$job_json" | jq -r '.name // ""')
+  jid=$(short_name "$full_name")
+  ut=$(echo "$job_json" | jq -r '.updateTime // .createTime // empty')
+  ue=$(iso_to_epoch "$ut")
+
+  case "$state" in
+    FAILED)
+      append_flag "$jid"
+      issues_json=$(echo "$issues_json" | jq \
+        --arg title "DMS migration job \`${jid}\` is FAILED" \
+        --arg details "$(echo "$job_json" | jq -c .)" \
+        --arg severity "4" \
+        --arg next_steps "Run describe on the job, review Cloud Logging for datamigration.googleapis.com, and follow DMS troubleshooting for your engine." \
+        '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
+      ;;
+    PAUSED)
+      append_flag "$jid"
+      issues_json=$(echo "$issues_json" | jq \
+        --arg title "DMS migration job \`${jid}\` is PAUSED" \
+        --arg details "$(echo "$job_json" | jq -c .)" \
+        --arg severity "2" \
+        --arg next_steps "Confirm whether pause is intentional; if not, resume or investigate blocking errors in the job details." \
+        '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
+      ;;
+    CANCELLED)
+      append_flag "$jid"
+      issues_json=$(echo "$issues_json" | jq \
+        --arg title "DMS migration job \`${jid}\` is CANCELLED" \
+        --arg details "$(echo "$job_json" | jq -c .)" \
+        --arg severity "3" \
+        --arg next_steps "If cancellation was unexpected, create a new migration job or restore from backup per your runbook." \
+        '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
+      ;;
+  esac
+
+  # Stuck in transitional states
+  if [[ "$state" =~ ^(CREATING|UPDATING|DELETING|STARTING|RESTARTING|VERIFYING)$ ]]; then
+    if [ -n "$ue" ] && [ $((now_epoch - ue)) -gt "$stuck_sec" ]; then
+      append_flag "$jid"
+      issues_json=$(echo "$issues_json" | jq \
+        --arg title "DMS migration job \`${jid}\` may be stuck in ${state}" \
+        --arg details "State=${state}, last update ${ut}, threshold ${DMS_STUCK_MINUTES}m. Raw: $(echo "$job_json" | jq -c .)" \
+        --arg severity "3" \
+        --arg next_steps "Inspect operations for this job, check VPC connectivity and IAM, and open a support case if the state does not progress." \
+        '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
+    fi
+  fi
+
+  # Continuous replication expectation: RUNNING should eventually reach CDC for homogeneous PG/MySQL continuous jobs
+  if [ "$state" = "RUNNING" ] && [ -n "$phase" ] && [ "$phase" != "CDC" ] && [ "$phase" != "PHASE_UNSPECIFIED" ]; then
+    if [ -n "$ue" ] && [ $((now_epoch - ue)) -gt "$stuck_sec" ]; then
+      append_flag "$jid"
+      issues_json=$(echo "$issues_json" | jq \
+        --arg title "DMS migration job \`${jid}\` RUNNING but not in CDC phase (${phase})" \
+        --arg details "Job remains in phase ${phase} beyond ${DMS_STUCK_MINUTES}m since last update. $(echo "$job_json" | jq -c .)" \
+        --arg severity "2" \
+        --arg next_steps "If you require CDC / cutover readiness, wait for CDC or investigate errors in job details and logs." \
+        '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
+    fi
+  fi
+done < <(echo "$jobs_filtered" | jq -c '.[]')
+
+echo "$issues_json" >"$OUTPUT_FILE"
+
+echo "=== DMS migration jobs (${GCP_DMS_LOCATION}) ==="
+gcloud database-migration migration-jobs list \
+  --project="${GCP_PROJECT_ID}" \
+  --region="${GCP_DMS_LOCATION}" \
+  --format="table[box](name,state,phase,updateTime)" || true
+
+echo "Wrote ${OUTPUT_FILE}, ${JOBS_FILE}, ${FLAG_FILE}"
diff --git a/codebundles/gcp-dms-migration-health/runbook.robot b/codebundles/gcp-dms-migration-health/runbook.robot
new file mode 100644
index 00000000..9cef9ec9
--- /dev/null
+++ b/codebundles/gcp-dms-migration-health/runbook.robot
@@ -0,0 +1,293 @@
+*** Settings ***
+Documentation       Monitors Google Cloud Database Migration Service migration jobs for failed or stuck states, operation failures, and CDC replication lag using gcloud and Cloud Monitoring.
+Metadata            Author    rw-codebundle-agent
+Metadata            Display Name    GCP Database Migration Service (DMS) Health
+Metadata            Supports    GCP DMS Database Migration Replication CDC
+
+Library             BuiltIn
+Library             String
+Library             RW.Core
+Library             RW.CLI
+Library             RW.platform
+
+Force Tags          GCP    DMS    Migration    Health
+
+Suite Setup         Suite Initialization
+
+
+*** Tasks ***
+List DMS Migration Jobs and Flag Unhealthy States for `${GCP_PROJECT_ID}`
+    [Documentation]    Lists migration jobs in the DMS region and raises issues for failed, paused, cancelled, stuck transitional states, or RUNNING jobs not yet in CDC beyond the stuck threshold.
+    [Tags]    GCP    DMS    migration-jobs    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=list-migration-jobs.sh
+    ...    env=${env}
+    ...    secret_file__gcp_credentials=${gcp_credentials}
+    ...    timeout_seconds=240
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./list-migration-jobs.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat list_migration_jobs_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for migration job list issues; defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    # Use $issue_list (object access) in the expression; @{issue_list} expands list
+    # items into the expression text and fails on empty lists (len() with no args).
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=DMS migration jobs should not be failed, stuck indefinitely, or blocked before CDC when continuous replication is required.
+            ...    actual=One or more migration jobs in `${GCP_DMS_LOCATION}` need attention.
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    DMS migration job analysis (region `${GCP_DMS_LOCATION}`):
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+List Recent DMS Operations and Flag Failures for `${GCP_PROJECT_ID}`
+    [Documentation]    Lists recent DMS operations in the region and surfaces operation errors and long-running incomplete operations.
+    [Tags]    GCP    DMS    operations    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=list-dms-operations.sh
+    ...    env=${env}
+    ...    secret_file__gcp_credentials=${gcp_credentials}
+    ...    timeout_seconds=240
+    ...    show_in_rwl_cheatsheet=false
+    ...    cmd_override=./list-dms-operations.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat list_dms_operations_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for DMS operations issues; defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=DMS operations should complete without errors and should not remain pending indefinitely.
+            ...    actual=An operation in `${GCP_DMS_LOCATION}` failed or appears stuck.
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    DMS operations listing:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Report DMS Replication Lag from Cloud Monitoring for `${GCP_PROJECT_ID}`
+    [Documentation]    Reads Cloud Monitoring metrics for CDC migration jobs and flags replication lag above configured thresholds (samples may trail by up to ~180s).
+    [Tags]    GCP    DMS    monitoring    CDC    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=fetch-dms-replication-lag-metrics.sh
+    ...    env=${env}
+    ...    secret_file__gcp_credentials=${gcp_credentials}
+    ...    timeout_seconds=240
+    ...    show_in_rwl_cheatsheet=false
+    ...    cmd_override=./fetch-dms-replication-lag-metrics.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat fetch_dms_replication_lag_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for replication lag issues; defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=CDC replication lag should stay below configured thresholds before cutover.
+            ...    actual=Replication lag metrics indicate the destination is too far behind the source for at least one job.
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    DMS replication lag (Monitoring):
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Summarize DMS Migration Job Details for Flagged Jobs in `${GCP_PROJECT_ID}`
+    [Documentation]    Describes migration jobs selected via DMS_JOB_NAMES or jobs flagged by earlier tasks to capture phase, errors, and configuration context.
+    [Tags]    GCP    DMS    describe    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=describe-migration-jobs.sh
+    ...    env=${env}
+    ...    secret_file__gcp_credentials=${gcp_credentials}
+    ...    timeout_seconds=240
+    ...    show_in_rwl_cheatsheet=false
+    ...    cmd_override=./describe-migration-jobs.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat describe_migration_jobs_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for describe issues; defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=DMS migration job describe output should not contain unresolved error payloads for healthy jobs.
+            ...    actual=Describe output shows error details for a migration job in `${GCP_DMS_LOCATION}`.
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    DMS migration job describe summary:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Optional Error Log Correlation for DMS in `${GCP_PROJECT_ID}`
+    [Documentation]    When unhealthy jobs were flagged, queries Cloud Logging for recent DMS-related error entries to speed up triage.
+    [Tags]    GCP    DMS    logging    access:read-only    data:logs-regexp
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=fetch-dms-error-logs.sh
+    ...    env=${env}
+    ...    secret_file__gcp_credentials=${gcp_credentials}
+    ...    timeout_seconds=180
+    ...    show_in_rwl_cheatsheet=false
+    ...    cmd_override=./fetch-dms-error-logs.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat fetch_dms_error_logs_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for DMS error log issues; defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=No recent ERROR-level DMS logs when migrations are healthy.
+            ...    actual=Recent DMS-related error log entries were found in the project.
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    DMS error log correlation:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+
+*** Keywords ***
+Suite Initialization
+    ${gcp_credentials}=    RW.Core.Import Secret    gcp_credentials
+    ...    type=string
+    ...    description=GCP service account JSON with read-only access to DMS, Monitoring, and Logging.
+    ...    pattern=\w*
+    ${GCP_PROJECT_ID}=    RW.Core.Import User Variable    GCP_PROJECT_ID
+    ...    type=string
+    ...    description=GCP project ID that owns the DMS migration jobs.
+    ...    pattern=\w*
+    ${GCP_DMS_LOCATION}=    RW.Core.Import User Variable    GCP_DMS_LOCATION
+    ...    type=string
+    ...    description=DMS regional location passed to gcloud --region (for example us-central1).
+    ...    pattern=\w*
+    ${DMS_JOB_NAMES}=    RW.Core.Import User Variable    DMS_JOB_NAMES
+    ...    type=string
+    ...    description=Comma-separated migration job IDs, or All to evaluate every job in the location.
+    ...    pattern=.*
+    ...    default=All
+    ${REPLICATION_LAG_SEC_THRESHOLD}=    RW.Core.Import User Variable    REPLICATION_LAG_SEC_THRESHOLD
+    ...    type=string
+    ...    description=Alert when migration_job/max_replica_sec_lag exceeds this many seconds during CDC.
+    ...    pattern=^\d+$
+    ...    default=300
+    ${REPLICATION_LAG_BYTES_THRESHOLD}=    RW.Core.Import User Variable    REPLICATION_LAG_BYTES_THRESHOLD
+    ...    type=string
+    ...    description=Optional byte lag threshold; set 0 to disable bytes lag issues.
+    ...    pattern=^\d+$
+    ...    default=0
+    ${DMS_STUCK_MINUTES}=    RW.Core.Import User Variable    DMS_STUCK_MINUTES
+    ...    type=string
+    ...    description=Minutes in a transitional or non-CDC RUNNING phase before raising a stuck warning.
+    ...    pattern=^\d+$
+    ...    default=120
+    ${DMS_OPERATION_STUCK_MINUTES}=    RW.Core.Import User Variable    DMS_OPERATION_STUCK_MINUTES
+    ...    type=string
+    ...    description=Minutes an incomplete DMS operation may run before it is treated as stuck.
+    ...    pattern=^\d+$
+    ...    default=45
+    ${DMS_OPERATION_LIMIT}=    RW.Core.Import User Variable    DMS_OPERATION_LIMIT
+    ...    type=string
+    ...    description=Maximum operations returned by gcloud database-migration operations list.
+    ...    pattern=^\d+$
+    ...    default=50
+    ${DMS_LOG_LOOKBACK}=    RW.Core.Import User Variable    DMS_LOG_LOOKBACK
+    ...    type=string
+    ...    description=Logging freshness window for optional DMS error correlation (for example 1h or 30m).
+    ...    pattern=\w+
+    ...    default=1h
+    ${PATH_VAL}=    Get Environment Variable    PATH
+    Set Suite Variable    ${GCP_PROJECT_ID}    ${GCP_PROJECT_ID}
+    Set Suite Variable    ${GCP_DMS_LOCATION}    ${GCP_DMS_LOCATION}
+    Set Suite Variable    ${DMS_JOB_NAMES}    ${DMS_JOB_NAMES}
+    Set Suite Variable    ${REPLICATION_LAG_SEC_THRESHOLD}    ${REPLICATION_LAG_SEC_THRESHOLD}
+    Set Suite Variable    ${REPLICATION_LAG_BYTES_THRESHOLD}    ${REPLICATION_LAG_BYTES_THRESHOLD}
+    Set Suite Variable    ${DMS_STUCK_MINUTES}    ${DMS_STUCK_MINUTES}
+    Set Suite Variable    ${DMS_OPERATION_STUCK_MINUTES}    ${DMS_OPERATION_STUCK_MINUTES}
+    Set Suite Variable    ${DMS_OPERATION_LIMIT}    ${DMS_OPERATION_LIMIT}
+    Set Suite Variable    ${DMS_LOG_LOOKBACK}    ${DMS_LOG_LOOKBACK}
+    Set Suite Variable    ${gcp_credentials}    ${gcp_credentials}
+    ${env}=    Create Dictionary
+    ...    GCP_PROJECT_ID=${GCP_PROJECT_ID}
+    ...    GCP_DMS_LOCATION=${GCP_DMS_LOCATION}
+    ...    DMS_JOB_NAMES=${DMS_JOB_NAMES}
+    ...    REPLICATION_LAG_SEC_THRESHOLD=${REPLICATION_LAG_SEC_THRESHOLD}
+    ...    REPLICATION_LAG_BYTES_THRESHOLD=${REPLICATION_LAG_BYTES_THRESHOLD}
+    ...    DMS_STUCK_MINUTES=${DMS_STUCK_MINUTES}
+    ...    DMS_OPERATION_STUCK_MINUTES=${DMS_OPERATION_STUCK_MINUTES}
+    ...    DMS_OPERATION_LIMIT=${DMS_OPERATION_LIMIT}
+    ...    DMS_LOG_LOOKBACK=${DMS_LOG_LOOKBACK}
+    ...    CLOUDSDK_CORE_PROJECT=${GCP_PROJECT_ID}
+    ...    GOOGLE_APPLICATION_CREDENTIALS=./${gcp_credentials.key}
+    ...    PATH=${PATH_VAL}
+    Set Suite Variable    ${env}    ${env}
diff --git a/codebundles/gcp-dms-migration-health/sli-dms-health.sh b/codebundles/gcp-dms-migration-health/sli-dms-health.sh
new file mode 100755
index 00000000..e8e66d1d
--- /dev/null
+++ b/codebundles/gcp-dms-migration-health/sli-dms-health.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# -----------------------------------------------------------------------------
+# Lightweight SLI: writes sli_dms_scores.json with binary sub-scores (0/1) for aggregation.
+# Uses gcloud database-migration list + monitoring lag sample.
+# -----------------------------------------------------------------------------
+
+: "${GCP_PROJECT_ID:?Must set GCP_PROJECT_ID}"
+: "${GCP_DMS_LOCATION:?Must set GCP_DMS_LOCATION}"
+
+OUT="sli_dms_scores.json"
+REPLICATION_LAG_SEC_THRESHOLD="${REPLICATION_LAG_SEC_THRESHOLD:-300}"
+
+gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}" >/dev/null 2>&1
+
+job_score=1
+ops_score=1
+lag_score=1
+
+if ! jobs_raw=$(gcloud database-migration migration-jobs list \
+  --project="${GCP_PROJECT_ID}" \
+  --region="${GCP_DMS_LOCATION}" \
+  --format=json 2>/dev/null); then
+  jq -n '{job_score:0, ops_score:0, lag_score:0, error:"list_failed"}' >"$OUT"
+  exit 0
+fi
+
+bad=$(echo "$jobs_raw" | jq '[.[] | select(.state == "FAILED" or .state == "CANCELLED")] | length')
+if [ "${bad:-0}" -gt 0 ] 2>/dev/null; then
+  job_score=0
+fi
+
+if !
ops_raw=$(gcloud database-migration operations list \
+  --project="${GCP_PROJECT_ID}" \
+  --region="${GCP_DMS_LOCATION}" \
+  --limit=30 \
+  --format=json 2>/dev/null); then
+  ops_score=0  # cannot read operations -> treat as unhealthy
+else
+  op_err=$(echo "$ops_raw" | jq '[.[] | select(.error != null and (.error | type) == "object" and (.error | length) > 0)] | length')
+  if [ "${op_err:-0}" -gt 0 ] 2>/dev/null; then
+    ops_score=0  # at least one recent operation carries a non-empty error object
+  fi
+fi
+
+cdc=$(echo "$jobs_raw" | jq '[.[] | select(.state == "RUNNING") | select((.phase // "") == "CDC")] | length')
+if [ "${cdc:-0}" -eq 0 ] 2>/dev/null; then
+  lag_score=1  # no RUNNING jobs in CDC phase -> lag dimension not applicable
+else
+  END=$(date -u +%Y-%m-%dT%H:%M:%SZ)
+  START=$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v-1H +%Y-%m-%dT%H:%M:%SZ)
+  if ! sec_series=$(gcloud monitoring time-series list \
+    --project="${GCP_PROJECT_ID}" \
+    --filter="metric.type=\"datamigration.googleapis.com/migration_job/max_replica_sec_lag\" AND resource.labels.location=\"${GCP_DMS_LOCATION}\"" \
+    --interval-start-time="${START}" \
+    --interval-end-time="${END}" \
+    --format=json 2>/dev/null); then
+    lag_score=1  # NOTE(review): metric fetch failure scores as healthy (best-effort) — confirm intended
+  else
+    over=0
+    while IFS= read -r row; do
+      [ -z "$row" ] && continue
+      val=$(echo "$row" | jq -r '[ .points[]? | .value.doubleValue // .value.int64Value // empty ] | last // empty')  # most recent sample per series
+      [ -z "$val" ] || [ "$val" = "null" ] && continue
+      if awk -v v="$val" -v t="$REPLICATION_LAG_SEC_THRESHOLD" 'BEGIN{exit !(v>t)}'; then
+        over=1  # any series above threshold fails the lag dimension
+        break
+      fi
+    done < <(echo "$sec_series" | jq -c '.[]')
+    lag_score=$((1 - over))
+  fi
+fi
+
+jq -n --argjson js "$job_score" --argjson os "$ops_score" --argjson ls "$lag_score" \
+  '{job_score: $js, ops_score: $os, lag_score: $ls}' >"$OUT"
diff --git a/codebundles/gcp-dms-migration-health/sli.robot b/codebundles/gcp-dms-migration-health/sli.robot
new file mode 100644
index 00000000..24bd3692
--- /dev/null
+++ b/codebundles/gcp-dms-migration-health/sli.robot
@@ -0,0 +1,85 @@
+*** Settings ***
+Documentation       Measures DMS migration health using job state, recent operations, and CDC replication lag. Produces a value between 0 (failing) and 1 (healthy) as the mean of binary sub-scores.
+Metadata            Author    rw-codebundle-agent
+Metadata            Display Name    GCP DMS Migration Health SLI
+Metadata            Supports    GCP DMS Database Migration
+
+Library             BuiltIn
+Library             RW.Core
+Library             RW.CLI
+Library             RW.platform
+Library             OperatingSystem
+
+Suite Setup         Suite Initialization
+
+
+*** Tasks ***
+Score DMS Health for Project `${GCP_PROJECT_ID}` Region `${GCP_DMS_LOCATION}`
+    [Documentation]    Runs a lightweight gcloud and Monitoring check to produce sub-metrics and an aggregate 0-1 health score.
+    [Tags]    GCP    DMS    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=sli-dms-health.sh
+    ...    env=${env}
+    ...    secret_file__gcp_credentials=${gcp_credentials}
+    ...    timeout_seconds=60
+    ...    show_in_rwl_cheatsheet=false
+    ...    cmd_override=./sli-dms-health.sh
+
+    ${scores_raw}=    RW.CLI.Run Cli
+    ...    cmd=cat sli_dms_scores.json
+    ...    env=${env}
+    ...
include_in_history=false
+
+    TRY
+        ${scores}=    Evaluate    json.loads(r'''${scores_raw.stdout}''')    json
+        ${job_score}=    Convert To Number    ${scores['job_score']}
+        ${ops_score}=    Convert To Number    ${scores['ops_score']}
+        ${lag_score}=    Convert To Number    ${scores['lag_score']}
+        RW.Core.Push Metric    ${job_score}    sub_name=job_state
+        RW.Core.Push Metric    ${ops_score}    sub_name=operations
+        RW.Core.Push Metric    ${lag_score}    sub_name=replication_lag
+        ${health_score}=    Evaluate    (${job_score} + ${ops_score} + ${lag_score}) / 3    # unweighted mean of the three binary sub-scores
+        ${health_score}=    Convert to Number    ${health_score}    3    # round to 3 decimal places
+        RW.Core.Add to Report    DMS health score: ${health_score} (job=${job_score}, ops=${ops_score}, lag=${lag_score})
+        RW.Core.Push Metric    ${health_score}    # aggregate SLI in [0, 1]
+    EXCEPT
+        Log    SLI score JSON parse failed; defaulting to zero health.    WARN
+        ${health_score}=    Convert To Number    0    # fail-closed: unparseable scores report unhealthy
+        RW.Core.Add to Report    DMS health score: ${health_score} (parse error)
+        RW.Core.Push Metric    ${health_score}
+    END
+
+
+*** Keywords ***
+Suite Initialization
+    ${gcp_credentials}=    RW.Core.Import Secret    gcp_credentials
+    ...    type=string
+    ...    description=GCP service account JSON with read-only access to DMS and Monitoring.
+    ...    pattern=\w*
+    ${GCP_PROJECT_ID}=    RW.Core.Import User Variable    GCP_PROJECT_ID
+    ...    type=string
+    ...    description=GCP project ID for DMS resources.
+    ...    pattern=\w*
+    ${GCP_DMS_LOCATION}=    RW.Core.Import User Variable    GCP_DMS_LOCATION
+    ...    type=string
+    ...    description=DMS regional location (gcloud --region).
+    ...    pattern=\w*
+    ${REPLICATION_LAG_SEC_THRESHOLD}=    RW.Core.Import User Variable    REPLICATION_LAG_SEC_THRESHOLD
+    ...    type=string
+    ...    description=Maximum acceptable CDC replication lag in seconds for the SLI lag dimension.
+    ...    pattern=^\d+$
+    ...    default=300
+    ${PATH_VAL}=    Get Environment Variable    PATH
+    Set Suite Variable    ${GCP_PROJECT_ID}    ${GCP_PROJECT_ID}
+    Set Suite Variable    ${GCP_DMS_LOCATION}    ${GCP_DMS_LOCATION}
+    Set Suite Variable    ${REPLICATION_LAG_SEC_THRESHOLD}    ${REPLICATION_LAG_SEC_THRESHOLD}
+    Set Suite Variable    ${gcp_credentials}    ${gcp_credentials}
+    ${env}=    Create Dictionary
+    ...    GCP_PROJECT_ID=${GCP_PROJECT_ID}
+    ...    GCP_DMS_LOCATION=${GCP_DMS_LOCATION}
+    ...    REPLICATION_LAG_SEC_THRESHOLD=${REPLICATION_LAG_SEC_THRESHOLD}
+    ...    CLOUDSDK_CORE_PROJECT=${GCP_PROJECT_ID}
+    ...    GOOGLE_APPLICATION_CREDENTIALS=./${gcp_credentials.key}
+    ...    PATH=${PATH_VAL}
+    Set Suite Variable    ${env}    ${env}