diff --git a/codebundles/azure-cosmosdb-utilization-health/.runwhen/generation-rules/azure-cosmosdb-utilization-health.yaml b/codebundles/azure-cosmosdb-utilization-health/.runwhen/generation-rules/azure-cosmosdb-utilization-health.yaml new file mode 100644 index 00000000..6379c260 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.runwhen/generation-rules/azure-cosmosdb-utilization-health.yaml @@ -0,0 +1,22 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + platform: azure + generationRules: + - resourceTypes: + - azure_cosmosdb_database_account + matchRules: + - type: pattern + pattern: ".+" + properties: [name] + mode: substring + slxs: + - baseName: azure-cosmosdb-utilization-health + qualifiers: ["subscription_id", "resource_group"] + baseTemplateName: azure-cosmosdb-utilization-health + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: azure-cosmosdb-utilization-health-taskset.yaml diff --git a/codebundles/azure-cosmosdb-utilization-health/.runwhen/templates/azure-cosmosdb-utilization-health-sli.yaml b/codebundles/azure-cosmosdb-utilization-health/.runwhen/templates/azure-cosmosdb-utilization-health-sli.yaml new file mode 100644 index 00000000..118a5789 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.runwhen/templates/azure-cosmosdb-utilization-health-sli.yaml @@ -0,0 +1,54 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + displayUnitsLong: Health Score + displayUnitsShort: score + locations: + - {{default_location}} + description: Composite 0-1 score from normalized RU headroom, HTTP 429 rate, and server-side latency for Cosmos DB {{ match_resource.name }}. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/azure-cosmosdb-utilization-health/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 300 + configProvided: + - name: AZ_SUBSCRIPTION + value: "{{ subscription_id }}" + - name: AZURE_RESOURCE_GROUP + value: "{{ resource_group.name }}" + - name: COSMOSDB_ACCOUNT_NAME + value: "{{ match_resource.name }}" + - name: NORMALIZED_RU_THRESHOLD_PCT + value: "80" + - name: THROTTLE_EVENTS_THRESHOLD + value: "1" + - name: SERVER_LATENCY_MS_THRESHOLD + value: "100" + - name: SLI_METRICS_OFFSET + value: "2d" + secretsProvided: + {% if wb_version %} + {% include "azure-auth.yaml" ignore missing %} + {% else %} + - name: azure_credentials + workspaceKey: AUTH DETAILS NOT FOUND + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m diff --git a/codebundles/azure-cosmosdb-utilization-health/.runwhen/templates/azure-cosmosdb-utilization-health-slx.yaml b/codebundles/azure-cosmosdb-utilization-health/.runwhen/templates/azure-cosmosdb-utilization-health-slx.yaml new file mode 100644 index 00000000..ac511d4f --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.runwhen/templates/azure-cosmosdb-utilization-health-slx.yaml @@ -0,0 +1,31 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{ slx_name }} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/azure/databases/10137-icon-service-Azure-Cosmos-DB.svg + alias: {{ match_resource.name }} Cosmos DB Utilization Health + asMeasuredBy: Azure Monitor metrics for normalized RU, 429 rate, latency, storage, and throughput sizing. 
+ configProvided: + - name: SLX_PLACEHOLDER + value: SLX_PLACEHOLDER + owners: + - {{ workspace.owner_email }} + statement: Cosmos DB account {{ match_resource.name }} in resource group {{ resource_group.name }} should show healthy utilization without chronic throttling or mis-sized throughput. + additionalContext: + {% include "azure-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + tags: + {% include "azure-tags.yaml" ignore missing %} + - name: cloud + value: azure + - name: service + value: cosmosdb + - name: scope + value: resource-group + - name: access + value: read-only diff --git a/codebundles/azure-cosmosdb-utilization-health/.runwhen/templates/azure-cosmosdb-utilization-health-taskset.yaml b/codebundles/azure-cosmosdb-utilization-health/.runwhen/templates/azure-cosmosdb-utilization-health-taskset.yaml new file mode 100644 index 00000000..a902ef24 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.runwhen/templates/azure-cosmosdb-utilization-health-taskset.yaml @@ -0,0 +1,53 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Analyze Cosmos DB utilization metrics for {{ match_resource.name }} in resource group {{ resource_group.name }} (subscription {{ subscription_name }}). 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/azure-cosmosdb-utilization-health/runbook.robot + configProvided: + - name: AZ_SUBSCRIPTION + value: "{{ subscription_id }}" + - name: AZURE_RESOURCE_GROUP + value: "{{ resource_group.name }}" + - name: COSMOSDB_ACCOUNT_NAME + value: "{{ match_resource.name }}" + - name: METRICS_LOOKBACK_DAYS + value: "14" + - name: NORMALIZED_RU_THRESHOLD_PCT + value: "80" + - name: THROTTLE_EVENTS_THRESHOLD + value: "1" + - name: SERVER_LATENCY_MS_THRESHOLD + value: "100" + - name: STORAGE_GROWTH_PCT_THRESHOLD + value: "25" + - name: UNDERUTILIZED_NORMALIZED_PCT + value: "15" + - name: RU_DAILY_GROWTH_RATIO + value: "1.5" + - name: AZURE_SUBSCRIPTION_NAME + value: "{{ subscription_name }}" + secretsProvided: + {% if wb_version %} + {% include "azure-auth.yaml" ignore missing %} + {% else %} + - name: azure_credentials + workspaceKey: AUTH DETAILS NOT FOUND + {% endif %} diff --git a/codebundles/azure-cosmosdb-utilization-health/.test/README.md b/codebundles/azure-cosmosdb-utilization-health/.test/README.md new file mode 100644 index 00000000..ad37079e --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.test/README.md @@ -0,0 +1,8 @@ +### Testing `azure-cosmosdb-utilization-health` + +This directory holds optional validation and Terraform for a sample Cosmos DB account. + +1. **Quick validation**: From `.test`, run `task` (or `bash validate-all-tests.sh`) to `bash -n` all bundle scripts. +2. **Terraform**: Configure Azure credentials per your environment, then `task build-infra` to create a minimal Cosmos DB account in a resource group for live metric queries. Destroy with `task clean`. + +Do not commit secrets. Use `terraform.tfvars` locally for non-production subscriptions only. 
diff --git a/codebundles/azure-cosmosdb-utilization-health/.test/Taskfile.yaml b/codebundles/azure-cosmosdb-utilization-health/.test/Taskfile.yaml new file mode 100644 index 00000000..0a3dee7b --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.test/Taskfile.yaml @@ -0,0 +1,35 @@ +version: "3" + +tasks: + default: + desc: "Validate bundle shell scripts and (optionally) Terraform format" + cmds: + - bash validate-all-tests.sh + - task: terraform-fmt-check + + terraform-fmt-check: + desc: "terraform fmt check (requires terraform in PATH)" + dir: terraform + cmds: + - | + if command -v terraform >/dev/null 2>&1; then + terraform fmt -check -recursive || terraform fmt -recursive + else + echo "terraform not installed; skipping fmt" + fi + + build-infra: + desc: "Provision test Cosmos DB account (optional; consumes Azure quota)" + dir: terraform + cmds: + - terraform init + - terraform apply -auto-approve + + clean: + desc: "Destroy test infrastructure" + dir: terraform + cmds: + - | + if [ -f terraform.tfstate ]; then + terraform destroy -auto-approve + fi diff --git a/codebundles/azure-cosmosdb-utilization-health/.test/terraform/backend.tf b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/backend.tf new file mode 100644 index 00000000..3c533e6b --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/backend.tf @@ -0,0 +1,5 @@ +terraform { + backend "local" { + path = "terraform.tfstate" + } +} diff --git a/codebundles/azure-cosmosdb-utilization-health/.test/terraform/main.tf b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/main.tf new file mode 100644 index 00000000..7576b7e9 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/main.tf @@ -0,0 +1,29 @@ +resource "azurerm_resource_group" "test_rg" { + name = var.resource_group + location = var.location +} + +resource "random_string" "suffix" { + length = 8 + upper = false + special = false +} + +resource 
"azurerm_cosmosdb_account" "test" { + name = "rwcosmos${random_string.suffix.result}" + location = azurerm_resource_group.test_rg.location + resource_group_name = azurerm_resource_group.test_rg.name + offer_type = "Standard" + kind = "GlobalDocumentDB" + + consistency_policy { + consistency_level = "Session" + } + + geo_location { + location = azurerm_resource_group.test_rg.location + failover_priority = 0 + } + + tags = var.tags +} diff --git a/codebundles/azure-cosmosdb-utilization-health/.test/terraform/outputs.tf b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/outputs.tf new file mode 100644 index 00000000..7b9d5ad8 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/outputs.tf @@ -0,0 +1,11 @@ +output "cosmosdb_account_name" { + value = azurerm_cosmosdb_account.test.name +} + +output "resource_group_name" { + value = azurerm_resource_group.test_rg.name +} + +output "subscription_hint" { + value = "Use the same subscription ID as Terraform provider context for RunWhen config." 
+} diff --git a/codebundles/azure-cosmosdb-utilization-health/.test/terraform/providers.tf b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/providers.tf new file mode 100644 index 00000000..f2574812 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/providers.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + version = "4.18.0" + } + random = { + source = "hashicorp/random" + version = "~> 3.0" + } + } +} + +provider "azurerm" { + features {} +} diff --git a/codebundles/azure-cosmosdb-utilization-health/.test/terraform/terraform.tfvars b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/terraform.tfvars new file mode 100644 index 00000000..ba1aef54 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/terraform.tfvars @@ -0,0 +1,7 @@ +resource_group = "azure-cosmosdb-utilization-health-test" +location = "East US" +tags = { + env = "test" + lifecycle = "deleteme" + product = "runwhen" +} diff --git a/codebundles/azure-cosmosdb-utilization-health/.test/terraform/variables.tf b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/variables.tf new file mode 100644 index 00000000..0a2181a5 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.test/terraform/variables.tf @@ -0,0 +1,12 @@ +variable "resource_group" { + type = string +} + +variable "location" { + type = string + default = "East US" +} + +variable "tags" { + type = map(string) +} diff --git a/codebundles/azure-cosmosdb-utilization-health/.test/validate-all-tests.sh b/codebundles/azure-cosmosdb-utilization-health/.test/validate-all-tests.sh new file mode 100755 index 00000000..4aeba235 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/.test/validate-all-tests.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail +# Validates shell syntax for bundle scripts (optional shellcheck when installed). +ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" +echo "Checking scripts under ${ROOT}" +for f in "${ROOT}"/*.sh; do + [[ -f "$f" ]] || continue + bash -n "$f" +done +if command -v shellcheck >/dev/null 2>&1; then + shellcheck "${ROOT}"/*.sh || true +fi +echo "OK: bash syntax check passed for bundle scripts." diff --git a/codebundles/azure-cosmosdb-utilization-health/README.md b/codebundles/azure-cosmosdb-utilization-health/README.md new file mode 100644 index 00000000..89059f16 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/README.md @@ -0,0 +1,68 @@ +# Azure Cosmos DB Utilization and Sizing Health + +This CodeBundle evaluates historical and point-in-time utilization for Azure Cosmos DB using Azure Monitor: normalized RU consumption, total RU, HTTP 429 throttling, server-side latency, storage growth, and throughput sizing signals. It complements configuration-focused bundles (for example `azure-cosmosdb-config-health`) with capacity and cost-oriented metrics. + +## Overview + +- **Normalized RU trends**: Detects sustained high utilization and rising pressure versus the first half of the lookback window. +- **Total RU consumed**: Flags sharp growth in daily `TotalRequestUnits` between halves of the window. +- **Throttling / 429**: Sums `TotalRequests` filtered to status `429` to catch undersizing or hot partitions. +- **Server-side latency**: Compares `ServerSideLatency` hourly averages to a configurable millisecond threshold. +- **Storage**: Tracks `DataUsage` and `IndexUsage` for rapid expansion. +- **Throughput sizing**: Highlights ceiling risk from high normalized RU and possible over-provisioning when normalized RU stays low while `ProvisionedThroughput` remains high. +- **SLI**: A lightweight `sli.robot` averages binary checks (normalized RU, 429 count, latency) into a 0–1 score. 
+ +Metric names follow Microsoft’s supported metrics for `Microsoft.DocumentDB/databaseAccounts` (for example `NormalizedRUConsumption`, `TotalRequestUnits`, `TotalRequests` with `StatusCode`, `ServerSideLatency`, `DataUsage`, `IndexUsage`, `ProvisionedThroughput`). + +## Configuration + +### Required variables + +- `AZ_SUBSCRIPTION`: Azure subscription ID (UUID) used for `az account set` and metric queries. +- `AZURE_RESOURCE_GROUP`: Resource group containing the Cosmos DB account(s). + +### Optional variables + +- `COSMOSDB_ACCOUNT_NAME`: Cosmos DB account name, or `All` to scan every account in the group (default: `All`). +- `METRICS_LOOKBACK_DAYS`: Days of history for runbook tasks (default: `14`). +- `NORMALIZED_RU_THRESHOLD_PCT`: Normalized RU percentage that triggers utilization and sizing issues (default: `80`). +- `THROTTLE_EVENTS_THRESHOLD`: Minimum total HTTP 429 count in the window to raise throttling issues (default: `1`). +- `SERVER_LATENCY_MS_THRESHOLD`: Maximum acceptable hourly average `ServerSideLatency` in ms (default: `100`). +- `STORAGE_GROWTH_PCT_THRESHOLD`: Percent growth from start to end of the window on `DataUsage` / `IndexUsage` that flags storage expansion (default: `25`). +- `UNDERUTILIZED_NORMALIZED_PCT`: Normalized RU level used with `ProvisionedThroughput` to suggest over-provisioning (default: `15`). +- `RU_DAILY_GROWTH_RATIO`: Ratio of later-window to earlier-window average daily total RU for spike detection (default: `1.5`). +- `AZURE_SUBSCRIPTION_NAME`: Friendly subscription label for context in reports (default: `Azure Subscription`). + +### SLI-only variables + +- `SLI_METRICS_OFFSET`: Short lookback for the SLI snapshot (default: `2d`), for example `2d` or `24h`. + +### Secrets + +- `azure_credentials`: JSON or structured secret consumed by the RunWhen Azure integration (typically `CLIENT_ID`, `TENANT_ID`, `CLIENT_SECRET`, `SUBSCRIPTION_ID` / `AZURE_SUBSCRIPTION_ID`). 
If absent, ambient `az login` / workload identity is assumed. + +## Tasks overview + +### Analyze Cosmos DB Normalized RU Consumption Trends + +Uses hourly `NormalizedRUConsumption` averages to flag accounts where more than half of the samples exceed the threshold, or where the second-half average is more than 15% above the first-half average while remaining above 60% of the threshold. + +### Analyze Cosmos DB Total Request Units Consumed + +Uses daily `TotalRequestUnits` totals to detect a sharp increase between the first and second half of the lookback period. + +### Check Cosmos DB Throttling and HTTP 429 Rate + +Queries `TotalRequests` with dimension filter `StatusCode eq '429'` and compares the aggregate to `THROTTLE_EVENTS_THRESHOLD`. + +### Analyze Cosmos DB Server-side Latency + +Evaluates `ServerSideLatency` hourly averages against `SERVER_LATENCY_MS_THRESHOLD`. + +### Analyze Cosmos DB Data and Index Storage Utilization + +Measures relative growth on `DataUsage` and `IndexUsage` against `STORAGE_GROWTH_PCT_THRESHOLD`. + +### Analyze Cosmos DB Provisioned Throughput vs Consumed Load + +Combines `NormalizedRUConsumption` with `ProvisionedThroughput` for ceiling and over-provisioning hints. diff --git a/codebundles/azure-cosmosdb-utilization-health/cosmosdb-normalized-ru-trends.sh b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-normalized-ru-trends.sh new file mode 100755 index 00000000..c6d18911 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-normalized-ru-trends.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# Normalized RU time-series: sustained high utilization and upward trend vs first half of window. 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=cosmosdb_metrics_lib.sh +source "${SCRIPT_DIR}/cosmosdb_metrics_lib.sh" + +: "${AZURE_RESOURCE_GROUP:?Must set AZURE_RESOURCE_GROUP}" + +OUTPUT_JSON="cosmosdb_normalized_ru_issues.json" +issues_json='[]' +METRICS_LOOKBACK_DAYS="${METRICS_LOOKBACK_DAYS:-14}" +METRICS_OFFSET="${METRICS_OFFSET:-${METRICS_LOOKBACK_DAYS}d}" +THRESH="${NORMALIZED_RU_THRESHOLD_PCT:-80}" +COSMOS_FILTER="${COSMOSDB_ACCOUNT_NAME:-All}" + +sub="$(cosmosdb_resolve_subscription)" +if [[ -z "$sub" ]]; then + echo "[]" > "$OUTPUT_JSON" + echo "No subscription context; wrote empty issues." + exit 0 +fi + +az account set --subscription "$sub" + +while IFS= read -r acct; do + [[ -z "$acct" ]] && continue + rid="$(cosmosdb_resource_id "$sub" "$AZURE_RESOURCE_GROUP" "$acct")" + [[ -z "$rid" ]] && continue + + raw="$(az monitor metrics list --resource "$rid" --metric NormalizedRUConsumption \ + --offset "$METRICS_OFFSET" --interval PT1H --aggregation Average Maximum \ + --output json 2>/dev/null || true)" + [[ -z "$raw" || "$raw" == "{}" ]] && continue + + avgs="$(echo "$raw" | jq '[.value[0].timeseries[]?.data[]? | select(.average != null) | .average]')" + len="$(echo "$avgs" | jq 'length')" + if [[ "$len" -lt 2 ]]; then + continue + fi + + high_cnt="$(echo "$avgs" | jq --argjson t "$THRESH" '[.[] | select(. 
> $t)] | length')" + high_frac="$(echo "$avgs" | jq --argjson hc "$high_cnt" --argjson l "$len" '($hc / $l)')" + + half=$((len / 2)) + first_avg="$(echo "$avgs" | jq --argjson h "$half" '.[0:$h] | add / length')" + second_avg="$(echo "$avgs" | jq --argjson h "$half" '.[$h:] | add / length')" + trend_up="$(echo "$second_avg $first_avg" | awk '{if ($2 > 0 && ($1 / $2) > 1.15) print 1; else print 0}')" + + if awk -v f="$high_frac" 'BEGIN{exit !(f+0 > 0.5)}'; then + issues_json="$(jq --arg t "Sustained high Normalized RU for Cosmos DB \`$acct\`" \ + --arg d "More than half of hourly samples exceed ${THRESH}% normalized RU (fraction high: ${high_frac})." \ + --arg n "Review hot partitions, partition keys, and provisioned throughput or autoscale max for account \`$acct\`. Consider Azure Advisor and Metrics explorer with DatabaseName/CollectionName dimensions." \ + '. += [{title:$t,details:$d,severity:3,next_steps:$n}]' <<<"$issues_json")" + fi + + if [[ "$trend_up" == "1" ]] && awk -v s="$second_avg" -v t="$THRESH" 'BEGIN{exit !(s+0 > (t * 0.6))}'; then + issues_json="$(jq --arg t "Rising Normalized RU pressure for Cosmos DB \`$acct\`" \ + --arg d "Second-half average normalized RU (~${second_avg}%) is materially higher than first-half (~${first_avg}%) while remaining elevated." \ + --arg n "Investigate workload growth, indexing changes, and cross-partition queries for \`$acct\`. Plan throughput increases before throttling spreads." \ + '. 
+= [{title:$t,details:$d,severity:3,next_steps:$n}]' <<<"$issues_json")" + fi + +done < <(cosmosdb_account_names "$sub" "$AZURE_RESOURCE_GROUP" "$COSMOS_FILTER") + +echo "$issues_json" > "$OUTPUT_JSON" +echo "Wrote $OUTPUT_JSON" diff --git a/codebundles/azure-cosmosdb-utilization-health/cosmosdb-server-latency.sh b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-server-latency.sh new file mode 100755 index 00000000..93da46da --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-server-latency.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# Server-side latency regression vs threshold (Average aggregation). +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=cosmosdb_metrics_lib.sh +source "${SCRIPT_DIR}/cosmosdb_metrics_lib.sh" + +: "${AZURE_RESOURCE_GROUP:?Must set AZURE_RESOURCE_GROUP}" + +OUTPUT_JSON="cosmosdb_latency_issues.json" +issues_json='[]' +METRICS_LOOKBACK_DAYS="${METRICS_LOOKBACK_DAYS:-14}" +METRICS_OFFSET="${METRICS_OFFSET:-${METRICS_LOOKBACK_DAYS}d}" +LAT_MS="${SERVER_LATENCY_MS_THRESHOLD:-100}" +COSMOS_FILTER="${COSMOSDB_ACCOUNT_NAME:-All}" + +sub="$(cosmosdb_resolve_subscription)" +if [[ -z "$sub" ]]; then + echo "[]" > "$OUTPUT_JSON" + exit 0 +fi + +az account set --subscription "$sub" + +while IFS= read -r acct; do + [[ -z "$acct" ]] && continue + rid="$(cosmosdb_resource_id "$sub" "$AZURE_RESOURCE_GROUP" "$acct")" + [[ -z "$rid" ]] && continue + + raw="$(az monitor metrics list --resource "$rid" --metric ServerSideLatency \ + --offset "$METRICS_OFFSET" --interval PT1H --aggregation Average Maximum \ + --output json 2>/dev/null || true)" + [[ -z "$raw" || "$raw" == "{}" ]] && continue + + max_avg="$(echo "$raw" | jq -r '[.value[0].timeseries[]?.data[]? 
| select(.average != null) | .average] | max // empty')" + [[ -z "$max_avg" || "$max_avg" == "null" ]] && continue + + if awk -v m="$max_avg" -v t="$LAT_MS" 'BEGIN{exit !(m > t)}'; then + issues_json="$(jq --arg t "Elevated server-side latency for Cosmos DB \`$acct\`" \ + --arg d "Peak hourly average ServerSideLatency ~${max_avg} ms exceeds threshold ${LAT_MS} ms in the analysis window." \ + --arg n "Correlate with normalized RU, throttling, and query patterns. Tune indexing, partition spread, and SDK consistency level for \`$acct\`." \ + '. += [{title:$t,details:$d,severity:3,next_steps:$n}]' <<<"$issues_json")" + fi + +done < <(cosmosdb_account_names "$sub" "$AZURE_RESOURCE_GROUP" "$COSMOS_FILTER") + +echo "$issues_json" > "$OUTPUT_JSON" +echo "Wrote $OUTPUT_JSON" diff --git a/codebundles/azure-cosmosdb-utilization-health/cosmosdb-storage-utilization.sh b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-storage-utilization.sh new file mode 100755 index 00000000..1cfe8e49 --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-storage-utilization.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# DataUsage + IndexUsage growth across the lookback window. 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=cosmosdb_metrics_lib.sh +source "${SCRIPT_DIR}/cosmosdb_metrics_lib.sh" + +: "${AZURE_RESOURCE_GROUP:?Must set AZURE_RESOURCE_GROUP}" + +OUTPUT_JSON="cosmosdb_storage_issues.json" +issues_json='[]' +METRICS_LOOKBACK_DAYS="${METRICS_LOOKBACK_DAYS:-14}" +METRICS_OFFSET="${METRICS_OFFSET:-${METRICS_LOOKBACK_DAYS}d}" +GROWTH_PCT="${STORAGE_GROWTH_PCT_THRESHOLD:-25}" +COSMOS_FILTER="${COSMOSDB_ACCOUNT_NAME:-All}" + +sub="$(cosmosdb_resolve_subscription)" +if [[ -z "$sub" ]]; then + echo "[]" > "$OUTPUT_JSON" + exit 0 +fi + +az account set --subscription "$sub" + +storage_growth_issue() { + local acct="$1" metric="$2" json="$3" + local series first last pct + series="$(echo "$json" | jq '[.value[0].timeseries[0].data[]? | (.average // .maximum // 0)] | map(select(. > 0))')" + local n + n="$(echo "$series" | jq 'length')" + [[ "$n" -lt 2 ]] && return 0 + first="$(echo "$series" | jq '.[0]')" + last="$(echo "$series" | jq '.[-1]')" + [[ "$first" == "0" ]] && return 0 + pct="$(echo "$series" | awk -v f="$first" -v l="$last" 'BEGIN{printf "%.2f", (l-f)/f*100}')" + if awk -v p="$pct" -v g="$GROWTH_PCT" 'BEGIN{exit !(p > g)}'; then + issues_json="$(jq --arg t "Rapid ${metric} growth for Cosmos DB \`$acct\`" \ + --arg d "${metric} grew ~${pct}% from the start to the end of the ${METRICS_LOOKBACK_DAYS}d window (threshold ${GROWTH_PCT}%)." \ + --arg n "Plan partition count, indexing cost, and storage billing impacts for \`$acct\`. Review TTL, archival, and analytical store if applicable." \ + '. 
+= [{title:$t,details:$d,severity:3,next_steps:$n}]' <<<"$issues_json")" + fi +} + +while IFS= read -r acct; do + [[ -z "$acct" ]] && continue + rid="$(cosmosdb_resource_id "$sub" "$AZURE_RESOURCE_GROUP" "$acct")" + [[ -z "$rid" ]] && continue + + for metric in DataUsage IndexUsage; do + raw="$(az monitor metrics list --resource "$rid" --metric "$metric" \ + --offset "$METRICS_OFFSET" --interval P1D --aggregation Average Maximum \ + --output json 2>/dev/null || true)" + [[ -z "$raw" || "$raw" == "{}" ]] && continue + storage_growth_issue "$acct" "$metric" "$raw" + done + +done < <(cosmosdb_account_names "$sub" "$AZURE_RESOURCE_GROUP" "$COSMOS_FILTER") + +echo "$issues_json" > "$OUTPUT_JSON" +echo "Wrote $OUTPUT_JSON" diff --git a/codebundles/azure-cosmosdb-utilization-health/cosmosdb-throttling-429.sh b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-throttling-429.sh new file mode 100755 index 00000000..453cc37a --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-throttling-429.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# HTTP 429 / throttled requests via TotalRequests with StatusCode dimension. 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=cosmosdb_metrics_lib.sh
+source "${SCRIPT_DIR}/cosmosdb_metrics_lib.sh"
+
+: "${AZURE_RESOURCE_GROUP:?Must set AZURE_RESOURCE_GROUP}"
+
+OUTPUT_JSON="cosmosdb_throttle_issues.json"
+issues_json='[]'
+METRICS_LOOKBACK_DAYS="${METRICS_LOOKBACK_DAYS:-14}"
+METRICS_OFFSET="${METRICS_OFFSET:-${METRICS_LOOKBACK_DAYS}d}"
+THROTTLE_MIN="${THROTTLE_EVENTS_THRESHOLD:-1}"
+COSMOS_FILTER="${COSMOSDB_ACCOUNT_NAME:-All}"
+
+sub="$(cosmosdb_resolve_subscription)"
+if [[ -z "$sub" ]]; then
+  echo "[]" > "$OUTPUT_JSON"
+  exit 0
+fi
+
+az account set --subscription "$sub"
+
+while IFS= read -r acct; do
+  [[ -z "$acct" ]] && continue
+  rid="$(cosmosdb_resource_id "$sub" "$AZURE_RESOURCE_GROUP" "$acct")"
+  [[ -z "$rid" ]] && continue
+  # Use --filter alone: the az CLI rejects --dimension together with --filter, and since errors are discarded below that would silently skip the account.
+  raw="$(az monitor metrics list --resource "$rid" --metric TotalRequests \
+    --filter "StatusCode eq '429'" \
+    --offset "$METRICS_OFFSET" --interval PT1H --aggregation Total \
+    --output json 2>/dev/null || true)"
+  [[ -z "$raw" || "$raw" == "{}" ]] && continue
+  # Sum the hourly totals across every returned time series; missing totals count as 0.
+  total_429="$(echo "$raw" | jq '[.value[0].timeseries[]?.data[]? | (.total // 0)] | add // 0')"
+  # Raise one issue per account once the 429 count reaches THROTTLE_EVENTS_THRESHOLD.
+  if awk -v t="$total_429" -v m="$THROTTLE_MIN" 'BEGIN{exit !(t >= m)}'; then
+    issues_json="$(jq --arg t "HTTP 429 throttling observed for Cosmos DB \`$acct\`" \
+      --arg d "TotalRequests with status 429 in the lookback window: ${total_429} (threshold: ${THROTTLE_MIN})." \
+      --arg n "Increase provisioned RU/s or autoscale max, reduce RU-heavy queries, fix hot partitions, or enable retry policies with backoff for \`$acct\`." \
+      '. 
+= [{title:$t,details:$d,severity:4,next_steps:$n}]' <<<"$issues_json")" + fi + +done < <(cosmosdb_account_names "$sub" "$AZURE_RESOURCE_GROUP" "$COSMOS_FILTER") + +echo "$issues_json" > "$OUTPUT_JSON" +echo "Wrote $OUTPUT_JSON" diff --git a/codebundles/azure-cosmosdb-utilization-health/cosmosdb-throughput-sizing.sh b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-throughput-sizing.sh new file mode 100755 index 00000000..20ca7adc --- /dev/null +++ b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-throughput-sizing.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# Provisioned / autoscale headroom vs normalized utilization (oversizing and undersizing hints). +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=cosmosdb_metrics_lib.sh +source "${SCRIPT_DIR}/cosmosdb_metrics_lib.sh" + +: "${AZURE_RESOURCE_GROUP:?Must set AZURE_RESOURCE_GROUP}" + +OUTPUT_JSON="cosmosdb_throughput_sizing_issues.json" +issues_json='[]' +METRICS_LOOKBACK_DAYS="${METRICS_LOOKBACK_DAYS:-14}" +METRICS_OFFSET="${METRICS_OFFSET:-${METRICS_LOOKBACK_DAYS}d}" +HIGH_NRU="${NORMALIZED_RU_THRESHOLD_PCT:-80}" +LOW_NRU="${UNDERUTILIZED_NORMALIZED_PCT:-15}" +COSMOS_FILTER="${COSMOSDB_ACCOUNT_NAME:-All}" + +sub="$(cosmosdb_resolve_subscription)" +if [[ -z "$sub" ]]; then + echo "[]" > "$OUTPUT_JSON" + exit 0 +fi + +az account set --subscription "$sub" + +while IFS= read -r acct; do + [[ -z "$acct" ]] && continue + rid="$(cosmosdb_resource_id "$sub" "$AZURE_RESOURCE_GROUP" "$acct")" + [[ -z "$rid" ]] && continue + + nru_raw="$(az monitor metrics list --resource "$rid" --metric NormalizedRUConsumption \ + --offset "$METRICS_OFFSET" --interval PT1H --aggregation Average \ + --output json 2>/dev/null || true)" + [[ -z "$nru_raw" || "$nru_raw" == "{}" ]] && continue + + avgs="$(echo "$nru_raw" | jq '[.value[0].timeseries[]?.data[]? 
| select(.average != null) | .average]')" + len="$(echo "$avgs" | jq 'length')" + [[ "$len" -lt 2 ]] && continue + + max_nru="$(echo "$avgs" | jq 'max')" + low_cnt="$(echo "$avgs" | jq --argjson l "$LOW_NRU" '[.[] | select(. < $l)] | length')" + low_frac="$(echo "$avgs" | jq --argjson c "$low_cnt" --argjson le "$len" '($c / $le)')" + + if awk -v m="$max_nru" -v h="$HIGH_NRU" 'BEGIN{exit !(m > h)}'; then + issues_json="$(jq --arg t "Cosmos DB \`$acct\` normalized RU near provisioned ceiling" \ + --arg d "Peak hourly average normalized RU ~${max_nru}% exceeds ${HIGH_NRU}% threshold." \ + --arg n "Increase throughput, raise autoscale maximum, or reduce per-request RU cost for \`$acct\` before customer-visible throttling." \ + '. += [{title:$t,details:$d,severity:3,next_steps:$n}]' <<<"$issues_json")" + fi + + prov_raw="$(az monitor metrics list --resource "$rid" --metric ProvisionedThroughput \ + --offset "$METRICS_OFFSET" --interval P1D --aggregation Average \ + --output json 2>/dev/null || true)" + prov_avg="$(echo "$prov_raw" | jq -r '[.value[0].timeseries[]?.data[]? | select(.average != null) | .average] | max // empty')" + + if awk -v f="$low_frac" 'BEGIN{exit !(f+0 > 0.8)}' && \ + awk -v m="$max_nru" -v l="$LOW_NRU" 'BEGIN{exit !(m+0 < l+5)}' && \ + [[ -n "$prov_avg" && "$prov_avg" != "null" ]] && \ + awk -v p="$prov_avg" 'BEGIN{exit !(p > 400)}'; then + issues_json="$(jq --arg t "Cosmos DB \`$acct\` may be over-provisioned" \ + --arg d "Normalized RU stayed below ~${LOW_NRU}% for most samples (low fraction ${low_frac}) while ProvisionedThroughput remains elevated (recent sample ~${prov_avg} RU/s)." \ + --arg n "Consider lowering manual throughput, tightening autoscale rules, or consolidating databases to reduce cost for \`$acct\` after validating workload baselines." \ + '. 
+= [{title:$t,details:$d,severity:3,next_steps:$n}]' <<<"$issues_json")"
+  fi
+
+done < <(cosmosdb_account_names "$sub" "$AZURE_RESOURCE_GROUP" "$COSMOS_FILTER")
+
+echo "$issues_json" > "$OUTPUT_JSON"
+echo "Wrote $OUTPUT_JSON"
diff --git a/codebundles/azure-cosmosdb-utilization-health/cosmosdb-total-ru-consumed.sh b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-total-ru-consumed.sh
new file mode 100755
index 00000000..d6b93ff4
--- /dev/null
+++ b/codebundles/azure-cosmosdb-utilization-health/cosmosdb-total-ru-consumed.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# Total Request Units consumed: detect sharp growth in daily totals (chargeback / workload spike signal).
+#
+# For every account in scope, pulls daily TotalRequestUnits totals from Azure Monitor, splits the
+# populated samples into an earlier and a later half, and records an issue when the later-half
+# average exceeds RU_DAILY_GROWTH_RATIO times the earlier-half average.
+#
+# Environment:
+#   AZURE_RESOURCE_GROUP   required; resource group used for the account lookups.
+#   COSMOSDB_ACCOUNT_NAME  account to check, or "All" (default) for every account the lookup returns.
+#   METRICS_LOOKBACK_DAYS  lookback in days (default 14); METRICS_OFFSET overrides the derived "<days>d".
+#   RU_DAILY_GROWTH_RATIO  later/earlier ratio that raises an issue (default 1.5).
+# Output: cosmosdb_total_ru_issues.json, a JSON array of {title, details, severity, next_steps}.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=cosmosdb_metrics_lib.sh
+source "${SCRIPT_DIR}/cosmosdb_metrics_lib.sh"
+
+: "${AZURE_RESOURCE_GROUP:?Must set AZURE_RESOURCE_GROUP}"
+
+OUTPUT_JSON="cosmosdb_total_ru_issues.json"
+issues_json='[]'
+METRICS_LOOKBACK_DAYS="${METRICS_LOOKBACK_DAYS:-14}"
+METRICS_OFFSET="${METRICS_OFFSET:-${METRICS_LOOKBACK_DAYS}d}"
+GROWTH_RATIO="${RU_DAILY_GROWTH_RATIO:-1.5}"
+COSMOS_FILTER="${COSMOSDB_ACCOUNT_NAME:-All}"
+
+# Reset the report first so an early abort (set -e) cannot leave a previous run's issues on disk.
+echo "$issues_json" > "$OUTPUT_JSON"
+
+sub="$(cosmosdb_resolve_subscription)"
+if [[ -z "$sub" ]]; then
+  # No subscription could be resolved: keep the empty report and stop quietly.
+  exit 0
+fi
+
+az account set --subscription "$sub"
+
+while IFS= read -r acct; do
+  [[ -z "$acct" ]] && continue
+  rid="$(cosmosdb_resource_id "$sub" "$AZURE_RESOURCE_GROUP" "$acct")"
+  [[ -z "$rid" ]] && continue
+
+  raw="$(az monitor metrics list --resource "$rid" --metric TotalRequestUnits \
+    --offset "$METRICS_OFFSET" --interval P1D --aggregation Total \
+    --output json 2>/dev/null || true)"
+  [[ -z "$raw" || "$raw" == "{}" ]] && continue
+
+  # Keep only buckets that carry a total: counting missing days as 0 drags a half-window average
+  # down and can fake (or hide) a spike. Output jq cannot parse is treated as "no data" rather
+  # than aborting the remaining accounts.
+  daily="$(jq '[.value[0].timeseries[0].data[]? | select(.total != null) | .total]' <<<"$raw" 2>/dev/null || echo '[]')"
+  n="$(jq 'length' <<<"$daily")"
+  if [[ "$n" -lt 4 ]]; then
+    continue
+  fi
+
+  # Earlier window = first floor(n/2) samples; later window = the rest.
+  mid=$((n / 2))
+  first_half_avg="$(echo "$daily" | jq --argjson m "$mid" '.[0:$m] | add / length')"
+  second_half_avg="$(echo "$daily" | jq --argjson m "$mid" '.[$m:] | add / length')"
+
+  if awk -v a="$second_half_avg" -v b="$first_half_avg" -v r="$GROWTH_RATIO" 'BEGIN{exit !(b > 0 && a > b * r)}'; then
+    issues_json="$(jq --arg t "Sharp increase in Total RU consumed for Cosmos DB \`$acct\`" \
+      --arg d "Daily TotalRequestUnits average in the later window (~${second_half_avg}) exceeds ${GROWTH_RATIO}x the earlier window (~${first_half_avg}) over ${METRICS_OFFSET}." \
+      --arg n "Validate whether traffic, batch jobs, or indexing drove RU growth for \`$acct\`. Update capacity, autoscale max, or partition strategy before sustained throttling." \
+      '. += [{title:$t,details:$d,severity:3,next_steps:$n}]' <<<"$issues_json")"
+  fi
+
+done < <(cosmosdb_account_names "$sub" "$AZURE_RESOURCE_GROUP" "$COSMOS_FILTER")
+
+echo "$issues_json" > "$OUTPUT_JSON"
+echo "Wrote $OUTPUT_JSON"
diff --git a/codebundles/azure-cosmosdb-utilization-health/cosmosdb_metrics_lib.sh b/codebundles/azure-cosmosdb-utilization-health/cosmosdb_metrics_lib.sh
new file mode 100755
index 00000000..601071c4
--- /dev/null
+++ b/codebundles/azure-cosmosdb-utilization-health/cosmosdb_metrics_lib.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Shared helpers for Cosmos DB utilization scripts (source from task scripts).
+
+# Print the subscription ID to use: AZ_SUBSCRIPTION if non-empty, else AZURE_SUBSCRIPTION_ID,
+# else whatever `az account show` reports (nothing is printed when that lookup fails).
+cosmosdb_resolve_subscription() {
+  local chosen="${AZ_SUBSCRIPTION:-${AZURE_SUBSCRIPTION_ID:-}}"
+  [[ -n "$chosen" ]] && { printf '%s' "$chosen"; return 0; }
+  az account show --query id -o tsv 2>/dev/null || true
+}
+
+# Print one account name per line. $1=subscription, $2=resource group, $3=filter: an empty or
+# case-insensitive "all" filter lists every account in the group; anything else is echoed back.
+cosmosdb_account_names() {
+  local subscription="$1" group="$2" wanted="$3"
+  case "${wanted,,}" in
+    "" | all) az cosmosdb list -g "$group" --subscription "$subscription" --query '[].name' -o tsv 2>/dev/null || true ;;
+    *) printf '%s\n' "$wanted" ;;
+  esac
+}
+
+# Print the `id` that `az cosmosdb show` reports for account $3 in group $2 (nothing on failure).
+cosmosdb_resource_id() {
+  local subscription="$1" group="$2" account="$3"
+  az cosmosdb show -n "$account" -g "$group" --subscription "$subscription" --query id -o tsv 2>/dev/null || true
+}
diff --git a/codebundles/azure-cosmosdb-utilization-health/cosmosdb_sli_snapshot.sh b/codebundles/azure-cosmosdb-utilization-health/cosmosdb_sli_snapshot.sh
new file mode 100755
index 00000000..077d95aa
--- /dev/null
+++ b/codebundles/azure-cosmosdb-utilization-health/cosmosdb_sli_snapshot.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# Lightweight metric snapshot for SLI (short window).
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=cosmosdb_metrics_lib.sh
+source "${SCRIPT_DIR}/cosmosdb_metrics_lib.sh"
+
+: "${AZURE_RESOURCE_GROUP:?Must set AZURE_RESOURCE_GROUP}"
+
+OUT="cosmosdb_sli_output.json"
+SLI_OFFSET="${SLI_METRICS_OFFSET:-2d}"
+COSMOS_FILTER="${COSMOSDB_ACCOUNT_NAME:-All}"
+THRESH="${NORMALIZED_RU_THRESHOLD_PCT:-80}"
+THROTTLE_MIN="${THROTTLE_EVENTS_THRESHOLD:-1}"
+LAT_MS="${SERVER_LATENCY_MS_THRESHOLD:-100}"
+
+# Drop any previous snapshot so an aborted run (set -e) cannot be mistaken for fresh data.
+rm -f "$OUT"
+sub="$(cosmosdb_resolve_subscription)"
+# Each flag starts at 1 (passing) and is cleared to 0 as soon as any account in scope breaches it.
+normalized_ok=1
+throttle_ok=1
+latency_ok=1
+
+if [[ -n "$sub" ]]; then
+  az account set --subscription "$sub"
+  while IFS= read -r acct; do
+    [[ -z "$acct" ]] && continue
+    rid="$(cosmosdb_resource_id "$sub" "$AZURE_RESOURCE_GROUP" "$acct")"
+    [[ -z "$rid" ]] && continue
+    # Normalized RU: fail when the highest hourly average exceeds the threshold.
+    nru_raw="$(az monitor metrics list --resource "$rid" --metric NormalizedRUConsumption \
+      --offset "$SLI_OFFSET" --interval PT1H --aggregation Average \
+      --output json 2>/dev/null || true)"
+    max_nru="$(echo "$nru_raw" | jq -r '[.value[0].timeseries[]?.data[]? | select(.average != null) | .average] | max // empty')"
+    if [[ -n "$max_nru" && "$max_nru" != "null" ]] && awk -v m="$max_nru" -v t="$THRESH" 'BEGIN{exit !(m > t)}'; then
+      normalized_ok=0
+    fi
+    # Throttling: --filter alone narrows the series to 429s. The az CLI rejects --dimension
+    # together with --filter, and with errors silenced here that made this check a no-op.
+    raw429="$(az monitor metrics list --resource "$rid" --metric TotalRequests \
+      --filter "StatusCode eq '429'" \
+      --offset "$SLI_OFFSET" --interval PT1H --aggregation Total \
+      --output json 2>/dev/null || true)"
+    tot="$(echo "$raw429" | jq '[.value[0].timeseries[]?.data[]? | (.total // 0)] | add // 0')"
+    if awk -v t="$tot" -v m="$THROTTLE_MIN" 'BEGIN{exit !(t >= m)}'; then
+      throttle_ok=0
+    fi
+    # Latency: fail when the highest hourly average ServerSideLatency exceeds the threshold.
+    lat_raw="$(az monitor metrics list --resource "$rid" --metric ServerSideLatency \
+      --offset "$SLI_OFFSET" --interval PT1H --aggregation Average \
+      --output json 2>/dev/null || true)"
+    max_lat="$(echo "$lat_raw" | jq -r '[.value[0].timeseries[]?.data[]? | select(.average != null) | .average] | max // empty')"
+    if [[ -n "$max_lat" && "$max_lat" != "null" ]] && awk -v m="$max_lat" -v t="$LAT_MS" 'BEGIN{exit !(m > t)}'; then
+      latency_ok=0
+    fi
+  done < <(cosmosdb_account_names "$sub" "$AZURE_RESOURCE_GROUP" "$COSMOS_FILTER")
+fi
+
+jq -n \
+  --argjson n "$normalized_ok" \
+  --argjson th "$throttle_ok" \
+  --argjson l "$latency_ok" \
+  '{normalized_ru_ok:$n, throttle_ok:$th, latency_ok:$l}' > "$OUT"
+echo "Wrote $OUT"
diff --git a/codebundles/azure-cosmosdb-utilization-health/runbook.robot b/codebundles/azure-cosmosdb-utilization-health/runbook.robot
new file mode 100644
index 00000000..2096d730
--- /dev/null
+++ b/codebundles/azure-cosmosdb-utilization-health/runbook.robot
@@ -0,0 +1,304 @@
+*** Settings ***
+Documentation    Evaluates Azure Cosmos DB utilization via normalized RU, total RU, throttling, latency, storage growth, and throughput sizing to support capacity planning.
+Metadata    Author    rw-codebundle-agent
+Metadata    Display Name    Azure Cosmos DB Utilization and Sizing Health
+Metadata    Supports    Azure CosmosDB Utilization Metrics Health
+Force Tags    Azure    CosmosDB    Utilization    Health
+
+Library    String
+Library    BuiltIn
+Library    RW.Core
+Library    RW.CLI
+Library    RW.platform
+
+Suite Setup    Suite Initialization
+
+
+*** Tasks ***
+Analyze Cosmos DB Normalized RU Consumption Trends for Account `${COSMOSDB_ACCOUNT_NAME}` in Resource Group `${AZURE_RESOURCE_GROUP}`
+    [Documentation]    Pulls Azure Monitor time series for normalized RU consumption to detect sustained pressure or rising trends versus the first half of the lookback window.
+    [Tags]    Azure    CosmosDB    Metrics    access:read-only    data:metrics
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=cosmosdb-normalized-ru-trends.sh
+    ...    env=${env}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...
cmd_override=./cosmosdb-normalized-ru-trends.sh
+    RW.Core.Add Pre To Report    ${result.stdout}
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat cosmosdb_normalized_ru_issues.json
+    ...    env=${env}
+    ...    timeout_seconds=60
+    ...    include_in_history=false
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Normalized RU consumption should remain below ${NORMALIZED_RU_THRESHOLD_PCT}% for most of the window without a sustained upward trend for account `${COSMOSDB_ACCOUNT_NAME}`.
+            ...    actual=Elevated or rising normalized RU consumption was detected for the scoped Cosmos DB account(s).
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+# Total RU: runs cosmosdb-total-ru-consumed.sh, then raises one issue per entry of cosmosdb_total_ru_issues.json.
+Analyze Cosmos DB Total Request Units Consumed for Account `${COSMOSDB_ACCOUNT_NAME}` in Resource Group `${AZURE_RESOURCE_GROUP}`
+    [Documentation]    Aggregates Total Request Units over the lookback window and flags sharp growth between the first and second half of the window.
+    [Tags]    Azure    CosmosDB    Metrics    access:read-only    data:metrics
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=cosmosdb-total-ru-consumed.sh
+    ...    env=${env}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=./cosmosdb-total-ru-consumed.sh
+    RW.Core.Add Pre To Report    ${result.stdout}
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat cosmosdb_total_ru_issues.json
+    ...    env=${env}
+    ...    timeout_seconds=60
+    ...    include_in_history=false
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Total RU consumption growth should stay within expected baselines for account `${COSMOSDB_ACCOUNT_NAME}`.
+            ...    actual=A sharp increase in daily TotalRequestUnits was detected across the analysis window.
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+# Throttling: runs cosmosdb-throttling-429.sh, then raises one issue per entry of cosmosdb_throttle_issues.json.
+Check Cosmos DB Throttling and HTTP 429 Rate for Account `${COSMOSDB_ACCOUNT_NAME}` in Resource Group `${AZURE_RESOURCE_GROUP}`
+    [Documentation]    Correlates TotalRequests with HTTP 429 status against provisioned capacity to flag undersizing or hot-key effects.
+    [Tags]    Azure    CosmosDB    Throttling    access:read-only    data:metrics
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=cosmosdb-throttling-429.sh
+    ...    env=${env}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=./cosmosdb-throttling-429.sh
+    RW.Core.Add Pre To Report    ${result.stdout}
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat cosmosdb_throttle_issues.json
+    ...    env=${env}
+    ...    timeout_seconds=60
+    ...    include_in_history=false
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=429 responses should stay below ${THROTTLE_EVENTS_THRESHOLD} in the lookback window for account `${COSMOSDB_ACCOUNT_NAME}`.
+            ...    actual=Throttling (HTTP 429) was observed in Azure Monitor TotalRequests metrics.
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+# Latency: runs cosmosdb-server-latency.sh, then raises one issue per entry of cosmosdb_latency_issues.json.
+Analyze Cosmos DB Server-side Latency for Account `${COSMOSDB_ACCOUNT_NAME}` in Resource Group `${AZURE_RESOURCE_GROUP}`
+    [Documentation]    Reviews ServerSideLatency averages for regressions that often precede saturation or hot partitions.
+    [Tags]    Azure    CosmosDB    Latency    access:read-only    data:metrics
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=cosmosdb-server-latency.sh
+    ...    env=${env}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=./cosmosdb-server-latency.sh
+    RW.Core.Add Pre To Report    ${result.stdout}
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat cosmosdb_latency_issues.json
+    ...    env=${env}
+    ...    timeout_seconds=60
+    ...    include_in_history=false
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Server-side latency should remain below ${SERVER_LATENCY_MS_THRESHOLD} ms (hourly average) for account `${COSMOSDB_ACCOUNT_NAME}`.
+            ...    actual=Elevated ServerSideLatency was observed in the lookback window.
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+# Storage: runs cosmosdb-storage-utilization.sh, then raises one issue per entry of cosmosdb_storage_issues.json.
+Analyze Cosmos DB Data and Index Storage Utilization for Account `${COSMOSDB_ACCOUNT_NAME}` in Resource Group `${AZURE_RESOURCE_GROUP}`
+    [Documentation]    Tracks DataUsage and IndexUsage to flag rapid expansion that can drive partition count and cost.
+    [Tags]    Azure    CosmosDB    Storage    access:read-only    data:metrics
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=cosmosdb-storage-utilization.sh
+    ...    env=${env}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=./cosmosdb-storage-utilization.sh
+    RW.Core.Add Pre To Report    ${result.stdout}
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat cosmosdb_storage_issues.json
+    ...    env=${env}
+    ...    timeout_seconds=60
+    ...    include_in_history=false
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Data and index storage growth should remain within expected bounds (below ~${STORAGE_GROWTH_PCT_THRESHOLD}% swing) for account `${COSMOSDB_ACCOUNT_NAME}`.
+            ...    actual=Rapid storage or index growth was detected across the window.
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+# Throughput sizing: runs cosmosdb-throughput-sizing.sh, then raises one issue per entry of cosmosdb_throughput_sizing_issues.json.
+Analyze Cosmos DB Provisioned Throughput vs Consumed Load for Account `${COSMOSDB_ACCOUNT_NAME}` in Resource Group `${AZURE_RESOURCE_GROUP}`
+    [Documentation]    Compares normalized RU with ProvisionedThroughput signals to highlight undersizing risk or sustained over-provisioning.
+    [Tags]    Azure    CosmosDB    Throughput    access:read-only    data:metrics
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=cosmosdb-throughput-sizing.sh
+    ...    env=${env}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=./cosmosdb-throughput-sizing.sh
+    RW.Core.Add Pre To Report    ${result.stdout}
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat cosmosdb_throughput_sizing_issues.json
+    ...    env=${env}
+    ...    timeout_seconds=60
+    ...    include_in_history=false
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Throughput should be right-sized: normalized RU below ${NORMALIZED_RU_THRESHOLD_PCT}% for healthy headroom, and not chronically under ~${UNDERUTILIZED_NORMALIZED_PCT}% while heavily provisioned.
+            ...    actual=A throughput sizing concern (ceiling risk or over-provisioning) was detected from metrics.
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+
+*** Keywords ***
+Suite Initialization
+    TRY    # the secret is optional: a failed import is logged as a warning and replaced with an empty value
+        ${azure_credentials}=    RW.Core.Import Secret
+        ...    azure_credentials
+        ...    type=string
+        ...    description=JSON with CLIENT_ID, TENANT_ID, CLIENT_SECRET, SUBSCRIPTION_ID for Azure CLI.
+        ...    pattern=\w*
+    EXCEPT
+        Log    azure_credentials secret not provided; relying on ambient Azure CLI login.    WARN
+        ${azure_credentials}=    Set Variable    ${EMPTY}
+    END
+    ${AZ_SUBSCRIPTION}=    RW.Core.Import User Variable    AZ_SUBSCRIPTION
+    ...    type=string
+    ...    description=Azure subscription ID (UUID).
+    ...    pattern=.*
+    ${AZURE_RESOURCE_GROUP}=    RW.Core.Import User Variable    AZURE_RESOURCE_GROUP
+    ...    type=string
+    ...    description=Resource group containing Cosmos DB account(s).
+    ...    pattern=.*
+    ${COSMOSDB_ACCOUNT_NAME}=    RW.Core.Import User Variable    COSMOSDB_ACCOUNT_NAME
+    ...    type=string
+    ...    description=Cosmos DB account name, or All for every account in the resource group.
+    ...    pattern=.*
+    ...    default=All
+    ${METRICS_LOOKBACK_DAYS}=    RW.Core.Import User Variable    METRICS_LOOKBACK_DAYS
+    ...    type=string
+    ...    description=Days of historical metrics for analysis.
+    ...    pattern=^\d+$
+    ...    default=14
+    ${NORMALIZED_RU_THRESHOLD_PCT}=    RW.Core.Import User Variable    NORMALIZED_RU_THRESHOLD_PCT
+    ...    type=string
+    ...    description=Normalized RU percentage above which to raise utilization issues.
+    ...    pattern=^\d+$
+    ...    default=80
+    ${THROTTLE_EVENTS_THRESHOLD}=    RW.Core.Import User Variable    THROTTLE_EVENTS_THRESHOLD
+    ...    type=string
+    ...    description=Minimum count of HTTP 429 requests in the window to flag throttling.
+    ...    pattern=^\d+$
+    ...    default=1
+    ${SERVER_LATENCY_MS_THRESHOLD}=    RW.Core.Import User Variable    SERVER_LATENCY_MS_THRESHOLD
+    ...    type=string
+    ...    description=Maximum acceptable hourly average ServerSideLatency in milliseconds.
+    ...    pattern=^\d+$
+    ...    default=100
+    ${STORAGE_GROWTH_PCT_THRESHOLD}=    RW.Core.Import User Variable    STORAGE_GROWTH_PCT_THRESHOLD
+    ...    type=string
+    ...    description=Percent growth from start to end of window that triggers storage/index expansion issues.
+    ...    pattern=^\d+$
+    ...    default=25
+    ${UNDERUTILIZED_NORMALIZED_PCT}=    RW.Core.Import User Variable    UNDERUTILIZED_NORMALIZED_PCT
+    ...    type=string
+    ...    description=Normalized RU level used with provisioned throughput to suggest over-provisioning.
+    ...    pattern=^\d+$
+    ...    default=15
+    ${RU_DAILY_GROWTH_RATIO}=    RW.Core.Import User Variable    RU_DAILY_GROWTH_RATIO
+    ...    type=string
+    ...    description=Ratio of later-window to earlier-window daily Total RU that indicates a spike.
+    ...    pattern=^[0-9.]+$
+    ...    default=1.5
+    ${AZURE_SUBSCRIPTION_NAME}=    RW.Core.Import User Variable    AZURE_SUBSCRIPTION_NAME
+    ...    type=string
+    ...    description=Friendly subscription name for reporting context.
+    ...    pattern=.*
+    ...    default=Azure Subscription
+    ${env}=    Create Dictionary    # environment handed to every script and CLI call in the tasks above
+    ...    AZ_SUBSCRIPTION=${AZ_SUBSCRIPTION}
+    ...    AZURE_SUBSCRIPTION_ID=${AZ_SUBSCRIPTION}    # same subscription value exported under both names
+    ...    AZURE_RESOURCE_GROUP=${AZURE_RESOURCE_GROUP}
+    ...    COSMOSDB_ACCOUNT_NAME=${COSMOSDB_ACCOUNT_NAME}
+    ...    METRICS_LOOKBACK_DAYS=${METRICS_LOOKBACK_DAYS}
+    ...    NORMALIZED_RU_THRESHOLD_PCT=${NORMALIZED_RU_THRESHOLD_PCT}
+    ...    THROTTLE_EVENTS_THRESHOLD=${THROTTLE_EVENTS_THRESHOLD}
+    ...    SERVER_LATENCY_MS_THRESHOLD=${SERVER_LATENCY_MS_THRESHOLD}
+    ...    STORAGE_GROWTH_PCT_THRESHOLD=${STORAGE_GROWTH_PCT_THRESHOLD}
+    ...    UNDERUTILIZED_NORMALIZED_PCT=${UNDERUTILIZED_NORMALIZED_PCT}
+    ...
RU_DAILY_GROWTH_RATIO=${RU_DAILY_GROWTH_RATIO}
+    Set Suite Variable    ${env}    ${env}
+    RW.CLI.Run Cli
+    ...    cmd=az account set --subscription ${AZ_SUBSCRIPTION}
+    ...    include_in_history=false
diff --git a/codebundles/azure-cosmosdb-utilization-health/sli.robot b/codebundles/azure-cosmosdb-utilization-health/sli.robot
new file mode 100644
index 00000000..7feca94f
--- /dev/null
+++ b/codebundles/azure-cosmosdb-utilization-health/sli.robot
@@ -0,0 +1,116 @@
+*** Settings ***
+Documentation    Measures Cosmos DB utilization health using normalized RU, HTTP 429 rate, and server-side latency. Produces a value between 0 (failing) and 1 (fully passing).
+Metadata    Author    rw-codebundle-agent
+Metadata    Display Name    Azure Cosmos DB Utilization SLI
+Metadata    Supports    Azure CosmosDB Utilization SLI
+Force Tags    Azure    CosmosDB    SLI
+
+Library    BuiltIn
+Library    Collections
+Library    RW.Core
+Library    RW.CLI
+Library    RW.platform
+
+Suite Setup    Suite Initialization
+
+
+*** Tasks ***
+Collect Cosmos DB SLI Snapshot for Account `${COSMOSDB_ACCOUNT_NAME}`
+    [Documentation]    Runs a short-window Azure Monitor query set for normalized RU, 429 totals, and server latency.
+    [Tags]    Azure    CosmosDB    access:read-only    data:metrics
+    # The snapshot script makes several sequential az CLI calls per account, so give it more than 30 s.
+    ${snap}=    RW.CLI.Run Bash File
+    ...    bash_file=cosmosdb_sli_snapshot.sh
+    ...    env=${env}
+    ...    timeout_seconds=120
+    ...    include_in_history=false
+    ...    cmd_override=./cosmosdb_sli_snapshot.sh
+
+Publish Cosmos DB Utilization Health Score
+    [Documentation]    Averages binary dimension scores into the primary 0-1 health metric.
+    [Tags]    Azure    CosmosDB    access:read-only    data:metrics
+    ${raw}=    RW.CLI.Run Cli
+    ...    cmd=cat cosmosdb_sli_output.json
+    ...    env=${env}
+    ...    timeout_seconds=30
+    ...    include_in_history=false
+    # RETURN is only valid inside user keywords, so the two outcomes are branches of one TRY:
+    # EXCEPT publishes 0 for a missing or unparseable snapshot, ELSE publishes the computed score.
+    TRY
+        ${data}=    Evaluate    json.loads(r'''${raw.stdout}''')    json
+    EXCEPT
+        Log    SLI JSON parse failed; emitting zero health score.    WARN
+        RW.Core.Push Metric    0
+    ELSE
+        ${s1}=    Get From Dictionary    ${data}    normalized_ru_ok
+        ${s2}=    Get From Dictionary    ${data}    throttle_ok
+        ${s3}=    Get From Dictionary    ${data}    latency_ok
+        RW.Core.Push Metric    ${s1}    sub_name=normalized_ru
+        RW.Core.Push Metric    ${s2}    sub_name=throttle_429
+        RW.Core.Push Metric    ${s3}    sub_name=server_latency
+        ${health_score}=    Evaluate    (${s1} + ${s2} + ${s3}) / 3
+        ${health_score}=    Convert To Number    ${health_score}    2
+        RW.Core.Add to Report    Health Score: ${health_score}
+        RW.Core.Push Metric    ${health_score}
+    END
+
+
+*** Keywords ***
+Suite Initialization
+    [Documentation]    Imports the optional azure_credentials secret and the SLI user variables, stores the env dictionary as a suite variable, and runs az account set for the configured subscription.
+    TRY
+        ${azure_credentials}=    RW.Core.Import Secret
+        ...    azure_credentials
+        ...    type=string
+        ...    description=JSON with CLIENT_ID, TENANT_ID, CLIENT_SECRET, SUBSCRIPTION_ID for Azure CLI.
+        ...    pattern=\w*
+    EXCEPT
+        Log    azure_credentials secret not provided; relying on ambient Azure CLI login.    WARN
+        ${azure_credentials}=    Set Variable    ${EMPTY}
+    END
+    ${AZ_SUBSCRIPTION}=    RW.Core.Import User Variable    AZ_SUBSCRIPTION
+    ...    type=string
+    ...    description=Azure subscription ID (UUID).
+    ...    pattern=.*
+    ${AZURE_RESOURCE_GROUP}=    RW.Core.Import User Variable    AZURE_RESOURCE_GROUP
+    ...    type=string
+    ...    description=Resource group containing Cosmos DB account(s).
+    ...    pattern=.*
+    ${COSMOSDB_ACCOUNT_NAME}=    RW.Core.Import User Variable    COSMOSDB_ACCOUNT_NAME
+    ...    type=string
+    ...    description=Cosmos DB account name, or All for every account in the resource group.
+    ...    pattern=.*
+    ...    default=All
+    ${NORMALIZED_RU_THRESHOLD_PCT}=    RW.Core.Import User Variable    NORMALIZED_RU_THRESHOLD_PCT
+    ...    type=string
+    ...    description=Normalized RU percentage threshold for SLI.
+    ...    pattern=^\d+$
+    ...    default=80
+    ${THROTTLE_EVENTS_THRESHOLD}=    RW.Core.Import User Variable    THROTTLE_EVENTS_THRESHOLD
+    ...    type=string
+    ...    description=429 count threshold for SLI failure.
+    ...    pattern=^\d+$
+    ...    default=1
+    ${SERVER_LATENCY_MS_THRESHOLD}=    RW.Core.Import User Variable    SERVER_LATENCY_MS_THRESHOLD
+    ...    type=string
+    ...    description=ServerSideLatency ms threshold for SLI failure.
+    ...    pattern=^\d+$
+    ...    default=100
+    ${SLI_METRICS_OFFSET}=    RW.Core.Import User Variable    SLI_METRICS_OFFSET
+    ...    type=string
+    ...    description=Short lookback for SLI queries (e.g. 2d).
+    ...    pattern=.*
+    ...    default=2d
+    ${env}=    Create Dictionary
+    ...    AZ_SUBSCRIPTION=${AZ_SUBSCRIPTION}
+    ...    AZURE_SUBSCRIPTION_ID=${AZ_SUBSCRIPTION}
+    ...    AZURE_RESOURCE_GROUP=${AZURE_RESOURCE_GROUP}
+    ...    COSMOSDB_ACCOUNT_NAME=${COSMOSDB_ACCOUNT_NAME}
+    ...    NORMALIZED_RU_THRESHOLD_PCT=${NORMALIZED_RU_THRESHOLD_PCT}
+    ...    THROTTLE_EVENTS_THRESHOLD=${THROTTLE_EVENTS_THRESHOLD}
+    ...    SERVER_LATENCY_MS_THRESHOLD=${SERVER_LATENCY_MS_THRESHOLD}
+    ...    SLI_METRICS_OFFSET=${SLI_METRICS_OFFSET}
+    Set Suite Variable    ${env}    ${env}
+    RW.CLI.Run Cli
+    ...    cmd=az account set --subscription ${AZ_SUBSCRIPTION}
+    ...    include_in_history=false