diff --git a/codebundles/aws-sqs-dlq-health/.runwhen/generation-rules/aws-sqs-dlq-health.yaml b/codebundles/aws-sqs-dlq-health/.runwhen/generation-rules/aws-sqs-dlq-health.yaml new file mode 100644 index 00000000..bad81f5c --- /dev/null +++ b/codebundles/aws-sqs-dlq-health/.runwhen/generation-rules/aws-sqs-dlq-health.yaml @@ -0,0 +1,21 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + platform: aws + generationRules: + - resourceTypes: + - aws_sqs_queues + matchRules: + - type: pattern + pattern: ".+" + properties: ["name"] + mode: substring + slxs: + - baseName: aws-sqs-dlq-health + qualifiers: ["account", "region", "resource"] + baseTemplateName: aws-sqs-dlq-health + levelOfDetail: basic + outputItems: + - type: slx + - type: runbook + templateName: aws-sqs-dlq-health-taskset.yaml diff --git a/codebundles/aws-sqs-dlq-health/.runwhen/templates/aws-sqs-dlq-health-slx.yaml b/codebundles/aws-sqs-dlq-health/.runwhen/templates/aws-sqs-dlq-health-slx.yaml new file mode 100644 index 00000000..c3dd9d2b --- /dev/null +++ b/codebundles/aws-sqs-dlq-health/.runwhen/templates/aws-sqs-dlq-health-slx.yaml @@ -0,0 +1,31 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/aws/lambda.png + alias: "{{match_resource.account_name}} SQS DLQ Health — {{match_resource.resource.region}}" + asMeasuredBy: "DLQ message depth, sampled payloads, Lambda log errors, and source queue CloudWatch metrics for matched SQS queues." + configProvided: + - name: SLX_PLACEHOLDER + value: SLX_PLACEHOLDER + owners: + - {{workspace.owner_email}} + statement: Dead-letter queues should stay near zero; sustained DLQ depth indicates consumer or message-level failures that need investigation. 
+ additionalContext: + {% include "aws-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name | replace(":", "_") }}" + tags: + {% include "aws-tags.yaml" ignore missing %} + - name: cloud + value: aws + - name: service + value: sqs + - name: scope + value: resource + - name: access + value: read-only diff --git a/codebundles/aws-sqs-dlq-health/.runwhen/templates/aws-sqs-dlq-health-taskset.yaml b/codebundles/aws-sqs-dlq-health/.runwhen/templates/aws-sqs-dlq-health-taskset.yaml new file mode 100644 index 00000000..d43860ae --- /dev/null +++ b/codebundles/aws-sqs-dlq-health/.runwhen/templates/aws-sqs-dlq-health-taskset.yaml @@ -0,0 +1,42 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Monitors SQS DLQs, samples messages, correlates Lambda logs, and pulls source queue metrics for {{match_resource.account_name}} in {{match_resource.resource.region}}. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/aws-sqs-dlq-health/runbook.robot + configProvided: + - name: AWS_REGION + value: "{{match_resource.resource.region}}" + - name: AWS_ACCOUNT_NAME + value: "{{match_resource.account_name}}" + - name: SQS_QUEUE_URL + value: "{{ match_resource.resource.queue_url | default('') }}" + - name: SQS_QUEUE_URLS + value: "{{ match_resource.resource.queue_url | default('') }}" + - name: SQS_QUEUE_NAME_PREFIX + value: "" + - name: DEAD_LETTER_MESSAGE_THRESHOLD + value: "0" + - name: CLOUDWATCH_LOG_LOOKBACK_MINUTES + value: "30" + - name: MAX_DLQ_MESSAGES_TO_SAMPLE + value: "5" + secretsProvided: + {% include "aws-auth.yaml" ignore missing %} diff --git a/codebundles/aws-sqs-dlq-health/README.md b/codebundles/aws-sqs-dlq-health/README.md new file mode 100644 index 00000000..903f1552 --- /dev/null +++ b/codebundles/aws-sqs-dlq-health/README.md @@ -0,0 +1,54 @@ +# AWS SQS Dead Letter Queue Health and Log Correlation + +This CodeBundle monitors Amazon SQS dead-letter queues (DLQs) tied to source queues via `RedrivePolicy`, raises issues when messages accumulate past a threshold, samples recent DLQ payloads for diagnostics, correlates Lambda event source consumers to CloudWatch Logs, and pulls source-queue CloudWatch metrics for backlog context. + +## Overview + +- **DLQ depth and redrive**: Discovers source queues (explicit URLs or `list-queues` with an optional prefix), resolves each DLQ from `RedrivePolicy`, dedupes checks by DLQ ARN, and compares `ApproximateNumberOfMessages` to `DEAD_LETTER_MESSAGE_THRESHOLD`. 
+- **DLQ sampling**: Receives up to `MAX_DLQ_MESSAGES_TO_SAMPLE` messages per run with a short visibility timeout, extracts attributes and body snippets, then resets visibility to zero so messages remain available (delete is not used by default). +- **Lambda logs**: Lists Lambda event source mappings for each source queue ARN and searches `/aws/lambda/` log groups for error patterns in the lookback window. If the DLQ has traffic but there is no Lambda mapping, the task reports that non-Lambda consumers (ECS, EC2, etc.) need manual correlation. +- **Source metrics**: Reads `AWS/SQS` metrics (`ApproximateAgeOfOldestMessage`, `NumberOfMessagesSent`, `NumberOfMessagesDeleted`) and flags sustained high oldest-message age. + +SQS does not expose Azure-style `DeadLetterReason` on the queue API; root cause typically comes from message bodies (for example Lambda failure payloads), consumer logs, or application attributes. + +## Configuration + +### Required Variables + +- `AWS_REGION`: AWS region containing the queues. + +### Optional Variables + +- `AWS_ACCOUNT_NAME`: Account display label for reports (default: `Unknown`). +- `SQS_QUEUE_URL`: Optional single source queue URL (often supplied by discovery as a qualifier alongside `SQS_QUEUE_URLS`). +- `SQS_QUEUE_URLS`: Comma-separated source queue URLs; when empty, discovery uses `aws sqs list-queues` with optional `SQS_QUEUE_NAME_PREFIX`. +- `SQS_QUEUE_NAME_PREFIX`: Prefix passed to `list-queues` when no explicit URLs are set (default: empty). +- `DEAD_LETTER_MESSAGE_THRESHOLD`: Open a DLQ depth issue when approximate message count is **greater than** this integer (default: `0`, meaning any message in the DLQ opens an issue). +- `CLOUDWATCH_LOG_LOOKBACK_MINUTES`: Window for Lambda log search and metric alignment (default: `30`). +- `MAX_DLQ_MESSAGES_TO_SAMPLE`: Cap on DLQ messages to receive per run for diagnostics (default: `5`). 
+ +### Secrets + +- `aws_credentials`: Standard RunWhen AWS credentials (`aws-auth` block): access keys, IRSA, or assume-role via workspace configuration. + +## Tasks Overview + +### Check Dead Letter Queue Depth and Redrive Configuration + +Evaluates DLQ depth against the threshold and surfaces redrive metadata (`maxReceiveCount`). Emits issues when depth is above the configured limit. + +### Sample Recent Dead Letter Messages for Diagnostics + +Pulls a bounded sample per run, returns visibility to zero after inspection, and emits structured issues containing message metadata and truncated bodies (including Lambda async failure shapes when present). + +### Correlate DLQ to Lambda Consumer CloudWatch Logs + +Finds Lambda functions subscribed to the source queue and scans recent log events for error-oriented patterns. If messages are in the DLQ but the consumer is not Lambda, the task explains the limitation and points operators to other platforms. + +### Collect Source Queue CloudWatch Metrics for Context + +Emits CloudWatch metric datapoints to the report and opens issues when `ApproximateAgeOfOldestMessage` stays above 300 seconds in the window (configurable only in script today; threshold is documented in task output). + +## IAM + +Typical read-only permissions: `sqs:GetQueueAttributes`, `sqs:ListQueues`, `sqs:ReceiveMessage` (DLQ), `sqs:GetQueueUrl`, `lambda:ListEventSourceMappings`, `logs:FilterLogEvents`, `logs:DescribeLogGroups`, `cloudwatch:GetMetricStatistics`. diff --git a/codebundles/aws-sqs-dlq-health/auth.sh b/codebundles/aws-sqs-dlq-health/auth.sh new file mode 100755 index 00000000..f6c59133 --- /dev/null +++ b/codebundles/aws-sqs-dlq-health/auth.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# AWS auth helper aligned with runwhen-local aws-auth template and runtime aws_utils. +# Supports: explicit keys, assume role, IRSA, pod identity, default credential chain. 
+# Credentials are injected by the platform from the aws-auth block; this script +# optionally assumes a role when AWS_ROLE_ARN is set with base credentials, then +# verifies authentication. +# +# Usage: source this file, then call `auth` +# source "$(dirname "$0")/auth.sh" +# auth + +_aws_verify() { + aws sts get-caller-identity --output json >/dev/null 2>&1 +} + +auth() { + # IRSA or EKS Pod Identity: no access keys required; runtime sets AWS_WEB_IDENTITY_TOKEN_FILE + # or AWS_CONTAINER_CREDENTIALS_FULL_URI. Just verify. + if [[ -n "${AWS_WEB_IDENTITY_TOKEN_FILE:-}" ]] || [[ -n "${AWS_CONTAINER_CREDENTIALS_FULL_URI:-}" ]]; then + if _aws_verify; then + return 0 + fi + echo "AWS identity (IRSA/pod identity) present but get-caller-identity failed." + exit 1 + fi + + # Explicit credentials: if AWS_ROLE_ARN is set with base creds, assume the role + if [[ -n "${AWS_ACCESS_KEY_ID:-}" && -n "${AWS_SECRET_ACCESS_KEY:-}" ]]; then + if [[ -n "${AWS_ROLE_ARN:-}" ]]; then + sts_output=$(aws sts assume-role --role-arn "$AWS_ROLE_ARN" --role-session-name "AssumeRoleSession" --output json) + AWS_ACCESS_KEY_ID=$(echo "$sts_output" | jq -r '.Credentials.AccessKeyId') + AWS_SECRET_ACCESS_KEY=$(echo "$sts_output" | jq -r '.Credentials.SecretAccessKey') + AWS_SESSION_TOKEN=$(echo "$sts_output" | jq -r '.Credentials.SessionToken') + export AWS_ACCESS_KEY_ID + export AWS_SECRET_ACCESS_KEY + export AWS_SESSION_TOKEN + fi + if _aws_verify; then + return 0 + fi + echo "AWS credentials set but get-caller-identity failed." + exit 1 + fi + + # Default credential chain (env, profile, instance metadata, etc.) + if _aws_verify; then + return 0 + fi + + echo "AWS credentials not configured. Set credentials via the platform aws-auth block (e.g. aws:access_key@cli, aws:irsa@cli, aws:default@cli)." 
+ exit 1 +} diff --git a/codebundles/aws-sqs-dlq-health/runbook.robot b/codebundles/aws-sqs-dlq-health/runbook.robot new file mode 100644 index 00000000..d8b2f698 --- /dev/null +++ b/codebundles/aws-sqs-dlq-health/runbook.robot @@ -0,0 +1,246 @@ +*** Settings *** +Documentation Monitors Amazon SQS dead-letter queues, raises issues when messages accumulate, samples DLQ messages for diagnostics, and correlates Lambda consumer CloudWatch logs for failures. +Metadata Author rw-codebundle-agent +Metadata Display Name AWS SQS Dead Letter Queue Health and Log Correlation +Metadata Supports AWS SQS DLQ CloudWatch Lambda +Force Tags AWS SQS DLQ CloudWatch Lambda + +Library String +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform + +Suite Setup Suite Initialization + + +*** Tasks *** +Check Dead Letter Queue Depth and Redrive Configuration for Scope `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + [Documentation] Lists source queues, resolves RedrivePolicy to DLQs (deduped by DLQ ARN), compares ApproximateNumberOfMessages to DEAD_LETTER_MESSAGE_THRESHOLD, and emits structured issues when depth is exceeded. + [Tags] AWS SQS DLQ Redrive Metrics access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=sqs_dlq_depth_and_redrive.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=./sqs_dlq_depth_and_redrive.sh + + RW.Core.Add Pre To Report ${result.stdout} + + ${issues}= RW.CLI.Run Cli + ... cmd=cat dlq_depth_issues.json 2>/dev/null || echo [] + ... env=${env} + ... timeout_seconds=60 + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for DLQ depth task, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... 
expected=DLQ approximate message count should stay at or below DEAD_LETTER_MESSAGE_THRESHOLD when the consumer is healthy + ... actual=DLQ depth or configuration issue detected for the scoped queues + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + +Sample Recent Dead Letter Messages for Diagnostics in Scope `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + [Documentation] Receives a bounded sample of DLQ messages with a short visibility timeout, returns visibility to zero, and records message attributes and body snippets for failure analysis. + [Tags] AWS SQS DLQ Messages Diagnostics access:read-only data:logs-bulk + + ${result}= RW.CLI.Run Bash File + ... bash_file=sqs_dlq_sample_messages.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=./sqs_dlq_sample_messages.sh + + RW.Core.Add Pre To Report ${result.stdout} + + ${issues}= RW.CLI.Run Cli + ... cmd=cat dlq_sample_issues.json 2>/dev/null || echo [] + ... env=${env} + ... timeout_seconds=60 + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for DLQ sample task, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=DLQ messages should be rare when processing succeeds; samples should only appear during active incidents + ... actual=Sampled DLQ payloads and attributes are available for triage + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... 
next_steps=${issue['next_steps']} + END + END + +Correlate DLQ to Lambda Consumer CloudWatch Logs for Scope `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + [Documentation] For Lambda event source mappings on source queues, searches CloudWatch Logs in the lookback window for ERROR, task timeouts, and related failures. Degrades gracefully when the consumer is not Lambda. + [Tags] AWS SQS DLQ Lambda CloudWatch Logs access:read-only data:logs-regexp + + ${result}= RW.CLI.Run Bash File + ... bash_file=sqs_dlq_lambda_consumer_logs.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=./sqs_dlq_lambda_consumer_logs.sh + + RW.Core.Add Pre To Report ${result.stdout} + + ${issues}= RW.CLI.Run Cli + ... cmd=cat dlq_lambda_logs_issues.json 2>/dev/null || echo [] + ... env=${env} + ... timeout_seconds=60 + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for Lambda logs task, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Lambda consumers should process without repeated errors during the lookback window when the queue is healthy + ... actual=Lambda log correlation found errors or a non-Lambda consumer was detected while DLQ had messages + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + +Collect Source Queue CloudWatch Metrics for Context in Scope `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + [Documentation] Pulls AWS/SQS CloudWatch metrics for source queues to distinguish backlog growth from poison-message patterns and raises informational issues when oldest-message age is elevated. 
+ [Tags] AWS SQS CloudWatch Metrics access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=sqs_source_queue_metrics.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=./sqs_source_queue_metrics.sh + + RW.Core.Add Pre To Report ${result.stdout} + + ${issues}= RW.CLI.Run Cli + ... cmd=cat sqs_source_metrics_issues.json 2>/dev/null || echo [] + ... env=${env} + ... timeout_seconds=60 + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for source metrics task, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Source queue ApproximateAgeOfOldestMessage should remain low when consumers keep up with traffic + ... actual=Elevated oldest-message age or related backlog signal detected in CloudWatch metrics + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + +*** Keywords *** +Suite Initialization + ${aws_credentials}= RW.Core.Import Secret + ... aws_credentials + ... type=string + ... description=AWS credentials from the workspace (from aws-auth block; e.g. aws:access_key@cli, aws:irsa@cli). + ... pattern=\w* + + ${AWS_REGION}= RW.Core.Import User Variable AWS_REGION + ... type=string + ... description=AWS region containing the queues + ... pattern=\w* + ${AWS_ACCOUNT_NAME}= RW.Core.Import User Variable AWS_ACCOUNT_NAME + ... type=string + ... description=Account display name for reports + ... pattern=.* + ... default=Unknown + ${SQS_QUEUE_URL}= RW.Core.Import User Variable SQS_QUEUE_URL + ... type=string + ... description=Optional single source queue URL (used with discovery qualifiers) + ... pattern=.* + ... 
default= + ${SQS_QUEUE_URLS}= RW.Core.Import User Variable SQS_QUEUE_URLS + ... type=string + ... description=Comma-separated source queue URLs; empty uses discovery with optional prefix + ... pattern=.* + ... default= + ${SQS_QUEUE_NAME_PREFIX}= RW.Core.Import User Variable SQS_QUEUE_NAME_PREFIX + ... type=string + ... description=Optional prefix filter for aws sqs list-queues discovery + ... pattern=.* + ... default= + ${DEAD_LETTER_MESSAGE_THRESHOLD}= RW.Core.Import User Variable DEAD_LETTER_MESSAGE_THRESHOLD + ... type=string + ... description=Open an issue when DLQ approximate message count exceeds this integer + ... pattern=\d+ + ... default=0 + ${CLOUDWATCH_LOG_LOOKBACK_MINUTES}= RW.Core.Import User Variable CLOUDWATCH_LOG_LOOKBACK_MINUTES + ... type=string + ... description=Lookback window for Lambda log search and metric alignment + ... pattern=\d+ + ... default=30 + ${MAX_DLQ_MESSAGES_TO_SAMPLE}= RW.Core.Import User Variable MAX_DLQ_MESSAGES_TO_SAMPLE + ... type=string + ... description=Maximum DLQ messages to receive per run for diagnostics + ... pattern=\d+ + ... default=5 + + Set Suite Variable ${AWS_REGION} ${AWS_REGION} + Set Suite Variable ${AWS_ACCOUNT_NAME} ${AWS_ACCOUNT_NAME} + Set Suite Variable ${SQS_QUEUE_URL} ${SQS_QUEUE_URL} + Set Suite Variable ${SQS_QUEUE_URLS} ${SQS_QUEUE_URLS} + Set Suite Variable ${SQS_QUEUE_NAME_PREFIX} ${SQS_QUEUE_NAME_PREFIX} + Set Suite Variable ${DEAD_LETTER_MESSAGE_THRESHOLD} ${DEAD_LETTER_MESSAGE_THRESHOLD} + Set Suite Variable ${CLOUDWATCH_LOG_LOOKBACK_MINUTES} ${CLOUDWATCH_LOG_LOOKBACK_MINUTES} + Set Suite Variable ${MAX_DLQ_MESSAGES_TO_SAMPLE} ${MAX_DLQ_MESSAGES_TO_SAMPLE} + Set Suite Variable ${aws_credentials} ${aws_credentials} + + ${env}= Create Dictionary + ... AWS_REGION=${AWS_REGION} + ... AWS_ACCOUNT_NAME=${AWS_ACCOUNT_NAME} + ... SQS_QUEUE_URL=${SQS_QUEUE_URL} + ... SQS_QUEUE_URLS=${SQS_QUEUE_URLS} + ... SQS_QUEUE_NAME_PREFIX=${SQS_QUEUE_NAME_PREFIX} + ... 
DEAD_LETTER_MESSAGE_THRESHOLD=${DEAD_LETTER_MESSAGE_THRESHOLD} + ... CLOUDWATCH_LOG_LOOKBACK_MINUTES=${CLOUDWATCH_LOG_LOOKBACK_MINUTES} + ... MAX_DLQ_MESSAGES_TO_SAMPLE=${MAX_DLQ_MESSAGES_TO_SAMPLE} + Set Suite Variable ${env} ${env} diff --git a/codebundles/aws-sqs-dlq-health/sqs_dlq_common.sh b/codebundles/aws-sqs-dlq-health/sqs_dlq_common.sh new file mode 100755 index 00000000..ce74b8da --- /dev/null +++ b/codebundles/aws-sqs-dlq-health/sqs_dlq_common.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Shared helpers for aws-sqs-dlq-health (sourced by task scripts). + +# Collect source queue URLs: explicit SQS_QUEUE_URL / SQS_QUEUE_URLS or list-queues with optional prefix. +# Prints one URL per line. Returns non-zero if list-queues fails when discovery is used. +rw_sqs_collect_source_urls() { + local merged=() + if [[ -n "${SQS_QUEUE_URL:-}" ]]; then + merged+=("${SQS_QUEUE_URL}") + fi + if [[ -n "${SQS_QUEUE_URLS:-}" ]]; then + local IFS_orig="$IFS" + IFS=',' read -ra PARTS <<< "${SQS_QUEUE_URLS}" + IFS="$IFS_orig" + for p in "${PARTS[@]}"; do + p=$(echo "$p" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + [[ -z "$p" ]] && continue + merged+=("$p") + done + fi + if [[ ${#merged[@]} -gt 0 ]]; then + printf '%s\n' "${merged[@]}" + return 0 + fi + local prefix="${SQS_QUEUE_NAME_PREFIX:-}" + local raw + if ! raw=$(aws sqs list-queues --region "$AWS_REGION" ${prefix:+--queue-name-prefix "$prefix"} --output json 2>/dev/null); then + return 1 + fi + echo "$raw" | jq -r '.QueueUrls[]? // empty' +} + +# From a source queue URL, print "dlq_arn|dlq_url" or nothing. Uses RedrivePolicy on the source queue. 
+rw_sqs_resolve_dlq_url() { + local qurl="$1" + local attrs + attrs=$(aws sqs get-queue-attributes --queue-url "$qurl" --attribute-names RedrivePolicy --output json 2>/dev/null || echo '{}') + local policy + policy=$(echo "$attrs" | jq -r '.Attributes.RedrivePolicy // empty') + [[ -z "$policy" || "$policy" == "null" ]] && return 1 + local dlq_arn + dlq_arn=$(echo "$policy" | jq -r '.deadLetterTargetArn // empty') + [[ -z "$dlq_arn" ]] && return 1 + local dlq_name + dlq_name=$(echo "$dlq_arn" | awk -F: '{print $NF}') + local dlq_url + dlq_url=$(aws sqs get-queue-url --queue-name "$dlq_name" --region "$AWS_REGION" --output json 2>/dev/null | jq -r '.QueueUrl // empty') + [[ -z "$dlq_url" ]] && return 1 + printf '%s|%s\n' "$dlq_arn" "$dlq_url" +} + +# Approximate message count for a queue URL. +rw_sqs_queue_depth() { + local qurl="$1" + aws sqs get-queue-attributes --queue-url "$qurl" --attribute-names ApproximateNumberOfMessages --output json 2>/dev/null \ + | jq -r '.Attributes.ApproximateNumberOfMessages // "0"' +} + +# Queue name segment from an SQS HTTPS URL (last path component, URL-decoded for .fifo). +rw_sqs_queue_name_from_url() { + local qurl="$1" + echo "$qurl" | awk -F/ '{print $NF}' | sed 's/%20/ /g' +} diff --git a/codebundles/aws-sqs-dlq-health/sqs_dlq_depth_and_redrive.sh b/codebundles/aws-sqs-dlq-health/sqs_dlq_depth_and_redrive.sh new file mode 100755 index 00000000..8017e4d9 --- /dev/null +++ b/codebundles/aws-sqs-dlq-health/sqs_dlq_depth_and_redrive.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# Lists source queues, resolves RedrivePolicy DLQs, dedupes by DLQ ARN, and opens issues when DLQ depth exceeds DEAD_LETTER_MESSAGE_THRESHOLD. +# Writes JSON issues to dlq_depth_issues.json (jq). 
+ +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=/dev/null +source "${SCRIPT_DIR}/auth.sh" +# shellcheck source=/dev/null +source "${SCRIPT_DIR}/sqs_dlq_common.sh" + +auth + +: "${AWS_REGION:?Must set AWS_REGION}" + +OUTPUT_FILE="dlq_depth_issues.json" +THRESHOLD="${DEAD_LETTER_MESSAGE_THRESHOLD:-0}" +issues_json='[]' + +echo "=== SQS DLQ depth and redrive (region=${AWS_REGION}, threshold=${THRESHOLD}) ===" + +SOURCE_URLS=() +if [[ -n "${SQS_QUEUE_URL:-}" || -n "${SQS_QUEUE_URLS:-}" ]]; then + while IFS= read -r line; do + [[ -n "$line" ]] && SOURCE_URLS+=("$line") + done < <(rw_sqs_collect_source_urls) +else + prefix="${SQS_QUEUE_NAME_PREFIX:-}" + if ! raw=$(aws sqs list-queues --region "$AWS_REGION" ${prefix:+--queue-name-prefix "$prefix"} --output json 2>/dev/null); then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Cannot list SQS queues in \`${AWS_REGION}\`" \ + --arg details "aws sqs list-queues failed. Check IAM (sqs:ListQueues) and region." \ + --argjson severity 4 \ + --arg next_steps "Verify AWS credentials, region, and sqs:ListQueues permission." \ + '. += [{ + "title": $title, + "details": $details, + "severity": $severity, + "next_steps": $next_steps + }]') + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 + fi + while IFS= read -r line; do + [[ -n "$line" ]] && SOURCE_URLS+=("$line") + done < <(echo "$raw" | jq -r '.QueueUrls[]? // empty') +fi + +if [[ ${#SOURCE_URLS[@]} -eq 0 ]]; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "No source SQS queues to evaluate" \ + --arg details "Set SQS_QUEUE_URL, SQS_QUEUE_URLS, or use discovery with SQS_QUEUE_NAME_PREFIX so at least one queue is in scope." \ + --argjson severity 3 \ + --arg next_steps "Configure SQS_QUEUE_URLS or adjust SQS_QUEUE_NAME_PREFIX / discovery rules." \ + '. 
+= [{ + "title": $title, + "details": $details, + "severity": $severity, + "next_steps": $next_steps + }]') + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 +fi + +declare -A SEEN_DLQ + +for src in "${SOURCE_URLS[@]}"; do + [[ -z "$src" ]] && continue + echo "--- Source queue: ${src}" + attrs=$(aws sqs get-queue-attributes --queue-url "$src" --attribute-names QueueArn,RedrivePolicy --output json 2>/dev/null || echo '{}') + qarn=$(echo "$attrs" | jq -r '.Attributes.QueueArn // empty') + policy=$(echo "$attrs" | jq -r '.Attributes.RedrivePolicy // empty') + if [[ -z "$policy" || "$policy" == "null" ]]; then + echo "No RedrivePolicy on source queue (no DLQ configured via SQS redrive)." + continue + fi + resolved=$(rw_sqs_resolve_dlq_url "$src" || true) + if [[ -z "$resolved" ]]; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Cannot resolve DLQ for source queue" \ + --arg details "Source: ${src}. RedrivePolicy present but get-queue-url failed (wrong account/region or IAM sqs:GetQueueUrl)." \ + --argjson severity 3 \ + --arg next_steps "Verify DLQ exists in this account/region and IAM allows sqs:GetQueueUrl on the DLQ." \ + '. += [{ + "title": $title, + "details": $details, + "severity": $severity, + "next_steps": $next_steps + }]') + continue + fi + dlq_arn="${resolved%%|*}" + dlq_url="${resolved##*|}" + if [[ -n "${SEEN_DLQ[$dlq_arn]:-}" ]]; then + echo "Skipping duplicate DLQ ARN ${dlq_arn}" + continue + fi + SEEN_DLQ[$dlq_arn]=1 + + depth=$(rw_sqs_queue_depth "$dlq_url") + echo "DLQ ${dlq_url} depth=${depth}" + + if [[ "$depth" =~ ^[0-9]+$ ]] && [[ "$depth" -gt "$THRESHOLD" ]]; then + sev=2 + if [[ "$depth" -gt 1000 ]]; then + sev=3 + fi + max_recv=$(echo "$policy" | jq -r '.maxReceiveCount // "unknown"') + issues_json=$(echo "$issues_json" | jq \ + --arg title "DLQ message depth exceeds threshold for \`${dlq_arn}\`" \ + --arg details "ApproximateNumberOfMessages on DLQ: ${depth}. Threshold (DEAD_LETTER_MESSAGE_THRESHOLD): ${THRESHOLD}. 
Source queue ARN: ${qarn}. Redrive maxReceiveCount: ${max_recv}." \ + --argjson severity "$sev" \ + --arg next_steps "Inspect consumer failures (Lambda logs, ECS tasks, or workers). Sample messages, fix poison payloads, then redrive or purge as appropriate. See DLQ sample and Lambda logs tasks in this bundle." \ + '. += [{ + "title": $title, + "details": $details, + "severity": $severity, + "next_steps": $next_steps + }]') + fi +done + +echo "$issues_json" > "$OUTPUT_FILE" +echo "Wrote ${OUTPUT_FILE}" +exit 0 diff --git a/codebundles/aws-sqs-dlq-health/sqs_dlq_lambda_consumer_logs.sh b/codebundles/aws-sqs-dlq-health/sqs_dlq_lambda_consumer_logs.sh new file mode 100755 index 00000000..d481ffea --- /dev/null +++ b/codebundles/aws-sqs-dlq-health/sqs_dlq_lambda_consumer_logs.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash +# Resolves Lambda event source mappings for source queues and searches CloudWatch Logs for errors in the lookback window. +# Writes JSON issues to dlq_lambda_logs_issues.json (jq). 
+ +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=/dev/null +source "${SCRIPT_DIR}/auth.sh" +# shellcheck source=/dev/null +source "${SCRIPT_DIR}/sqs_dlq_common.sh" + +auth + +: "${AWS_REGION:?Must set AWS_REGION}" + +OUTPUT_FILE="dlq_lambda_logs_issues.json" +LOOKBACK="${CLOUDWATCH_LOG_LOOKBACK_MINUTES:-30}" +issues_json='[]' +start_ms=$(($(date +%s) - LOOKBACK * 60))000 + +echo "=== Lambda consumer logs (lookback ${LOOKBACK} minutes, start_ms=${start_ms}) ===" + +SOURCE_URLS=() +if [[ -n "${SQS_QUEUE_URL:-}" || -n "${SQS_QUEUE_URLS:-}" ]]; then + while IFS= read -r line; do + [[ -n "$line" ]] && SOURCE_URLS+=("$line") + done < <(rw_sqs_collect_source_urls) +else + prefix="${SQS_QUEUE_NAME_PREFIX:-}" + raw=$(aws sqs list-queues --region "$AWS_REGION" ${prefix:+--queue-name-prefix "$prefix"} --output json 2>/dev/null || echo '{}') + while IFS= read -r line; do + [[ -n "$line" ]] && SOURCE_URLS+=("$line") + done < <(echo "$raw" | jq -r '.QueueUrls[]? 
// empty') +fi + +if [[ ${#SOURCE_URLS[@]} -eq 0 ]]; then + echo '[]' > "$OUTPUT_FILE" + exit 0 +fi + +for src in "${SOURCE_URLS[@]}"; do + [[ -z "$src" ]] && continue + qarn=$(aws sqs get-queue-attributes --queue-url "$src" --attribute-names QueueArn --output json 2>/dev/null | jq -r '.Attributes.QueueArn // empty') + [[ -z "$qarn" ]] && continue + + dlq_depth=0 + if resolved=$(rw_sqs_resolve_dlq_url "$src"); then + dlq_url="${resolved##*|}" + d=$(rw_sqs_queue_depth "$dlq_url") + [[ "$d" =~ ^[0-9]+$ ]] && dlq_depth="$d" + fi + + mappings_json=$(aws lambda list-event-source-mappings --event-source-arn "$qarn" --region "$AWS_REGION" --output json 2>/dev/null || echo '{"EventSourceMappings":[]}') + map_count=$(echo "$mappings_json" | jq '.EventSourceMappings | length // 0') + + if [[ "$map_count" -eq 0 ]]; then + if [[ "$dlq_depth" -gt 0 ]]; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "No Lambda event source mapping for queue with DLQ traffic" \ + --arg details "Queue ${src} has no Lambda event source mappings but DLQ approximate depth is ${dlq_depth}. Consumer may be ECS, EC2, or another service—correlate using that platform's logs." \ + --argjson severity 3 \ + --arg next_steps "Find the active consumer for this queue (non-Lambda), inspect its logs and metrics, and fix processing failures. If the consumer should be Lambda, create or fix the event source mapping." \ + '. 
+= [{ + "title": $title, + "details": $details, + "severity": $severity, + "next_steps": $next_steps + }]') + fi + continue + fi + + while IFS= read -r row; do + [[ -z "$row" ]] && continue + fn_arn=$(echo "$row" | jq -r '.FunctionArn // empty') + [[ -z "$fn_arn" ]] && continue + fn_name="${fn_arn#*:function:}" + + log_group="/aws/lambda/${fn_name}" + lg_name=$(aws logs describe-log-groups --log-group-name-prefix "$log_group" --region "$AWS_REGION" --output json 2>/dev/null | jq -r '.logGroups[0].logGroupName // empty') + if [[ -z "$lg_name" ]]; then + echo "Log group not found for ${log_group}" + continue + fi + + fe=$(aws logs filter-log-events \ + --log-group-name "$lg_name" \ + --start-time "$start_ms" \ + --filter-pattern "?ERROR ?Error ?Task ?timed" \ + --limit 50 \ + --region "$AWS_REGION" \ + --output json 2>/dev/null || echo '{}') + + ev_count=$(echo "$fe" | jq '.events | length // 0') + if [[ "$ev_count" -eq 0 ]]; then + echo "No matching log events for ${fn_name} in window." + continue + fi + + details_safe=$(echo "$fe" | jq -c '{matchingEvents: (.events | length), sample: [.events[]? | {timestamp: .timestamp, message: .message}] | .[0:5]}') + + issues_json=$(echo "$issues_json" | jq \ + --arg title "Lambda \`${fn_name}\` logs show errors overlapping DLQ window" \ + --arg details "$details_safe" \ + --argjson severity 3 \ + --arg next_steps "Open CloudWatch Logs for ${lg_name}, inspect the stack traces, fix the Lambda code or configuration (timeout, memory, permissions), then redrive DLQ messages after verification." \ + '. += [{ + "title": $title, + "details": $details, + "severity": $severity, + "next_steps": $next_steps + }]') + done < <(echo "$mappings_json" | jq -c '.EventSourceMappings[]? 
// empty')
+done
+
+echo "$issues_json" > "$OUTPUT_FILE"
+echo "Wrote ${OUTPUT_FILE}"
+exit 0
diff --git a/codebundles/aws-sqs-dlq-health/sqs_dlq_sample_messages.sh b/codebundles/aws-sqs-dlq-health/sqs_dlq_sample_messages.sh
new file mode 100755
index 00000000..a2ed74c7
--- /dev/null
+++ b/codebundles/aws-sqs-dlq-health/sqs_dlq_sample_messages.sh
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+# Receives a bounded sample of DLQ messages (short visibility), extracts diagnostics, returns visibility to 0.
+# Writes JSON issues to dlq_sample_issues.json (jq).
+
+set -euo pipefail
+set -x
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=/dev/null
+source "${SCRIPT_DIR}/auth.sh"
+# shellcheck source=/dev/null
+source "${SCRIPT_DIR}/sqs_dlq_common.sh"
+
+auth
+
+: "${AWS_REGION:?Must set AWS_REGION}"
+
+OUTPUT_FILE="dlq_sample_issues.json"
+MAX_SAMPLE="${MAX_DLQ_MESSAGES_TO_SAMPLE:-5}"
+VIS_SECONDS="${DLQ_SAMPLE_VISIBILITY_SECONDS:-10}"
+issues_json='[]'
+
+echo "=== DLQ message sample (max ${MAX_SAMPLE} messages total per run, visibility ${VIS_SECONDS}s) ==="
+
+# Collect source queue URLs: explicit env vars win; otherwise discover via list-queues.
+SOURCE_URLS=()
+if [[ -n "${SQS_QUEUE_URL:-}" || -n "${SQS_QUEUE_URLS:-}" ]]; then
+  while IFS= read -r line; do
+    [[ -n "$line" ]] && SOURCE_URLS+=("$line")
+  done < <(rw_sqs_collect_source_urls)
+else
+  prefix="${SQS_QUEUE_NAME_PREFIX:-}"
+  raw=$(aws sqs list-queues --region "$AWS_REGION" ${prefix:+--queue-name-prefix "$prefix"} --output json 2>/dev/null || echo '{}')
+  while IFS= read -r line; do
+    [[ -n "$line" ]] && SOURCE_URLS+=("$line")
+  done < <(echo "$raw" | jq -r '.QueueUrls[]? // empty')
+fi
+
+if [[ ${#SOURCE_URLS[@]} -eq 0 ]]; then
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "No source queues for DLQ sampling" \
+    --arg details "Provide SQS_QUEUE_URL / SQS_QUEUE_URLS or list-queues discovery." \
+    --argjson severity 4 \
+    --arg next_steps "Configure queue URLs or prefix discovery." \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": $severity,
+      "next_steps": $next_steps
+    }]')
+  echo "$issues_json" > "$OUTPUT_FILE"
+  exit 0
+fi
+
+# Resolve each source queue's DLQ (arn|url pairs) and de-duplicate by ARN,
+# since multiple source queues may share one DLQ.
+declare -A SEEN_DLQ
+declare -a DLQ_URLS=()
+
+for src in "${SOURCE_URLS[@]}"; do
+  [[ -z "$src" ]] && continue
+  resolved=$(rw_sqs_resolve_dlq_url "$src" || true)
+  [[ -z "$resolved" ]] && continue
+  dlq_arn="${resolved%%|*}"
+  dlq_url="${resolved##*|}"
+  if [[ -n "${SEEN_DLQ[$dlq_arn]:-}" ]]; then
+    continue
+  fi
+  SEEN_DLQ[$dlq_arn]=1
+  DLQ_URLS+=("$dlq_url")
+done
+
+if [[ ${#DLQ_URLS[@]} -eq 0 ]]; then
+  echo "No DLQs resolved from source queues (missing RedrivePolicy)."
+  echo '[]' > "$OUTPUT_FILE"
+  exit 0
+fi
+
+total_sampled=0
+for dlq_url in "${DLQ_URLS[@]}"; do
+  [[ "$total_sampled" -ge "$MAX_SAMPLE" ]] && break
+  depth=$(rw_sqs_queue_depth "$dlq_url")
+  if [[ ! "$depth" =~ ^[0-9]+$ ]] || [[ "$depth" -eq 0 ]]; then
+    echo "DLQ ${dlq_url} empty; skipping receive."
+    continue
+  fi
+
+  while [[ "$total_sampled" -lt "$MAX_SAMPLE" ]]; do
+    # SQS caps receive-message at 10 per call; never request more than we still need.
+    batch=$((MAX_SAMPLE - total_sampled > 10 ? 10 : MAX_SAMPLE - total_sampled))
+    resp=$(aws sqs receive-message \
+      --queue-url "$dlq_url" \
+      --max-number-of-messages "$batch" \
+      --visibility-timeout "$VIS_SECONDS" \
+      --attribute-names All \
+      --message-attribute-names All \
+      --region "$AWS_REGION" \
+      --output json 2>/dev/null || echo '{}')
+
+    cnt=$(echo "$resp" | jq '.Messages | length // 0')
+    if [[ "$cnt" -eq 0 ]]; then
+      break
+    fi
+
+    while IFS= read -r msg; do
+      [[ -z "$msg" ]] && continue
+      mid=$(echo "$msg" | jq -r '.MessageId // empty')
+      body=$(echo "$msg" | jq -r '.Body // ""' | head -c 4000)
+      rh=$(echo "$msg" | jq -r '.ReceiptHandle // empty')
+      attrs=$(echo "$msg" | jq -c '.Attributes // {}')
+      mattrs=$(echo "$msg" | jq -c '.MessageAttributes // {}')
+
+      # Return the message to the queue immediately (visibility 0) — this is a
+      # read-only sample; we must not hold messages invisible for VIS_SECONDS.
+      if [[ -n "$rh" ]]; then
+        aws sqs change-message-visibility --queue-url "$dlq_url" --receipt-handle "$rh" --visibility-timeout 0 --region "$AWS_REGION" >/dev/null 2>&1 || true
+      fi
+
+      total_sampled=$((total_sampled + 1))
+
+      details_json=$(jq -n \
+        --arg mid "$mid" \
+        --arg body "$body" \
+        --argjson attrs "$attrs" \
+        --argjson mattrs "$mattrs" \
+        '{messageId: $mid, bodySnippet: $body, attributes: $attrs, messageAttributes: $mattrs}')
+      details_str=$(echo "$details_json" | jq -c .)
+
+      issues_json=$(echo "$issues_json" | jq \
+        --arg title "DLQ diagnostic sample for \`${dlq_url}\`" \
+        --arg details "$details_str" \
+        --argjson severity 3 \
+        --arg next_steps "Review body for Lambda async failure fields, application errors, or poison payloads. Fix consumer logic; use redrive or delete after remediation. FIFO: avoid partial batch pitfalls if using partial batch responses." \
+        '. += [{
+          "title": $title,
+          "details": $details,
+          "severity": $severity,
+          "next_steps": $next_steps
+        }]')
+
+      # break 3 exits the message loop, the batch loop, and the DLQ loop at once.
+      if [[ "$total_sampled" -ge "$MAX_SAMPLE" ]]; then
+        break 3
+      fi
+    done < <(echo "$resp" | jq -c '.Messages[]? // empty')
+  done
+done
+
+echo "$issues_json" > "$OUTPUT_FILE"
+echo "Wrote ${OUTPUT_FILE}"
+exit 0
diff --git a/codebundles/aws-sqs-dlq-health/sqs_source_queue_metrics.sh b/codebundles/aws-sqs-dlq-health/sqs_source_queue_metrics.sh
new file mode 100755
index 00000000..30d800fb
--- /dev/null
+++ b/codebundles/aws-sqs-dlq-health/sqs_source_queue_metrics.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+# Pulls CloudWatch metrics for source queues (age, sent, deleted) to distinguish backlog vs poison patterns.
+# Writes JSON issues to sqs_source_metrics_issues.json when oldest-message age is elevated (jq).
+
+set -euo pipefail
+set -x
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=/dev/null
+source "${SCRIPT_DIR}/auth.sh"
+# shellcheck source=/dev/null
+source "${SCRIPT_DIR}/sqs_dlq_common.sh"
+
+auth
+
+: "${AWS_REGION:?Must set AWS_REGION}"
+
+OUTPUT_FILE="sqs_source_metrics_issues.json"
+LOOKBACK="${CLOUDWATCH_LOG_LOOKBACK_MINUTES:-30}"
+# Oldest-message age (seconds) above which an issue is raised; override for busier queues.
+AGE_THRESHOLD_SECONDS="${SQS_AGE_ALERT_THRESHOLD_SECONDS:-300}"
+issues_json='[]'
+
+START=$(date -u -d "${LOOKBACK} minutes ago" +%FT%TZ)
+END=$(date -u +%FT%TZ)
+
+echo "=== Source queue CloudWatch metrics (${START} to ${END}, region=${AWS_REGION}) ==="
+
+SOURCE_URLS=()
+if [[ -n "${SQS_QUEUE_URL:-}" || -n "${SQS_QUEUE_URLS:-}" ]]; then
+  while IFS= read -r line; do
+    [[ -n "$line" ]] && SOURCE_URLS+=("$line")
+  done < <(rw_sqs_collect_source_urls)
+else
+  prefix="${SQS_QUEUE_NAME_PREFIX:-}"
+  raw=$(aws sqs list-queues --region "$AWS_REGION" ${prefix:+--queue-name-prefix "$prefix"} --output json 2>/dev/null || echo '{}')
+  while IFS= read -r line; do
+    [[ -n "$line" ]] && SOURCE_URLS+=("$line")
+  done < <(echo "$raw" | jq -r '.QueueUrls[]? // empty')
+fi
+
+if [[ ${#SOURCE_URLS[@]} -eq 0 ]]; then
+  echo '[]' > "$OUTPUT_FILE"
+  exit 0
+fi
+
+for src in "${SOURCE_URLS[@]}"; do
+  [[ -z "$src" ]] && continue
+  qname=$(rw_sqs_queue_name_from_url "$src")
+  echo "--- Queue: ${qname}"
+
+  # Fetch each metric once; keep the age datapoints for the threshold check below
+  # instead of issuing a second, near-identical get-metric-statistics call.
+  age_stats='{}'
+  for metric in ApproximateAgeOfOldestMessage NumberOfMessagesSent NumberOfMessagesDeleted; do
+    stats=$(aws cloudwatch get-metric-statistics \
+      --namespace AWS/SQS \
+      --metric-name "$metric" \
+      --dimensions Name=QueueName,Value="$qname" \
+      --start-time "$START" \
+      --end-time "$END" \
+      --period 60 \
+      --statistics Maximum Average \
+      --region "$AWS_REGION" \
+      --output json 2>/dev/null || echo '{}')
+    echo "${metric}: $(echo "$stats" | jq -c '{Datapoints: .Datapoints}')"
+    if [[ "$metric" == "ApproximateAgeOfOldestMessage" ]]; then
+      age_stats="$stats"
+    fi
+  done
+
+  max_age=$(echo "$age_stats" | jq '[.Datapoints[]?.Maximum // empty] | max // 0')
+  # max_age may be float
+  if awk -v m="$max_age" -v t="$AGE_THRESHOLD_SECONDS" 'BEGIN { exit !(m > t) }'; then
+    issues_json=$(echo "$issues_json" | jq \
+      --arg title "Elevated ApproximateAgeOfOldestMessage for source queue \`${qname}\`" \
+      --arg details "Maximum ApproximateAgeOfOldestMessage in window ≈ ${max_age}s (threshold ${AGE_THRESHOLD_SECONDS}s). Review consumer throughput vs poison messages; compare with DLQ samples." \
+      --argjson severity 4 \
+      --arg next_steps "Scale or fix the consumer if backlog-driven; if age is high with low DLQ volume, investigate slow processing. Cross-check NumberOfMessagesSent vs NumberOfMessagesDeleted in the report output." \
+      '. += [{
+        "title": $title,
+        "details": $details,
+        "severity": $severity,
+        "next_steps": $next_steps
+      }]')
+  fi
+done
+
+echo "$issues_json" > "$OUTPUT_FILE"
+echo "Wrote ${OUTPUT_FILE}"
+exit 0