runwhen-contrib · rw-codebundle-agent · Apr 7, 2026
@@ -0,0 +1,21 @@
+# Generation rules for the AWS SQS dead-letter-queue health codebundle.
+# Each matching SQS queue resource produces an SLX and a runbook.
+apiVersion: runwhen.com/v1
+kind: GenerationRules
+spec:
+  platform: aws
+  generationRules:
+    - resourceTypes:
+        - aws_sqs_queues
+      matchRules:
+        # The pattern accepts any non-empty queue name, so no queue is filtered out.
+        - type: pattern
+          pattern: '.+'
+          properties:
+            - name
+          mode: substring
+      slxs:
+        - baseName: aws-sqs-dlq-health
+          # NOTE(review): presumably these qualifiers scope one SLX per queue
+          # (account + region + resource); confirm against the generator docs.
+          qualifiers:
+            - account
+            - region
+            - resource
+          baseTemplateName: aws-sqs-dlq-health
+          levelOfDetail: basic
+          outputItems:
+            - type: slx
+            - type: runbook
+              templateName: aws-sqs-dlq-health-taskset.yaml
@@ -0,0 +1,31 @@
+# ServiceLevelX template for the SQS dead-letter-queue health check.
+# Rendered by the generator once per matched SQS queue resource.
+apiVersion: runwhen.com/v1
+kind: ServiceLevelX
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  # NOTE(review): this points at the Lambda icon; switch to an SQS icon if one is published.
+  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/aws/lambda.png
+  # The queue name leads the alias so that SLXs generated for different queues
+  # in the same account and region remain distinguishable in the UI.
+  alias: "{{match_resource.name}} SQS DLQ Health — {{match_resource.account_name}} {{match_resource.resource.region}}"
+  asMeasuredBy: "DLQ message depth, sampled payloads, Lambda log errors, and source queue CloudWatch metrics for matched SQS queues."
+  configProvided:
+    - name: SLX_PLACEHOLDER
+      value: SLX_PLACEHOLDER
+  owners:
+    - {{workspace.owner_email}}
+  statement: Dead-letter queues should stay near zero; sustained DLQ depth indicates consumer or message-level failures that need investigation.
+  additionalContext:
+    {% include "aws-hierarchy.yaml" ignore missing %}
+    # Colons in the qualified name are replaced with underscores.
+    qualified_name: "{{ match_resource.qualified_name | replace(":", "_") }}"
+  tags:
+    {% include "aws-tags.yaml" ignore missing %}
+    - name: cloud
+      value: aws
+    - name: service
+      value: sqs
+    - name: scope
+      value: resource
+    - name: access
+      value: read-only
@@ -0,0 +1,42 @@
+# Runbook template for the SQS dead-letter-queue health taskset.
+# Rendered by the generator once per matched SQS queue resource.
+apiVersion: runwhen.com/v1
+kind: Runbook
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  location: {{default_location}}
+  # Quoted so that a rendered account name or region containing ": " or " #"
+  # cannot break or truncate the YAML scalar.
+  description: "Monitors SQS DLQs, samples messages, correlates Lambda logs, and pulls source queue metrics for {{match_resource.account_name}} in {{match_resource.resource.region}}."
+  codeBundle:
+    # Use the repository and ref supplied by the generator when present,
+    # otherwise fall back to the public code collection on main.
+    {% if repo_url %}
+    repoUrl: {{repo_url}}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ref}}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/aws-sqs-dlq-health/runbook.robot
+  configProvided:
+    - name: AWS_REGION
+      value: "{{match_resource.resource.region}}"
+    - name: AWS_ACCOUNT_NAME
+      value: "{{match_resource.account_name}}"
+    # Both URL variables receive the matched queue URL; they render as an
+    # empty string when the resource carries no queue_url.
+    - name: SQS_QUEUE_URL
+      value: "{{ match_resource.resource.queue_url | default('') }}"
+    - name: SQS_QUEUE_URLS
+      value: "{{ match_resource.resource.queue_url | default('') }}"
+    - name: SQS_QUEUE_NAME_PREFIX
+      value: ""
+    # Numeric settings are passed as quoted strings.
+    - name: DEAD_LETTER_MESSAGE_THRESHOLD
+      value: "0"
+    - name: CLOUDWATCH_LOG_LOOKBACK_MINUTES
+      value: "30"
+    - name: MAX_DLQ_MESSAGES_TO_SAMPLE
+      value: "5"
+  secretsProvided:
+    {% include "aws-auth.yaml" ignore missing %}
@@ -0,0 +1,54 @@
+# AWS SQS Dead Letter Queue Health and Log Correlation
+
+This CodeBundle monitors Amazon SQS dead-letter queues (DLQs) tied to source queues via `RedrivePolicy`, raises issues when messages accumulate past a threshold, samples recent DLQ payloads for diagnostics, correlates Lambda event source consumers to CloudWatch Logs, and pulls source-queue CloudWatch metrics for backlog context.
+
+## Overview
+
+- **DLQ depth and redrive**: Discovers source queues (explicit URLs or `list-queues` with an optional prefix), resolves each DLQ from `RedrivePolicy`, dedupes checks by DLQ ARN, and compares `ApproximateNumberOfMessages` to `DEAD_LETTER_MESSAGE_THRESHOLD`.
+- **DLQ sampling**: Receives up to `MAX_DLQ_MESSAGES_TO_SAMPLE` messages per run with a short visibility timeout, extracts attributes and body snippets, then resets visibility to zero so messages remain available (delete is not used by default).
+- **Lambda logs**: Lists Lambda event source mappings for each source queue ARN and searches `/aws/lambda/<function>` log groups for error patterns in the lookback window. If the DLQ has traffic but there is no Lambda mapping, the task reports that non-Lambda consumers (ECS, EC2, etc.) need manual correlation.
+- **Source metrics**: Reads `AWS/SQS` metrics (`ApproximateAgeOfOldestMessage`, `NumberOfMessagesSent`, `NumberOfMessagesDeleted`) and flags sustained high oldest-message age.
+
+SQS does not expose Azure-style `DeadLetterReason` on the queue API; root cause typically comes from message bodies (for example Lambda failure payloads), consumer logs, or application attributes.
+
+## Configuration
+
+### Required Variables
+
+- `AWS_REGION`: AWS region containing the queues.
+
+### Optional Variables
+
+- `AWS_ACCOUNT_NAME`: Account display label for reports (default: `Unknown`).
+- `SQS_QUEUE_URL`: Optional single source queue URL (often supplied by discovery as a qualifier alongside `SQS_QUEUE_URLS`).
+- `SQS_QUEUE_URLS`: Comma-separated source queue URLs; when empty, discovery uses `aws sqs list-queues` with optional `SQS_QUEUE_NAME_PREFIX`.
+- `SQS_QUEUE_NAME_PREFIX`: Prefix passed to `list-queues` when no explicit URLs are set (default: empty).
+- `DEAD_LETTER_MESSAGE_THRESHOLD`: Open a DLQ depth issue when the approximate message count is **greater than** this integer (default: `0`, so any message in the DLQ opens an issue).
+- `CLOUDWATCH_LOG_LOOKBACK_MINUTES`: Window for Lambda log search and metric alignment (default: `30`).
+- `MAX_DLQ_MESSAGES_TO_SAMPLE`: Cap on DLQ messages to receive per run for diagnostics (default: `5`).
+
+### Secrets
+
+- `aws_credentials`: Standard RunWhen AWS credentials (`aws-auth` block): access keys, IRSA, or assume-role via workspace configuration.
+
+## Tasks Overview
+
+### Check Dead Letter Queue Depth and Redrive Configuration
+
+Evaluates DLQ depth against the threshold and surfaces redrive metadata (`maxReceiveCount`). Emits issues when depth is above the configured limit.
+
+### Sample Recent Dead Letter Messages for Diagnostics
+
+Pulls a bounded sample per run, returns visibility to zero after inspection, and emits structured issues containing message metadata and truncated bodies (including Lambda async failure shapes when present).
+
+### Correlate DLQ to Lambda Consumer CloudWatch Logs
+
+Finds Lambda functions subscribed to the source queue and scans recent log events for error-oriented patterns. If messages are in the DLQ but the consumer is not Lambda, the task explains the limitation and points operators to other platforms.
+
+### Collect Source Queue CloudWatch Metrics for Context
+
+Emits CloudWatch metric datapoints to the report and opens issues when `ApproximateAgeOfOldestMessage` stays above 300 seconds in the window (this threshold is currently set in the script rather than through a configuration variable, and it is reported in the task output).
+
+## IAM
+
+Typical permissions (read-only apart from the visibility reset used by DLQ sampling): `sqs:GetQueueAttributes`, `sqs:ListQueues`, `sqs:ReceiveMessage` and `sqs:ChangeMessageVisibility` (DLQ), `sqs:GetQueueUrl`, `lambda:ListEventSourceMappings`, `logs:FilterLogEvents`, `logs:DescribeLogGroups`, `cloudwatch:GetMetricStatistics`.
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# AWS auth helper aligned with runwhen-local aws-auth template and runtime aws_utils.
+# Supports: explicit keys, assume role, IRSA, pod identity, default credential chain.
+# Credentials are injected by the platform from the aws-auth block; this script
+# optionally assumes a role when AWS_ROLE_ARN is set with base credentials, then
+# verifies authentication.
+#
+# Usage: source this file, then call `auth`
+#   source "$(dirname "$0")/auth.sh"
+#   auth
+
+# Returns the exit status of `aws sts get-caller-identity`; stdout and stderr
+# are both discarded, so only the status tells the caller whether it worked.
+_aws_verify() {
+    aws sts get-caller-identity --output json &>/dev/null
+}
+
+# Verify that usable AWS credentials are available, assuming AWS_ROLE_ARN
+# first when static access keys are supplied together with a role ARN.
+#
+# Reads:   AWS_WEB_IDENTITY_TOKEN_FILE, AWS_CONTAINER_CREDENTIALS_FULL_URI,
+#          AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ROLE_ARN
+# Exports: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, but
+#          only after assume-role succeeded and returned all three values.
+# Returns: 0 when get-caller-identity succeeds. On any failure it prints a
+#          message and calls `exit 1`, which terminates the sourcing shell.
+auth() {
+    # Keep temporaries out of the sourcing script's namespace.
+    local sts_output role_key_id role_secret role_token
+
+    # IRSA or EKS Pod Identity: no access keys required; runtime sets AWS_WEB_IDENTITY_TOKEN_FILE
+    # or AWS_CONTAINER_CREDENTIALS_FULL_URI. Just verify.
+    if [[ -n "${AWS_WEB_IDENTITY_TOKEN_FILE:-}" ]] || [[ -n "${AWS_CONTAINER_CREDENTIALS_FULL_URI:-}" ]]; then
+        if _aws_verify; then
+            return 0
+        fi
+        echo "AWS identity (IRSA/pod identity) present but get-caller-identity failed."
+        exit 1
+    fi
+
+    # Explicit credentials: if AWS_ROLE_ARN is set with base creds, assume the role
+    if [[ -n "${AWS_ACCESS_KEY_ID:-}" && -n "${AWS_SECRET_ACCESS_KEY:-}" ]]; then
+        if [[ -n "${AWS_ROLE_ARN:-}" ]]; then
+            # Stop here on failure so the base credentials are never replaced
+            # with values parsed from an empty or error response.
+            if ! sts_output=$(aws sts assume-role --role-arn "$AWS_ROLE_ARN" --role-session-name "AssumeRoleSession" --output json); then
+                echo "AWS assume-role failed for role $AWS_ROLE_ARN."
+                exit 1
+            fi
+
+            # `// empty` turns a missing or null field into an empty string
+            # instead of the literal text "null".
+            role_key_id=$(jq -r '.Credentials.AccessKeyId // empty' <<<"$sts_output")
+            role_secret=$(jq -r '.Credentials.SecretAccessKey // empty' <<<"$sts_output")
+            role_token=$(jq -r '.Credentials.SessionToken // empty' <<<"$sts_output")
+            if [[ -z "$role_key_id" || -z "$role_secret" || -z "$role_token" ]]; then
+                echo "AWS assume-role response for role $AWS_ROLE_ARN did not contain usable credentials."
+                exit 1
+            fi
+
+            export AWS_ACCESS_KEY_ID="$role_key_id"
+            export AWS_SECRET_ACCESS_KEY="$role_secret"
+            export AWS_SESSION_TOKEN="$role_token"
+        fi
+        if _aws_verify; then
+            return 0
+        fi
+        echo "AWS credentials set but get-caller-identity failed."
+        exit 1
+    fi
+
+    # Default credential chain (env, profile, instance metadata, etc.)
+    if _aws_verify; then
+        return 0
+    fi
+
+    echo "AWS credentials not configured. Set credentials via the platform aws-auth block (e.g. aws:access_key@cli, aws:irsa@cli, aws:default@cli)."
+    exit 1
+}