diff --git a/.github/workflows/deploy_dev.yml b/.github/workflows/deploy_dev_full.yml similarity index 97% rename from .github/workflows/deploy_dev.yml rename to .github/workflows/deploy_dev_full.yml index 36e1a969..d9c169e0 100644 --- a/.github/workflows/deploy_dev.yml +++ b/.github/workflows/deploy_dev_full.yml @@ -1,4 +1,4 @@ -name: Dev Deploy +name: Dev Deploy Infra and Lambda on: workflow_dispatch: diff --git a/.github/workflows/deploy_dev_lambda_only.yml b/.github/workflows/deploy_dev_lambda_only.yml new file mode 100644 index 00000000..82f2470b --- /dev/null +++ b/.github/workflows/deploy_dev_lambda_only.yml @@ -0,0 +1,49 @@ +name: Dev Deploy Lambda Only +# This allows us to deploy only lambda changes to dev without touching infra. + +on: + workflow_dispatch: + +permissions: + id-token: write + contents: write + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + lambdas_dirs: ${{ steps.lambdas_dirs.outputs.just_outputs }} + steps: + - uses: actions/checkout@v4 + + - name: Get lambdas Directories + id: lambdas_dirs + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + with: + just_action: lambda-get-directories + + build: + uses: ./.github/workflows/build.yml + needs: + - setup + with: + environment: dev + version: ${{ github.sha }} + matrix: ${{ needs.setup.outputs.lambdas_dirs }} + + get_build: + needs: build + uses: ./.github/workflows/build_get.yml + with: + environment: dev + version: ${{ github.sha }} + + deploy: + uses: ./.github/workflows/deploy.yml + needs: + - get_build + with: + environment: dev + lambda_version: ${{ needs.get_build.outputs.lambda_version }} + lambda_bucket: ${{ needs.get_build.outputs.lambda_bucket }} + matrix: ${{ needs.get_build.outputs.lambda_version_files }} diff --git a/.github/workflows/deploy_prod.yml b/.github/workflows/deploy_prod_full.yml similarity index 96% rename from .github/workflows/deploy_prod.yml rename to .github/workflows/deploy_prod_full.yml index bd181768..70db1854 100644 --- 
a/.github/workflows/deploy_prod.yml
+++ b/.github/workflows/deploy_prod_full.yml
@@ -1,4 +1,4 @@
-name: Prod Deploy
+name: Prod Deploy Infra and Lambda
 
 on:
   workflow_dispatch:
diff --git a/.github/workflows/deploy_prod_lambda_only.yml b/.github/workflows/deploy_prod_lambda_only.yml
new file mode 100644
index 00000000..fe447c52
--- /dev/null
+++ b/.github/workflows/deploy_prod_lambda_only.yml
@@ -0,0 +1,28 @@
+name: Prod Deploy Lambda Only
+# This allows us to deploy only lambda changes to prod found in the version s3 folder (only).
+
+on:
+  workflow_dispatch:
+
+permissions:
+  id-token: write
+  contents: write
+
+jobs:
+  get_build:
+    uses: ./.github/workflows/build_get.yml
+    with:
+      environment: ci
+      version: 0.5.1
+
+  deploy:
+    uses: ./.github/workflows/deploy.yml
+    needs:
+      - get_build
+    with:
+      environment: prod
+      lambda_version: ${{ needs.get_build.outputs.lambda_version }}
+      lambda_bucket: ${{ needs.get_build.outputs.lambda_bucket }}
+      matrix: ${{ needs.get_build.outputs.lambda_version_files }}
+      # we can also define a lambda-only deployment here if needed, as below
+      # matrix: '["api"]'
\ No newline at end of file
diff --git a/README.md b/README.md
index 8eb2bde0..f13a8d2e 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,8 @@ module "lambda_example" {
 - we can handle an initial lag while lambda warms up/boots
 ```hcl
 provisioned_config = {
-  fixed = 0
+  fixed                = 0
+  reserved_concurrency = 2 # only allow 2 concurrent executions THIS ALSO SERVES AS A LIMIT TO AVOID THROTTLING
 }
 ```
@@ -45,7 +46,8 @@ provisioned_config = {
 - we never want lag due to warm up and can predict traffic
 ```hcl
 provisioned_config = {
-  fixed = 1
+  fixed                = 10
+  reserved_concurrency = 50
 }
 ```
@@ -65,12 +67,19 @@ provisioned_config = {
 }
 }
 ```
+- before scaling the lambda alias will match the minimum value
+![a](docs/lambda-config-before.png)
+- when the trigger percent is exceeded the lambda moves into `In progress (1/2)` state as an additional provisioned lambda is added.
+![a](docs/lamba-scaling-up.png) +- after scaling the lambda alias will show an additional provisioned lambda +![a](docs/lambda-config-after.png) + ## đŸšĻ types of lambda deploy ```hcl module "lambda_example" { - source = "../lambda" + source = "../_shared/lambda" ... deployment_config = var.your_deployment_config } @@ -109,3 +118,41 @@ deployment_config = { percentage = 10 interval_minutes = 1 } +``` + +## đŸ”Ĩâ†Šī¸ deployment roll-back + +- use cloudwatch metrics and alarms to automatically roll-back a deployment +- create a [cloudwatch_metric_alarm](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) resource and pass in as per below + +```hcl +module "lambda_example" { + source = "../_shared/lambda" + ... + codedeploy_alarm_names = [ + local.api_5xx_alarm_name + ] +} +``` +- if the alarm triggers during a deployment you will see the below in the CI + +``` +đŸ“Ļ Running: lambda-deploy +🚀 Started deployment: d-40UUQH3DF +Attempt 1: Deployment status is InProgress +Attempt 2: Deployment status is InProgress +Attempt 3: Deployment status is InProgress +Attempt 4: Deployment status is InProgress +Attempt 5: Deployment status is Stopped +❌ Deployment d-40UUQH3DF failed or was stopped. +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +| GetDeployment | ++--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ErrorCode | ALARM_ACTIVE | +| ErrorMessage| One or more alarms have been activated according to the Amazon CloudWatch metrics you selected, and the affected deployments have been stopped. 
Activated alarms: | +| Status | Stopped | ++--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +error: Recipe `lambda-deploy` failed with exit code 1 +Error: Process completed with exit code 1. + +``` \ No newline at end of file diff --git a/docs/lamba-scaling-up.png b/docs/lamba-scaling-up.png new file mode 100644 index 00000000..21270203 Binary files /dev/null and b/docs/lamba-scaling-up.png differ diff --git a/docs/lambda-config-after.png b/docs/lambda-config-after.png new file mode 100644 index 00000000..14828714 Binary files /dev/null and b/docs/lambda-config-after.png differ diff --git a/docs/lambda-config-before.png b/docs/lambda-config-before.png new file mode 100644 index 00000000..6659eff9 Binary files /dev/null and b/docs/lambda-config-before.png differ diff --git a/infra/live/dev/aws/api/terragrunt.hcl b/infra/live/dev/aws/api/terragrunt.hcl index 4fc561db..37e4cc2f 100644 --- a/infra/live/dev/aws/api/terragrunt.hcl +++ b/infra/live/dev/aws/api/terragrunt.hcl @@ -2,6 +2,12 @@ include { path = find_in_parent_folders("root.hcl") } +inputs = { + api_5xx_alarm_threshold = 20.0 + api_5xx_alarm_evaluation_periods = 1 + api_5xx_alarm_datapoints_to_alarm = 1 +} + terraform { source = "../../../../modules//aws//api" } diff --git a/infra/live/prod/aws/api/terragrunt.hcl b/infra/live/prod/aws/api/terragrunt.hcl index 4fc561db..35f660c1 100644 --- a/infra/live/prod/aws/api/terragrunt.hcl +++ b/infra/live/prod/aws/api/terragrunt.hcl @@ -2,6 +2,12 @@ include { path = find_in_parent_folders("root.hcl") } +inputs = { + api_5xx_alarm_threshold = 5.0 + api_5xx_alarm_evaluation_periods = 1 + api_5xx_alarm_datapoints_to_alarm = 1 +} + terraform { source = "../../../../modules//aws//api" } diff --git a/infra/modules/aws/_shared/lambda/locals.tf 
b/infra/modules/aws/_shared/lambda/locals.tf index c7741a95..fac9ddb3 100644 --- a/infra/modules/aws/_shared/lambda/locals.tf +++ b/infra/modules/aws/_shared/lambda/locals.tf @@ -4,7 +4,7 @@ locals { compute_platform = "Lambda" lambda_bootstrap_zip_key = "bootstrap/bootstrap-lambda.zip" - lambda_name = "${var.environment}-${var.project_name}-${var.lambda_name}" + lambda_name = var.lambda_name deploy_all_at_once_type = "AllAtOnce" deploy_canary_type = "TimeBasedCanary" @@ -15,11 +15,18 @@ locals { canary = local.deploy_canary_type linear = local.deploy_linear_type } + deploy_strategy = local.deploy_config_type_map[var.deployment_config.strategy] deploy_config = { - type = local.deploy_config_type_map[var.deployment_config.strategy] + type = local.deploy_strategy percent = var.deployment_config.percentage minutes = var.deployment_config.interval_minutes } + deploy_config_suffix = lower(( + var.deployment_config.strategy == "all_at_once" + ? local.deploy_strategy + : "${local.deploy_strategy}-${local.deploy_config.percent}-${local.deploy_config.minutes}" + )) + deployment_config_name = "${local.lambda_name}-deploy-${local.deploy_config_suffix}" fixed_mode = try(var.provisioned_config.fixed != null, true) && try(var.provisioned_config.fixed > 0, false) auto_scale_mode = try(var.provisioned_config.auto_scale != null, false) diff --git a/infra/modules/aws/_shared/lambda/main.tf b/infra/modules/aws/_shared/lambda/main.tf index 44d5f246..e91b01a4 100644 --- a/infra/modules/aws/_shared/lambda/main.tf +++ b/infra/modules/aws/_shared/lambda/main.tf @@ -4,7 +4,7 @@ resource "aws_iam_role" "iam_for_lambda" { } resource "aws_iam_policy" "lambda_cloudwatch_logs" { - name = "${var.project_name}-${var.environment}-lambda-cloudwatch-logs" + name = "${local.lambda_name}-logs" policy = data.aws_iam_policy_document.lambda_cloudwatch_logs.json } @@ -108,7 +108,7 @@ resource "aws_iam_role_policy" "cd_lambda" { } resource "aws_codedeploy_deployment_config" "lambda_config" { - 
deployment_config_name = "${local.lambda_name}-deploy-config" + deployment_config_name = local.deployment_config_name compute_platform = local.compute_platform traffic_routing_config { @@ -133,8 +133,10 @@ resource "aws_codedeploy_deployment_config" "lambda_config" { } resource "aws_codedeploy_deployment_group" "dg" { + depends_on = [aws_codedeploy_deployment_config.lambda_config] # to prevent DeploymentConfigInUseException + app_name = aws_codedeploy_app.app.name - deployment_group_name = "${local.lambda_name}-dg" + deployment_group_name = "${local.deployment_config_name}-dg" service_role_arn = aws_iam_role.code_deploy_role.arn deployment_style { @@ -142,12 +144,24 @@ resource "aws_codedeploy_deployment_group" "dg" { deployment_option = "WITH_TRAFFIC_CONTROL" } - deployment_config_name = aws_codedeploy_deployment_config.lambda_config.deployment_config_name + deployment_config_name = local.deployment_config_name auto_rollback_configuration { enabled = true events = ["DEPLOYMENT_FAILURE", "DEPLOYMENT_STOP_ON_ALARM"] } + + dynamic "alarm_configuration" { + for_each = length(var.codedeploy_alarm_names) > 0 ? 
[1] : [] + content { + enabled = true + alarms = var.codedeploy_alarm_names + } + } + + lifecycle { + create_before_destroy = true # to prevent DeploymentConfigInUseException + } } resource "aws_appautoscaling_target" "pc_target" { diff --git a/infra/modules/aws/_shared/lambda/variables.tf b/infra/modules/aws/_shared/lambda/variables.tf index ada04226..96a2aec1 100644 --- a/infra/modules/aws/_shared/lambda/variables.tf +++ b/infra/modules/aws/_shared/lambda/variables.tf @@ -42,6 +42,12 @@ variable "additional_policy_arns" { default = [] } +variable "codedeploy_alarm_names" { + description = "Optional list of CloudWatch alarm names that trigger CodeDeploy rollback" + type = list(string) + default = [] +} + variable "deployment_config" { description = "Traffic shifting: all_at_once | canary | linear" type = object({ diff --git a/infra/modules/aws/api/local.tf b/infra/modules/aws/api/local.tf new file mode 100644 index 00000000..aade786d --- /dev/null +++ b/infra/modules/aws/api/local.tf @@ -0,0 +1,5 @@ +locals { + lambda_name = "${var.environment}-${var.project_name}-api" + apigw_http_5xx_metric = "5xx" + api_5xx_alarm_name = "${local.lambda_name}-api-v2-5xx-rate-critical" +} \ No newline at end of file diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf index fd76eff9..93c91b10 100644 --- a/infra/modules/aws/api/main.tf +++ b/infra/modules/aws/api/main.tf @@ -5,36 +5,33 @@ module "lambda_api" { environment = var.environment lambda_bucket = var.lambda_bucket - lambda_name = "api" + lambda_name = local.lambda_name environment_variables = { DEBUG_DELAY_MS = 500 } deployment_config = { - strategy = "all_at_once" + strategy = "canary" + percentage = 10 + interval_minutes = 3 # this is > the alarm evaluation period to ensure we catch the alarm if it triggers } + codedeploy_alarm_names = [ + local.api_5xx_alarm_name + ] + provisioned_config = { - fixed = 0 # cold starts only - } + auto_scale = { + max = 2 + min = 1 # always have 1 lambda ready to go 
+ trigger_percent = 20 + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } - # provisioned_config = { - # fixed = 1 # always have 1 lambda ready to go - # reserved_concurrency = 2 # only allow 2 concurrent executions THIS ALSO SERVES AS A LIMIT TO AVOID THROTTLING - # } - - # provisioned_config = { - # auto_scale = { - # max = 2 - # min = 1 # always have 1 lambda ready to go - # trigger_percent = 20 - # scale_in_cooldown_seconds = 60 - # scale_out_cooldown_seconds = 60 - # } - - # reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting - # } + reserved_concurrency = 10 # limit the amount of concurrent executions to avoid throttling, but allow some bursting + } } resource "aws_apigatewayv2_api" "http_api" { @@ -74,3 +71,62 @@ resource "aws_lambda_permission" "allow_invoke" { principal = "apigateway.amazonaws.com" source_arn = "${aws_apigatewayv2_api.http_api.execution_arn}/*/*" # all routes/stages } + +resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" { + alarm_name = local.api_5xx_alarm_name + alarm_description = "HTTP API (v2) 5xx error rate > ${var.api_5xx_alarm_threshold}% for ${var.api_5xx_alarm_evaluation_periods} minute(s) ${var.api_5xx_alarm_datapoints_to_alarm} times" + actions_enabled = true + + comparison_operator = "GreaterThanThreshold" + threshold = var.api_5xx_alarm_threshold # This is the value your metric is compared against + evaluation_periods = var.api_5xx_alarm_evaluation_periods # This is how many consecutive periods CloudWatch looks at when deciding the alarm state. + datapoints_to_alarm = var.api_5xx_alarm_datapoints_to_alarm # This is how many of those evaluated periods must be breaching to trigger ALARM. 
+ treat_missing_data = "notBreaching" + + # + # Metric math: (5xx / count) * 100 + # Guarded to avoid NaN/Inf when count is 0 or very low + # + metric_query { + id = "e" + label = "5xxErrorRate" + return_data = true + expression = "IF(mcount < 1, 0, (m5xx / mcount) * 100)" + } + + # + # API Gateway v2 – 5XX errors + # + metric_query { + id = "m5xx" + metric { + namespace = "AWS/ApiGateway" + metric_name = local.apigw_http_5xx_metric + stat = "Sum" + period = 60 + + dimensions = { + ApiId = aws_apigatewayv2_api.http_api.id + Stage = aws_apigatewayv2_stage.default.name + } + } + } + + # + # API Gateway v2 – total request count + # + metric_query { + id = "mcount" + metric { + namespace = "AWS/ApiGateway" + metric_name = "Count" + stat = "Sum" + period = 60 + + dimensions = { + ApiId = aws_apigatewayv2_api.http_api.id + Stage = aws_apigatewayv2_stage.default.name + } + } + } +} diff --git a/infra/modules/aws/api/variables.tf b/infra/modules/aws/api/variables.tf index 6088d0c3..a75acdf9 100644 --- a/infra/modules/aws/api/variables.tf +++ b/infra/modules/aws/api/variables.tf @@ -14,3 +14,18 @@ variable "lambda_bucket" { description = "Lambda bucket where the code zip(s) are uploaded to" } ### end of static vars set in root.hcl ### + +variable "api_5xx_alarm_threshold" { + type = number + description = "The threshold for the API 5xx error rate alarm" +} + +variable "api_5xx_alarm_evaluation_periods" { + type = number + description = "The number of consecutive periods CloudWatch looks at when deciding the alarm state" +} + +variable "api_5xx_alarm_datapoints_to_alarm" { + type = number + description = "The number of evaluated periods that must be breaching to trigger ALARM" +} \ No newline at end of file diff --git a/infra/modules/aws/consumer/local.tf b/infra/modules/aws/consumer/local.tf new file mode 100644 index 00000000..2b78c191 --- /dev/null +++ b/infra/modules/aws/consumer/local.tf @@ -0,0 +1,4 @@ +locals { + sqs_chunk_size = 5 + lambda_name = 
"${var.environment}-${var.project_name}-consumer" +} \ No newline at end of file diff --git a/infra/modules/aws/consumer/main.tf b/infra/modules/aws/consumer/main.tf index 9a4fb84e..13e1458d 100644 --- a/infra/modules/aws/consumer/main.tf +++ b/infra/modules/aws/consumer/main.tf @@ -5,7 +5,12 @@ module "lambda_consumer" { environment = var.environment lambda_bucket = var.lambda_bucket - lambda_name = "consumer" + lambda_name = local.lambda_name + + environment_variables = { + DEBUG_DELAY_MS = 500 + CHUNK_SIZE = local.sqs_chunk_size + } additional_policy_arns = [ module.sqs_queue.sqs_queue_read_policy_arn @@ -16,26 +21,19 @@ module "lambda_consumer" { } provisioned_config = { - fixed = 0 # cold starts only + sqs_scale = { + min = 1 + max = 5 + visible_messages = 10 + queue_name = module.sqs_queue.sqs_queue_name + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } } - - # provisioned_config = { - # fixed = 1 # always have 1 lambda ready to go - # reserved_concurrency = 2 # only allow 2 concurrent executions THIS ALSO SERVES AS A LIMIT TO AVOID THROTTLING - # } - - # provisioned_config = { - # sqs_scale = { - # min = 1 - # max = 5 - # visible_messages = 100 - # queue_name = module.sqs_queue.sqs_queue_name - # scale_in_cooldown_seconds = 60 - # scale_out_cooldown_seconds = 60 - # } - # } } +# configure a deadletter queue (DLQ) for the SQS queue used by the Lambda consumer + module "sqs_queue" { source = "../_shared/sqs" @@ -46,8 +44,8 @@ resource "aws_lambda_event_source_mapping" "sqs" { event_source_arn = module.sqs_queue.sqs_queue_arn function_name = module.lambda_consumer.function_name - batch_size = 500 + batch_size = local.sqs_chunk_size maximum_batching_window_in_seconds = 10 function_response_types = ["ReportBatchItemFailures"] -} \ No newline at end of file +} diff --git a/infra/modules/aws/consumer/outputs.tf b/infra/modules/aws/consumer/outputs.tf index 21836daf..c9514ec4 100644 --- a/infra/modules/aws/consumer/outputs.tf +++ 
b/infra/modules/aws/consumer/outputs.tf @@ -13,3 +13,7 @@ output "lambda_function_name" { output "lambda_alias_name" { value = module.lambda_consumer.alias_name } + +output "sqs_queue_url" { + value = module.sqs_queue.sqs_queue_url +} diff --git a/justfile b/justfile index 87c7bbb1..c387ab19 100644 --- a/justfile +++ b/justfile @@ -270,6 +270,11 @@ lambda-deploy: #!/usr/bin/env bash set -euo pipefail + if [[ -z "$FUNCTION_NAME" ]]; then + echo "❌ FUNCTION_NAME environment variable is not set." + exit 1 + fi + if [[ -z "$APP_SPEC_KEY" ]]; then echo "❌ APP_SPEC_KEY environment variable is not set." exit 1 @@ -372,37 +377,24 @@ lambda-prune: aws lambda delete-function --function-name "$FUNCTION_NAME" --qualifier "$v" --region "$AWS_REGION" done -watch-lambda-autoscale: + +test-api-deploy-500s: #!/usr/bin/env bash set -euo pipefail - DURATION=180 - CONCURRENCY=20 - URL=https://slt6v1u8n4.execute-api.eu-west-2.amazonaws.com - - END_TIME=$(( $(date +%s) + $DURATION )) - - echo "🚀 Lambda autoscaling test" - echo " URL: $URL" - echo " Duration: $DURATION seconds" - echo " Concurrency: $CONCURRENCY" - echo - - while [[ $(date +%s) -lt "$END_TIME" ]]; do - seq 1 $CONCURRENCY \ - | xargs -n1 -P $CONCURRENCY -I{} \ - curl -s "$URL/" \ - | jq -r '.env_id' - done \ - | sort \ - | uniq -c - - echo - echo "🧊 Distinct Lambda environments:" - while [[ $(date +%s) -lt "$END_TIME" ]]; do - seq 1 $CONCURRENCY \ - | xargs -n1 -P $CONCURRENCY -I{} \ - curl -s "$URL/" \ - | jq -r '.env_id' - done \ - | sort \ - | uniq + + if [[ -z "$API_URL" ]]; then + echo "❌ API_URL environment variable is not set." + exit 1 + fi + + echo "Sending requests to $API_URL to trigger 500 errors..." + + END=$((SECONDS+180)) + + while [ $SECONDS -lt $END ]; do + curl -s -o /dev/null "$API_URL/error" + done + + echo "Finished sending requests." 
+
+
diff --git a/lambdas/api/lambda_handler.py b/lambdas/api/lambda_handler.py
index 65528d54..ba6b54e5 100644
--- a/lambdas/api/lambda_handler.py
+++ b/lambdas/api/lambda_handler.py
@@ -3,12 +3,12 @@
 import uuid
 import time
 
-# Runs once per execution environment (cold start)
 ENV_ID = str(uuid.uuid4())[:8]
 BOOT_TIME_MS = int(time.time() * 1000)
 
 DEBUG_DELAY_MS = int(os.getenv("DEBUG_DELAY_MS", "0"))
 
+
 def lambda_handler(event, context):
     print("Received event:", json.dumps(event))
 
@@ -16,6 +16,24 @@ def lambda_handler(event, context):
     if DEBUG_DELAY_MS > 0:
         time.sleep(DEBUG_DELAY_MS / 1000.0)
 
+    # --- Error endpoint: /fail or /error returns 500 ---
+    path = event.get("rawPath") or event.get("path") or ""
+    if path in ("/fail", "/error", "/health/fail"):
+        error_body = {
+            "message": "Forced failure for testing",
+            "env_id": ENV_ID,
+            "request_id": context.aws_request_id,
+        }
+        return {
+            "statusCode": 500,
+            "headers": {
+                "Content-Type": "application/json",
+                "X-Env-Id": ENV_ID,
+            },
+            "body": json.dumps(error_body),
+        }
+
+    # Normal success response
     body = {
         "message": "Hello from Lambda!",
         "env_id": ENV_ID,
diff --git a/lambdas/consumer/lambda_handler.py b/lambdas/consumer/lambda_handler.py
index 163a1c5a..6b80fcfa 100644
--- a/lambdas/consumer/lambda_handler.py
+++ b/lambdas/consumer/lambda_handler.py
@@ -1,8 +1,10 @@
 import json
+import os
+import time
 from typing import List, Dict
 
-CHUNK_SIZE = 50
-
+# CHUNK_SIZE is supplied by Terraform as a Lambda env var (see consumer/main.tf);
+# default matches local.sqs_chunk_size so behavior is unchanged when unset.
+CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "5"))
+DEBUG_DELAY_MS = int(os.getenv("DEBUG_DELAY_MS", "0"))
 
 def chunk(items: List[Dict], size: int):
     """Yield successive chunks from a list."""
@@ -37,6 +39,9 @@ def process_chunk(records: List[Dict]) -> List[str]:
     for record in records:
         try:
             process_message(record)
+            # Optional delay to force concurrency during testing
+            if DEBUG_DELAY_MS > 0:
+                time.sleep(DEBUG_DELAY_MS / 1000.0)
         except Exception as exc:
             print(f"Failed processing message {record['messageId']}: {exc}")
             failed_message_ids.append(record["messageId"])