Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
f445fc1
fix(trigger): restrict env var expansion to INTERLOCK_ prefix
dwsmith1983 Mar 8, 2026
357f0ac
fix(lambda): release trigger lock on SFN start failure
dwsmith1983 Mar 8, 2026
392ffb6
fix(envcheck): add EVENTS_TABLE and EVENTS_TTL_DAYS to alert-dispatcher
dwsmith1983 Mar 8, 2026
bb15c5b
fix(lambda): log WriteJobEvent and audit write errors
dwsmith1983 Mar 8, 2026
67d4e67
refactor(lambda): replace time.Now() with d.now() for test injection
dwsmith1983 Mar 8, 2026
f3ad1de
fix(store): deep copy pipeline configs in cache via JSON round-trip
dwsmith1983 Mar 8, 2026
4237965
feat(terraform): add CloudWatch alarms for Lambda, SFN, DLQ, streams
dwsmith1983 Mar 8, 2026
5f8fe12
feat(terraform): add Lambda concurrency limits and trigger IAM policy
dwsmith1983 Mar 8, 2026
1232842
feat(terraform): route CW alarm state changes through event pipeline
dwsmith1983 Mar 8, 2026
3f0877f
feat(lambda): support Secrets Manager for Slack bot token
dwsmith1983 Mar 8, 2026
0e52029
fix(terraform): address security review findings
dwsmith1983 Mar 8, 2026
fa08abc
fix(alert-dispatcher): validate Slack token format from Secrets Manager
dwsmith1983 Mar 8, 2026
770bbc8
fix(lambda): reuse captured now and fix test date-boundary race
dwsmith1983 Mar 8, 2026
c358c2f
fix(alert-dispatcher): fail loudly on nil SecretString from Secrets M…
dwsmith1983 Mar 8, 2026
8b3c3ee
chore: go mod tidy
dwsmith1983 Mar 8, 2026
5781bce
fix(lambda): capture d.now() once before rule evaluation
dwsmith1983 Mar 8, 2026
64db571
fix(alert-dispatcher): fail-fast when no Slack token is configured
dwsmith1983 Mar 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions cmd/lambda/alert-dispatcher/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@ import (
"net/http"
"os"
"strconv"
"strings"
"time"

"github.com/aws/aws-lambda-go/events"
"github.com/aws/aws-lambda-go/lambda"
"github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/dynamodb"
"github.com/aws/aws-sdk-go-v2/service/secretsmanager"

ilambda "github.com/dwsmith1983/interlock/internal/lambda"
"github.com/dwsmith1983/interlock/internal/store"
Expand Down Expand Up @@ -52,6 +54,32 @@ func main() {
Logger: logger,
}

// Override Slack token from Secrets Manager when configured.
if secretARN := os.Getenv("SLACK_SECRET_ARN"); secretARN != "" {
smClient := secretsmanager.NewFromConfig(cfg)
out, err := smClient.GetSecretValue(context.Background(), &secretsmanager.GetSecretValueInput{
SecretId: &secretARN,
})
if err != nil {
logger.Error("failed to read Slack secret from Secrets Manager", "arn", secretARN, "error", err)
os.Exit(1)
}
if out.SecretString == nil {
logger.Error("Secrets Manager returned nil SecretString", "arn", secretARN)
os.Exit(1)
}
token := strings.TrimSpace(*out.SecretString)
if !strings.HasPrefix(token, "xoxb-") && !strings.HasPrefix(token, "xoxe-") {
logger.Warn("SLACK_SECRET_ARN value does not look like a Slack bot token (expected xoxb-/xoxe- prefix)")
}
deps.SlackBotToken = token
}

if deps.SlackBotToken == "" {
logger.Error("no Slack token configured: set SLACK_BOT_TOKEN or SLACK_SECRET_ARN")
os.Exit(1)
}

lambda.Start(func(ctx context.Context, sqsEvent events.SQSEvent) (events.SQSEventResponse, error) {
return ilambda.HandleAlertDispatcher(ctx, deps, sqsEvent)
})
Expand Down
5 changes: 4 additions & 1 deletion deploy/terraform/alerting.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ resource "aws_sqs_queue_policy" "alert" {
Resource = aws_sqs_queue.alert.arn
Condition = {
ArnEquals = {
"aws:SourceArn" = aws_cloudwatch_event_rule.alert_events.arn
"aws:SourceArn" = [
aws_cloudwatch_event_rule.alert_events.arn,
aws_cloudwatch_event_rule.cw_alarm_alert.arn,
]
}
}
}]
Expand Down
137 changes: 137 additions & 0 deletions deploy/terraform/cloudwatch.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# -----------------------------------------------------------------------------
# CloudWatch alarms — Lambda errors, SFN failures, DLQ depth, stream lag
# -----------------------------------------------------------------------------

locals {
  # Map of Terraform resource key → Lambda function resource for alarm iteration.
  # NOTE(review): each key becomes part of an alarm name below, so renaming a
  # key recreates the corresponding alarm.
  lambda_functions = {
    stream_router    = aws_lambda_function.stream_router
    orchestrator     = aws_lambda_function.orchestrator
    sla_monitor      = aws_lambda_function.sla_monitor
    watchdog         = aws_lambda_function.watchdog
    event_sink       = aws_lambda_function.event_sink
    alert_dispatcher = aws_lambda_function.alert_dispatcher
  }

  # DLQ resources keyed by a short label.
  dlq_queues = {
    sr_control = aws_sqs_queue.stream_router_control_dlq
    sr_joblog  = aws_sqs_queue.stream_router_joblog_dlq
    alert      = aws_sqs_queue.alert_dlq
  }

  # DynamoDB stream event source mappings keyed by table name.
  stream_mappings = {
    control = aws_lambda_event_source_mapping.control_stream
    joblog  = aws_lambda_event_source_mapping.joblog_stream
  }

  # Notify SNS only when a topic ARN is configured; with an empty list the
  # alarms still exist and change state but take no actions.
  alarm_actions = var.sns_alarm_topic_arn != "" ? [var.sns_alarm_topic_arn] : []
}

# =============================================================================
# 1. Lambda Error Alarms — one per function
# =============================================================================

# One alarm per Lambda function: fires when the function reports any error
# (sum of the Errors metric >= 1 over a 5-minute window).
resource "aws_cloudwatch_metric_alarm" "lambda_errors" {
  for_each = local.lambda_functions

  alarm_name          = "${var.environment}-interlock-${each.key}-errors"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = "Errors"
  namespace           = "AWS/Lambda"
  period              = 300
  statistic           = "Sum"
  threshold           = 1
  alarm_description   = "${each.key} Lambda errors detected"
  # Idle functions emit no datapoints; treat the gap as healthy rather than
  # flapping into INSUFFICIENT_DATA.
  treat_missing_data = "notBreaching"

  dimensions = {
    FunctionName = each.value.function_name
  }

  alarm_actions = local.alarm_actions
  ok_actions    = local.alarm_actions
  tags          = var.tags
}

# =============================================================================
# 2. Step Functions Execution Failure Alarm
# =============================================================================

# Fires when any pipeline state-machine execution fails within a 5-minute
# window (ExecutionsFailed >= 1).
resource "aws_cloudwatch_metric_alarm" "sfn_failures" {
  alarm_name          = "${var.environment}-interlock-sfn-failures"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = "ExecutionsFailed"
  namespace           = "AWS/States"
  period              = 300
  statistic           = "Sum"
  threshold           = 1
  alarm_description   = "Step Functions pipeline execution failures detected"
  # No executions → no datapoints; stay in OK rather than INSUFFICIENT_DATA.
  treat_missing_data = "notBreaching"

  dimensions = {
    StateMachineArn = aws_sfn_state_machine.pipeline.arn
  }

  alarm_actions = local.alarm_actions
  ok_actions    = local.alarm_actions
  tags          = var.tags
}

# =============================================================================
# 3. DLQ Message Count Alarms — fires when any message lands in a DLQ
# =============================================================================

# One alarm per dead-letter queue: any visible message in a DLQ means a
# consumer permanently failed to process an event, so alert immediately.
resource "aws_cloudwatch_metric_alarm" "dlq_messages" {
  for_each = local.dlq_queues

  alarm_name          = "${var.environment}-interlock-dlq-${each.key}-depth"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = "ApproximateNumberOfMessagesVisible"
  namespace           = "AWS/SQS"
  period              = 300
  # NOTE(review): Sum over 5 minutes of a gauge metric works for a ">= 1"
  # threshold, but "Maximum" is the conventional statistic for queue depth —
  # consider switching for clearer alarm math.
  statistic          = "Sum"
  threshold          = 1
  alarm_description  = "Messages visible in ${each.key} dead-letter queue"
  treat_missing_data = "notBreaching"

  dimensions = {
    QueueName = each.value.name
  }

  alarm_actions = local.alarm_actions
  ok_actions    = local.alarm_actions
  tags          = var.tags
}

# =============================================================================
# 4. DynamoDB Stream Iterator Age Alarm — detects stream processing lag
# =============================================================================

# BUG FIX: the AWS/Lambda IteratorAge metric supports only the FunctionName,
# Resource, and ExecutedVersion dimensions — there is no "EventSourceMapping"
# dimension. The previous per-mapping alarms therefore matched no metric data
# and, combined with treat_missing_data = "notBreaching", could never fire.
# Both DynamoDB stream mappings (control and joblog) feed the same
# stream_router function, so one function-level alarm covers both streams.
resource "aws_cloudwatch_metric_alarm" "stream_iterator_age" {
  alarm_name          = "${var.environment}-interlock-stream-iterator-age"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = "IteratorAge"
  namespace           = "AWS/Lambda"
  period              = 300
  statistic           = "Maximum"
  threshold           = 300000 # 5 minutes in milliseconds
  alarm_description   = "DynamoDB stream iterator age for stream_router exceeds 5 minutes"
  treat_missing_data  = "notBreaching"

  dimensions = {
    FunctionName = aws_lambda_function.stream_router.function_name
  }

  alarm_actions = local.alarm_actions
  ok_actions    = local.alarm_actions
  tags          = var.tags
}
111 changes: 110 additions & 1 deletion deploy/terraform/eventbridge.tf
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ resource "aws_cloudwatch_event_rule" "alert_events" {
tags = var.tags

event_pattern = jsonencode({
source = ["interlock"]
source = ["interlock"]
detail-type = [
"SLA_WARNING",
"SLA_BREACH",
Expand All @@ -80,6 +80,15 @@ resource "aws_cloudwatch_event_rule" "alert_events" {
"SCHEDULE_MISSED",
"DATA_DRIFT",
"JOB_POLL_EXHAUSTED",
"POST_RUN_DRIFT",
"POST_RUN_DRIFT_INFLIGHT",
"POST_RUN_FAILED",
"POST_RUN_SENSOR_MISSING",
"RERUN_REJECTED",
"LATE_DATA_ARRIVAL",
"TRIGGER_RECOVERED",
"BASELINE_CAPTURE_FAILED",
"PIPELINE_EXCLUDED",
]
})
}
Expand All @@ -90,3 +99,103 @@ resource "aws_cloudwatch_event_target" "alert_sqs" {
target_id = "alert-sqs"
arn = aws_sqs_queue.alert.arn
}

# -----------------------------------------------------------------------------
# CloudWatch Alarm state changes → event pipeline (default bus)
#
# CW alarms automatically publish state-change events to the default bus.
# Input transformers reshape them into InterlockEvent format so event-sink
# and alert-dispatcher handle them natively — no Go code changes needed.
# -----------------------------------------------------------------------------

# Rule: ALL alarm state changes → event-sink (logging)
# Matches EVERY state transition (into ALARM, OK, or INSUFFICIENT_DATA) for
# alarms whose names carry this environment's "<env>-interlock-" prefix.
# CloudWatch publishes these state-change events to the default bus
# automatically, so the rule must live on "default", not the custom bus.
resource "aws_cloudwatch_event_rule" "cw_alarm_log" {
  name           = "${var.environment}-interlock-cw-alarm-log"
  description    = "Route interlock CloudWatch alarm state changes to event-sink"
  event_bus_name = "default"
  tags           = var.tags

  event_pattern = jsonencode({
    source      = ["aws.cloudwatch"]
    detail-type = ["CloudWatch Alarm State Change"]
    detail = {
      alarmName = [{ prefix = "${var.environment}-interlock-" }]
    }
  })
}

resource "aws_cloudwatch_event_target" "cw_alarm_event_sink" {
  rule      = aws_cloudwatch_event_rule.cw_alarm_log.name
  target_id = "cw-alarm-event-sink"
  arn       = aws_lambda_function.event_sink.arn

  # Reshape the raw alarm state-change event into the InterlockEvent envelope
  # so event-sink can consume it without Go code changes.
  input_transformer {
    input_paths = {
      alarmName = "$.detail.alarmName"
      state     = "$.detail.state.value"
      reason    = "$.detail.state.reason"
    }

    # NOTE(review): EventBridge input transformers do not JSON-escape
    # substituted values — a double quote or newline inside state.reason would
    # produce invalid JSON at the target. Alarm reasons generated by CloudWatch
    # are typically quote-free, but confirm event-sink tolerates a parse
    # failure here.
    input_template = <<-EOT
      {
        "source": "interlock",
        "detail-type": "INFRA_ALARM",
        "detail": {
          "pipelineId": "INFRASTRUCTURE",
          "message": "<alarmName>: <state> — <reason>"
        }
      }
    EOT
  }
}

# Allow the default-bus alarm-logging rule to invoke the event-sink Lambda;
# scoped to this specific rule ARN.
resource "aws_lambda_permission" "event_sink_cw_alarm" {
  statement_id  = "AllowCWAlarmEventBridgeInvoke"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.event_sink.function_name
  principal     = "events.amazonaws.com"
  source_arn    = aws_cloudwatch_event_rule.cw_alarm_log.arn
}

# Rule: ALARM state only → SQS alert queue (Slack). Narrower than cw_alarm_log
# above — it additionally filters on state.value == "ALARM" so OK/recovery
# transitions are logged but never paged.
resource "aws_cloudwatch_event_rule" "cw_alarm_alert" {
  name           = "${var.environment}-interlock-cw-alarm-alert"
  description    = "Route interlock CloudWatch ALARM transitions to Slack"
  event_bus_name = "default"
  tags           = var.tags

  event_pattern = jsonencode({
    source      = ["aws.cloudwatch"]
    detail-type = ["CloudWatch Alarm State Change"]
    detail = {
      alarmName = [{ prefix = "${var.environment}-interlock-" }]
      state = {
        value = ["ALARM"]
      }
    }
  })
}

# Deliver ALARM transitions to the alert SQS queue in InterlockEvent shape so
# alert-dispatcher handles them like any other pipeline event. The alert
# queue's policy must grant sqs:SendMessage for this rule's ARN (see
# alerting.tf).
resource "aws_cloudwatch_event_target" "cw_alarm_sqs" {
  rule      = aws_cloudwatch_event_rule.cw_alarm_alert.name
  target_id = "cw-alarm-alert-sqs"
  arn       = aws_sqs_queue.alert.arn

  input_transformer {
    input_paths = {
      alarmName = "$.detail.alarmName"
      reason    = "$.detail.state.reason"
    }

    # State is hard-coded to "ALARM" because the rule only matches that
    # transition. NOTE(review): substituted values are not JSON-escaped; a
    # double quote in the alarm reason would break the JSON payload — confirm
    # alert-dispatcher fails gracefully on malformed bodies.
    input_template = <<-EOT
      {
        "source": "interlock",
        "detail-type": "INFRA_ALARM",
        "detail": {
          "pipelineId": "INFRASTRUCTURE",
          "message": "<alarmName>: ALARM — <reason>"
        }
      }
    EOT
  }
}
Loading