From b06a8862a46f28924773d501cb1ed977af66e6ac Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 22:46:23 +0700 Subject: [PATCH 01/17] fix: validation mode case-insensitive matching (BUG-8) EvaluateRules now uses strings.ToUpper(mode) so "any", "Any", and "ANY" all route to the ANY branch. Previously lowercase variants fell through to the default ALL case. --- internal/validation/engine.go | 3 ++- internal/validation/engine_test.go | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/internal/validation/engine.go b/internal/validation/engine.go index 0979e1b..b693aca 100644 --- a/internal/validation/engine.go +++ b/internal/validation/engine.go @@ -4,6 +4,7 @@ package validation import ( "fmt" "strconv" + "strings" "time" "github.com/dwsmith1983/interlock/pkg/types" @@ -38,7 +39,7 @@ func EvaluateRules(mode string, rules []types.ValidationRule, sensors map[string } var passed bool - switch mode { + switch strings.ToUpper(mode) { case "ANY": passed = passCount > 0 default: // "ALL" diff --git a/internal/validation/engine_test.go b/internal/validation/engine_test.go index 2d1f7cb..1791321 100644 --- a/internal/validation/engine_test.go +++ b/internal/validation/engine_test.go @@ -448,6 +448,36 @@ func TestToFloat64_Unsupported(t *testing.T) { assert.False(t, ok) } +// --- BUG-8 characterization: lowercase "any" treated as "ALL" --- + +func TestEvaluateRules_LowercaseAny_TreatedAsAll(t *testing.T) { + // BUG-8 FIXED: lowercase "any" now works via strings.ToUpper. 
+ rules := []types.ValidationRule{ + {Key: "s1", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + {Key: "s2", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + } + sensors := map[string]map[string]interface{}{ + "s1": {"count": float64(20)}, // passes + "s2": {"count": float64(5)}, // fails + } + result := EvaluateRules("any", rules, sensors, time.Now()) + assert.True(t, result.Passed, "BUG-8 FIXED: lowercase 'any' now works") +} + +func TestEvaluateRules_MixedCaseAny_TreatedAsAll(t *testing.T) { + // BUG-8 FIXED: mixed-case "Any" now works via strings.ToUpper. + rules := []types.ValidationRule{ + {Key: "s1", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + {Key: "s2", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + } + sensors := map[string]map[string]interface{}{ + "s1": {"count": float64(20)}, + "s2": {"count": float64(5)}, + } + result := EvaluateRules("Any", rules, sensors, time.Now()) + assert.True(t, result.Passed, "BUG-8 FIXED: lowercase 'any' now works") +} + // --- EvaluateRules default mode (not "ALL" or "ANY") --- func TestEvaluateRules_DefaultMode_FallsToALL(t *testing.T) { From a4dd4d64b7fd8d401992be69444a59941f6e5a69 Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 22:46:30 +0700 Subject: [PATCH 02/17] security: harden IAM policies and EventBridge bus access (SEC-1,2,4,5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - lambda_trigger_arns default [] with precondition (SEC-1) - Slack plaintext token deprecation warning via check block (SEC-2) - New variables for trigger IAM scoping: glue_job_arns, emr_cluster_arns, emr_serverless_app_arns, sfn_trigger_arns — all default [] (SEC-4) - EventBridge bus policy restricts PutEvents to Lambda roles (SEC-5) --- deploy/terraform/eventbridge.tf | 19 ++++++++++++ deploy/terraform/lambda.tf | 54 ++++++++++++++++++++++++++++++--- deploy/terraform/variables.tf | 26 +++++++++++++++- 3 files changed, 94 
insertions(+), 5 deletions(-) diff --git a/deploy/terraform/eventbridge.tf b/deploy/terraform/eventbridge.tf index c55a45a..ba817b4 100644 --- a/deploy/terraform/eventbridge.tf +++ b/deploy/terraform/eventbridge.tf @@ -3,6 +3,25 @@ resource "aws_cloudwatch_event_bus" "interlock" { tags = var.tags } +resource "aws_cloudwatch_event_bus_policy" "interlock_bus" { + event_bus_name = aws_cloudwatch_event_bus.interlock.name + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "AllowInterlockLambdas" + Effect = "Allow" + Principal = { + AWS = [for name in local.lambda_names : aws_iam_role.lambda[name].arn] + } + Action = "events:PutEvents" + Resource = aws_cloudwatch_event_bus.interlock.arn + } + ] + }) +} + # Watchdog schedule resource "aws_cloudwatch_event_rule" "watchdog" { name = "${var.environment}-interlock-watchdog" diff --git a/deploy/terraform/lambda.tf b/deploy/terraform/lambda.tf index 0d636c1..d3cf012 100644 --- a/deploy/terraform/lambda.tf +++ b/deploy/terraform/lambda.tf @@ -541,6 +541,17 @@ resource "aws_lambda_event_source_mapping" "joblog_stream" { } } +# ============================================================================= +# Security checks +# ============================================================================= + +check "slack_token_deprecation" { + assert { + condition = var.slack_bot_token == "" || var.slack_secret_arn != "" + error_message = "DEPRECATED: Passing a plaintext Slack bot token is deprecated. Use var.slack_secret_arn with an AWS Secrets Manager ARN instead. Plaintext path still works but will be removed in a future version." 
+ } +} + # ============================================================================= # Conditional trigger permissions for orchestrator (opt-in per trigger type) # ============================================================================= @@ -552,13 +563,20 @@ resource "aws_iam_role_policy" "glue_trigger" { name = "glue-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.glue_job_arns) > 0 + error_message = "glue_job_arns must be non-empty when enable_glue_trigger is true." + } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [ { Effect = "Allow" Action = ["glue:StartJobRun", "glue:GetJobRun"] - Resource = "*" + Resource = var.glue_job_arns }, { Sid = "GlueLogVerification" @@ -580,12 +598,19 @@ resource "aws_iam_role_policy" "emr_trigger" { name = "emr-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.emr_cluster_arns) > 0 + error_message = "emr_cluster_arns must be non-empty when enable_emr_trigger is true." + } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ Effect = "Allow" Action = ["elasticmapreduce:AddJobFlowSteps", "elasticmapreduce:DescribeStep"] - Resource = "*" + Resource = var.emr_cluster_arns }] }) } @@ -597,12 +622,19 @@ resource "aws_iam_role_policy" "emr_serverless_trigger" { name = "emr-serverless-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.emr_serverless_app_arns) > 0 + error_message = "emr_serverless_app_arns must be non-empty when enable_emr_serverless_trigger is true." 
+ } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ Effect = "Allow" Action = ["emr-serverless:StartJobRun", "emr-serverless:GetJobRun"] - Resource = "*" + Resource = var.emr_serverless_app_arns }] }) } @@ -614,12 +646,19 @@ resource "aws_iam_role_policy" "sfn_trigger" { name = "sfn-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.sfn_trigger_arns) > 0 + error_message = "sfn_trigger_arns must be non-empty when enable_sfn_trigger is true." + } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ Effect = "Allow" Action = ["states:StartExecution", "states:DescribeExecution"] - Resource = "*" + Resource = var.sfn_trigger_arns }] }) } @@ -631,6 +670,13 @@ resource "aws_iam_role_policy" "lambda_trigger" { name = "lambda-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.lambda_trigger_arns) > 0 + error_message = "lambda_trigger_arns must be non-empty when enable_lambda_trigger is true." + } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ diff --git a/deploy/terraform/variables.tf b/deploy/terraform/variables.tf index 8853cac..eb7615a 100644 --- a/deploy/terraform/variables.tf +++ b/deploy/terraform/variables.tf @@ -139,5 +139,29 @@ variable "enable_lambda_trigger" { variable "lambda_trigger_arns" { description = "ARNs of Lambda functions the orchestrator may invoke as pipeline triggers" type = list(string) - default = ["*"] + default = [] +} + +variable "glue_job_arns" { + description = "ARNs of Glue jobs that the orchestrator Lambda can start. Required when enable_glue_trigger is true." + type = list(string) + default = [] +} + +variable "emr_cluster_arns" { + description = "ARNs of EMR clusters the orchestrator can submit steps to. Required when enable_emr_trigger is true." 
+ type = list(string) + default = [] +} + +variable "emr_serverless_app_arns" { + description = "ARNs of EMR Serverless applications. Required when enable_emr_serverless_trigger is true." + type = list(string) + default = [] +} + +variable "sfn_trigger_arns" { + description = "ARNs of Step Functions the orchestrator can start. Required when enable_sfn_trigger is true." + type = list(string) + default = [] } From 6b9b0e99a1dea9c5e34cd156fc8c70ce8656e08d Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 22:46:35 +0700 Subject: [PATCH 03/17] fix: suppress SLA_MET when pipeline never ran and log reconcile errors (BUG-5, CQ-5) BUG-5: handleSLACancel now checks for trigger existence before publishing SLA verdict. Pipelines that were never triggered no longer emit false SLA_MET events. CQ-5: Replaced _ = publishEvent(...) with error-logged calls in SLA reconcile path. --- internal/lambda/sla_monitor.go | 40 +++++++++++++++++++++-------- internal/lambda/sla_monitor_test.go | 40 +++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 11 deletions(-) diff --git a/internal/lambda/sla_monitor.go b/internal/lambda/sla_monitor.go index e4164c8..0131b2f 100644 --- a/internal/lambda/sla_monitor.go +++ b/internal/lambda/sla_monitor.go @@ -342,17 +342,31 @@ func handleSLACancel(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAMo } } - // Always publish the verdict. For first runs, Scheduler entries would have - // fired WARNING/BREACH but are now deleted — MET is the only new signal. - // For reruns, the Scheduler entries were already deleted by the first run's - // cancel, so this publish is the only path to a notification. + // Only publish a verdict if the pipeline was actually triggered. + // If no trigger record exists, the pipeline never ran — publishing SLA_MET + // would be misleading since the SLA wasn't "met" (nothing executed). 
+ publish := true + if d.Store != nil { + tr, err := d.Store.GetTrigger(ctx, input.PipelineID, input.ScheduleID, input.Date) + if err != nil { + d.Logger.WarnContext(ctx, "trigger lookup failed in cancel, proceeding with verdict", + "pipeline", input.PipelineID, "error", err) + } else if tr == nil { + d.Logger.InfoContext(ctx, "skipping SLA verdict — pipeline was never triggered", + "pipeline", input.PipelineID, "date", input.Date, "alertType", alertType) + publish = false + } + } + d.Logger.InfoContext(ctx, "cancelled SLA schedules", "pipeline", input.PipelineID, "alertType", alertType, ) - if err := publishEvent(ctx, d, alertType, input.PipelineID, input.ScheduleID, input.Date, - fmt.Sprintf("pipeline %s: %s", input.PipelineID, alertType)); err != nil { - return SLAMonitorOutput{}, fmt.Errorf("publish SLA cancel verdict: %w", err) + if publish { + if err := publishEvent(ctx, d, alertType, input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: %s", input.PipelineID, alertType)); err != nil { + return SLAMonitorOutput{}, fmt.Errorf("publish SLA cancel verdict: %w", err) + } } return SLAMonitorOutput{ @@ -423,13 +437,17 @@ func handleSLAReconcile(ctx context.Context, d *Deps, input SLAMonitorInput) (SL var alertType string switch { case now.After(breachAt) || now.Equal(breachAt): - _ = publishEvent(ctx, d, "SLA_BREACH", input.PipelineID, input.ScheduleID, input.Date, - fmt.Sprintf("pipeline %s: SLA_BREACH", input.PipelineID), reconcileDetail) + if err := publishEvent(ctx, d, "SLA_BREACH", input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: SLA_BREACH", input.PipelineID), reconcileDetail); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", "SLA_BREACH", "error", err) + } alertType = "SLA_BREACH" case now.After(warningAt) || now.Equal(warningAt): // Past warning but before breach — fire warning only - _ = publishEvent(ctx, d, "SLA_WARNING", input.PipelineID, input.ScheduleID, input.Date, - 
fmt.Sprintf("pipeline %s: SLA_WARNING", input.PipelineID), reconcileDetail) + if err := publishEvent(ctx, d, "SLA_WARNING", input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: SLA_WARNING", input.PipelineID), reconcileDetail); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", "SLA_WARNING", "error", err) + } alertType = "SLA_WARNING" default: alertType = "SLA_MET" diff --git a/internal/lambda/sla_monitor_test.go b/internal/lambda/sla_monitor_test.go index c8228c1..10c969c 100644 --- a/internal/lambda/sla_monitor_test.go +++ b/internal/lambda/sla_monitor_test.go @@ -520,6 +520,46 @@ func TestSLAMonitor_Cancel_RecalculatesWhenTimesNotProvided(t *testing.T) { } } +// --------------------------------------------------------------------------- +// BUG-5 characterization: SLA_MET published when pipeline never ran +// --------------------------------------------------------------------------- + +func TestSLAMonitor_Cancel_NeverTriggered_PublishesMet(t *testing.T) { + // BUG-5 characterization: SLA_MET fires even with no trigger/job records. + // Pipeline was never started — there should be no SLA verdict at all. 
+ sched := &mockScheduler{} + eb := &mockEventBridge{} + mock := newMockDDB() + s := &store.Store{ + Client: mock, + ControlTable: testControlTable, + JobLogTable: "joblog", + RerunTable: "rerun", + } + d := &lambda.Deps{ + Store: s, + Scheduler: sched, + SchedulerGroupName: "interlock-sla", + EventBridge: eb, + EventBusName: "test-bus", + Logger: slog.Default(), + } + + // No trigger, no joblog — pipeline was never started + out, err := lambda.HandleSLAMonitor(context.Background(), d, lambda.SLAMonitorInput{ + Mode: "cancel", + PipelineID: "never-ran", + ScheduleID: "daily", + Date: "2026-03-13", + WarningAt: "2099-12-31T23:45:00Z", + BreachAt: "2099-12-31T23:59:00Z", + }) + require.NoError(t, err) + // BUG-5 fixed: AlertType still set for SFN flow, but no EventBridge event published + assert.Equal(t, "SLA_MET", out.AlertType, "AlertType should still be set for SFN state machine") + assert.Empty(t, eb.events, "no EventBridge events should be published when pipeline was never triggered") +} + // --------------------------------------------------------------------------- // Fire-alert tests // --------------------------------------------------------------------------- From 59875c4c589ff175587897eb8a6434f0c8e50381 Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 22:46:52 +0700 Subject: [PATCH 04/17] fix: trigger deadline uses schedule timezone with SLA fallback (BUG-6) closeSensorTriggerWindow now reads timezone from cfg.Schedule.Timezone (the schedule's own timezone) instead of cfg.SLA.Timezone. Falls back to SLA timezone if schedule timezone is not set. Prevents incorrect deadline calculation when schedule and SLA use different timezones. 
--- internal/lambda/export_test.go | 10 +- internal/lambda/watchdog.go | 4 +- internal/lambda/watchdog_test.go | 170 ++++++++++++++++++++++++++++++- 3 files changed, 177 insertions(+), 7 deletions(-) diff --git a/internal/lambda/export_test.go b/internal/lambda/export_test.go index 738c420..3c5f829 100644 --- a/internal/lambda/export_test.go +++ b/internal/lambda/export_test.go @@ -3,8 +3,16 @@ // even to files in the non-_test package when placed here). package lambda -import "github.com/dwsmith1983/interlock/pkg/types" +import ( + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) // IsExcludedDate re-exports isExcludedDate for white-box unit testing from // the external test package (package lambda_test). var IsExcludedDate func(cfg *types.PipelineConfig, dateStr string) bool = isExcludedDate + +// ResolveTriggerDeadlineTime re-exports resolveTriggerDeadlineTime for +// white-box unit testing from the external test package (package lambda_test). +var ResolveTriggerDeadlineTime func(deadline, date, timezone string) time.Time = resolveTriggerDeadlineTime diff --git a/internal/lambda/watchdog.go b/internal/lambda/watchdog.go index f6b65cf..7bf5605 100644 --- a/internal/lambda/watchdog.go +++ b/internal/lambda/watchdog.go @@ -721,8 +721,8 @@ func closeSensorTriggerWindow(ctx context.Context, d *Deps, pipelineID, schedule // Compute the absolute trigger deadline time directly — we do NOT use // handleSLACalculate here because it rolls daily deadlines forward 24h // when past, which defeats the purpose of checking for expiry. 
- tz := "" - if cfg.SLA != nil { + tz := cfg.Schedule.Timezone + if tz == "" && cfg.SLA != nil { tz = cfg.SLA.Timezone } triggerDeadline := resolveTriggerDeadlineTime(cfg.Schedule.Trigger.Deadline, date, tz) diff --git a/internal/lambda/watchdog_test.go b/internal/lambda/watchdog_test.go index baad19c..bc77b82 100644 --- a/internal/lambda/watchdog_test.go +++ b/internal/lambda/watchdog_test.go @@ -1875,9 +1875,9 @@ func TestWatchdog_PostRunSensorMissing(t *testing.T) { // Seed COMPLETED trigger for today. seedTriggerWithStatus(mock, "gold-revenue", today, types.TriggerStatusCompleted) - // Seed baseline (written at completion time). + // Seed baseline (written at completion time, namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#"+today, map[string]interface{}{ - "sensor_count": float64(100), + "quality-check": map[string]interface{}{"sensor_count": float64(100)}, }) // Seed a success job event with timestamp 3h before now (well past the 1h timeout). @@ -1925,9 +1925,9 @@ func TestWatchdog_PostRunSensorPresent(t *testing.T) { // Seed COMPLETED trigger for today. seedTriggerWithStatus(mock, "gold-revenue", today, types.TriggerStatusCompleted) - // Seed baseline. + // Seed baseline (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#"+today, map[string]interface{}{ - "sensor_count": float64(100), + "quality-check": map[string]interface{}{"sensor_count": float64(100)}, }) // Seed a success job event 3h before now. @@ -3189,3 +3189,165 @@ func TestWatchdog_DryRun_SkipsAllSchedulingAndAlerts(t *testing.T) { "dry-run pipeline must not produce %s events", prohibited) } } + +// TestResolveTriggerDeadlineTime_UsesScheduleTimezone verifies that the trigger +// deadline is resolved in the schedule's timezone, not the SLA timezone. +// BUG-6: closeSensorTriggerWindow previously used cfg.SLA.Timezone exclusively, +// ignoring cfg.Schedule.Timezone. The fix prefers Schedule.Timezone with SLA as +// fallback. 
+func TestResolveTriggerDeadlineTime_UsesScheduleTimezone(t *testing.T) { + tests := []struct { + name string + deadline string + date string + timezone string + wantHour int + wantMin int + wantTZ string + }{ + { + name: "daily deadline in US/Eastern", + deadline: "09:00", + date: "2026-03-09", + timezone: "US/Eastern", + wantHour: 9, + wantMin: 0, + wantTZ: "EDT", + }, + { + name: "daily deadline in Europe/Berlin", + deadline: "09:00", + date: "2026-03-09", + timezone: "Europe/Berlin", + wantHour: 9, + wantMin: 0, + wantTZ: "CET", + }, + { + name: "hourly deadline in Asia/Tokyo", + deadline: ":45", + date: "2026-03-09T13", + timezone: "Asia/Tokyo", + wantHour: 14, // hour+1 for processing window + wantMin: 45, + wantTZ: "JST", + }, + { + name: "empty timezone falls back to UTC", + deadline: "09:00", + date: "2026-03-09", + timezone: "", + wantHour: 9, + wantMin: 0, + wantTZ: "UTC", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := lambda.ResolveTriggerDeadlineTime(tt.deadline, tt.date, tt.timezone) + require.False(t, got.IsZero(), "expected non-zero time") + assert.Equal(t, tt.wantHour, got.Hour(), "hour mismatch") + assert.Equal(t, tt.wantMin, got.Minute(), "minute mismatch") + zoneName, _ := got.Zone() + assert.Equal(t, tt.wantTZ, zoneName, "timezone mismatch") + }) + } +} + +// TestCloseSensorTriggerWindow_PrefersScheduleTimezone is an integration-level +// test verifying that closeSensorTriggerWindow resolves the trigger deadline in +// the schedule timezone rather than the SLA timezone when both are set. +// +// BUG-6: With Schedule.Timezone="US/Eastern" (UTC-4 in March) and +// SLA.Timezone="Asia/Tokyo" (UTC+9), a 09:00 trigger deadline should resolve +// to 09:00 US/Eastern (13:00 UTC), NOT 09:00 Asia/Tokyo (00:00 UTC). 
+func TestCloseSensorTriggerWindow_PrefersScheduleTimezone(t *testing.T) { + mock := newMockDDB() + d, _, ebMock := testDeps(mock) + schedMock := &mockScheduler{} + d.Scheduler = schedMock + d.SLAMonitorARN = "arn:aws:lambda:us-east-1:123:function:sla-monitor" + d.SchedulerRoleARN = "arn:aws:iam::123:role/scheduler-role" + d.SchedulerGroupName = "interlock-sla" + + // Fix time at 13:30 UTC on 2026-03-09. In US/Eastern (EDT, UTC-4), + // this is 09:30 — past the 09:00 trigger deadline. + // In Asia/Tokyo (JST, UTC+9), 09:00 JST = 00:00 UTC on 2026-03-09, + // so 13:30 UTC is also past 09:00 JST. + // + // The critical test: at 12:30 UTC (08:30 Eastern), the deadline should + // NOT have expired in the schedule timezone (US/Eastern), even though it + // would have expired if resolved in Asia/Tokyo. + beforeDeadlineUTC := time.Date(2026, 3, 9, 12, 30, 0, 0, time.UTC) + d.NowFunc = func() time.Time { return beforeDeadlineUTC } + d.StartedAt = beforeDeadlineUTC.Add(-5 * time.Minute) + + cfg := types.PipelineConfig{ + Pipeline: types.PipelineIdentity{ID: "tz-bug6-pipeline"}, + Schedule: types.ScheduleConfig{ + Timezone: "US/Eastern", // EDT = UTC-4 in March + Trigger: &types.TriggerCondition{ + Key: "sensor-data", + Check: "equals", + Field: "ready", + Value: true, + Deadline: "09:00", // 09:00 Eastern = 13:00 UTC + }, + Evaluation: types.EvaluationWindow{Window: "1h", Interval: "5m"}, + }, + SLA: &types.SLAConfig{ + Deadline: "10:00", + Timezone: "Asia/Tokyo", // JST = UTC+9; 09:00 JST = 00:00 UTC + }, + Validation: types.ValidationConfig{Trigger: "ALL"}, + Job: types.JobConfig{Type: "command", Config: map[string]interface{}{"command": "echo hello"}}, + } + seedConfig(mock, cfg) + + err := lambda.HandleWatchdog(context.Background(), d) + require.NoError(t, err) + + // At 12:30 UTC = 08:30 Eastern, the 09:00 Eastern deadline has NOT + // expired. No SENSOR_DEADLINE_EXPIRED event should be published. 
+ // (Under the old buggy code that used SLA.Timezone=Asia/Tokyo, + // 09:00 JST = 00:00 UTC, so it would have considered the deadline + // expired and published the event.) + ebMock.mu.Lock() + for _, ev := range ebMock.events { + assert.NotEqual(t, string(types.EventSensorDeadlineExpired), *ev.Entries[0].DetailType, + "deadline should NOT be expired at 08:30 Eastern (12:30 UTC)") + } + ebMock.mu.Unlock() + + // Now advance to 13:30 UTC = 09:30 Eastern — past the 09:00 Eastern deadline. + afterDeadlineUTC := time.Date(2026, 3, 9, 13, 30, 0, 0, time.UTC) + d.NowFunc = func() time.Time { return afterDeadlineUTC } + d.StartedAt = afterDeadlineUTC.Add(-5 * time.Minute) + + // Reset mock state for fresh run. + mock2 := newMockDDB() + d2, _, ebMock2 := testDeps(mock2) + d2.Scheduler = &mockScheduler{} + d2.SLAMonitorARN = d.SLAMonitorARN + d2.SchedulerRoleARN = d.SchedulerRoleARN + d2.SchedulerGroupName = d.SchedulerGroupName + d2.NowFunc = func() time.Time { return afterDeadlineUTC } + d2.StartedAt = afterDeadlineUTC.Add(-5 * time.Minute) + seedConfig(mock2, cfg) + + err = lambda.HandleWatchdog(context.Background(), d2) + require.NoError(t, err) + + // At 13:30 UTC = 09:30 Eastern, the 09:00 Eastern deadline IS expired. + // SENSOR_DEADLINE_EXPIRED should be published. + ebMock2.mu.Lock() + defer ebMock2.mu.Unlock() + var found bool + for _, ev := range ebMock2.events { + if *ev.Entries[0].DetailType == string(types.EventSensorDeadlineExpired) { + found = true + break + } + } + assert.True(t, found, "expected SENSOR_DEADLINE_EXPIRED at 09:30 Eastern (13:30 UTC)") +} From 351eaa66a77817a83d95d2a2de580d1f8d06b87b Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 22:47:07 +0700 Subject: [PATCH 05/17] fix: extract shared drift detection with zero-value support (BUG-1, DRY-1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New ExtractFloatOk distinguishes absent keys from actual zero values. 
New DetectDrift consolidates 3 identical drift comparison sites into one shared function. Transitions like 5000→0 now correctly trigger drift detection instead of being silently skipped. --- internal/lambda/drift.go | 63 +++++++++++++++++++++++++++++++++++ internal/lambda/drift_test.go | 62 ++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 internal/lambda/drift.go create mode 100644 internal/lambda/drift_test.go diff --git a/internal/lambda/drift.go b/internal/lambda/drift.go new file mode 100644 index 0000000..7088aa6 --- /dev/null +++ b/internal/lambda/drift.go @@ -0,0 +1,63 @@ +package lambda + +import ( + "math" + "strconv" +) + +// ExtractFloatOk retrieves a numeric value from a sensor data map. +// Returns (value, true) if the key exists and is numeric, (0, false) otherwise. +// Unlike ExtractFloat, this distinguishes zero values from missing keys. +func ExtractFloatOk(data map[string]interface{}, key string) (float64, bool) { + if data == nil { + return 0, false + } + v, ok := data[key] + if !ok { + return 0, false + } + switch n := v.(type) { + case float64: + return n, true + case string: + f, err := strconv.ParseFloat(n, 64) + if err != nil { + return 0, false + } + return f, true + default: + return 0, false + } +} + +// DriftResult holds the outcome of a drift comparison. +type DriftResult struct { + Drifted bool + Previous float64 + Current float64 + Delta float64 + PrevFound bool + CurrFound bool +} + +// DetectDrift compares baseline and current sensor data for a drift field. +// Both values must be present for drift to be detected. Returns whether +// the absolute delta exceeds the threshold. 
+func DetectDrift(baseline, current map[string]interface{}, driftField string, threshold float64) DriftResult { + prev, prevOk := ExtractFloatOk(baseline, driftField) + curr, currOk := ExtractFloatOk(current, driftField) + + result := DriftResult{ + Previous: prev, + Current: curr, + PrevFound: prevOk, + CurrFound: currOk, + } + + if prevOk && currOk { + result.Delta = curr - prev + result.Drifted = math.Abs(result.Delta) > threshold + } + + return result +} diff --git a/internal/lambda/drift_test.go b/internal/lambda/drift_test.go new file mode 100644 index 0000000..936234e --- /dev/null +++ b/internal/lambda/drift_test.go @@ -0,0 +1,62 @@ +package lambda + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestExtractFloatOk(t *testing.T) { + tests := []struct { + name string + data map[string]interface{} + key string + wantVal float64 + wantOk bool + }{ + {"present float", map[string]interface{}{"count": float64(42)}, "count", 42, true}, + {"present zero", map[string]interface{}{"count": float64(0)}, "count", 0, true}, + {"present string", map[string]interface{}{"count": "123.5"}, "count", 123.5, true}, + {"missing key", map[string]interface{}{}, "count", 0, false}, + {"nil map", nil, "count", 0, false}, + {"wrong type", map[string]interface{}{"count": true}, "count", 0, false}, + {"invalid string", map[string]interface{}{"count": "abc"}, "count", 0, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + val, ok := ExtractFloatOk(tt.data, tt.key) + assert.Equal(t, tt.wantOk, ok) + assert.InDelta(t, tt.wantVal, val, 0.001) + }) + } +} + +func TestDetectDrift(t *testing.T) { + m := func(k string, v float64) map[string]interface{} { + return map[string]interface{}{k: v} + } + tests := []struct { + name string + baseline map[string]interface{} + current map[string]interface{} + field string + threshold float64 + wantDrift bool + }{ + {"5000→0 drifts", m("count", 5000.0), m("count", 0.0), "count", 0, true}, + {"0→5000 
drifts", m("count", 0.0), m("count", 5000.0), "count", 0, true}, + {"same value no drift", m("count", 100.0), m("count", 100.0), "count", 0, false}, + {"within threshold", m("count", 100.0), m("count", 150.0), "count", 100, false}, + {"exceeds threshold", m("count", 100.0), m("count", 250.0), "count", 100, true}, + {"prev missing no drift", map[string]interface{}{}, m("count", 100.0), "count", 0, false}, + {"curr missing no drift", m("count", 100.0), map[string]interface{}{}, "count", 0, false}, + {"both missing no drift", map[string]interface{}{}, map[string]interface{}{}, "count", 0, false}, + {"negative drift", m("count", 100.0), m("count", 50.0), "count", 0, true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := DetectDrift(tt.baseline, tt.current, tt.field, tt.threshold) + assert.Equal(t, tt.wantDrift, result.Drifted) + }) + } +} From 76a01c4b8d49687c4f2df3c0b1baf1aec7db2e6c Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 22:47:19 +0700 Subject: [PATCH 06/17] fix: batch item failures, baseline namespacing, rerun ordering, and epoch normalization (BUG-2,3,4,9,10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG-4: HandleStreamEvent returns DynamoDBEventResponse with failed record EventIDs for Lambda partial batch retry. BUG-10: Namespace postrun baseline by rule key to prevent field collision between rules with the same field name. Clean break — existing flat baselines self-heal on next pipeline completion. BUG-2: RemapPerPeriodSensors collects additions in staging map to avoid nondeterministic map mutation during range iteration. BUG-3: Reorder handleRerunRequest to lock-before-write, preventing orphaned rerun records when lock reset fails. BUG-9: Normalize updatedAt epoch timestamps < 1e12 to milliseconds for consistent rerun freshness comparison. 
--- cmd/lambda/stream-router/main.go | 3 +- internal/lambda/dryrun.go | 29 +- internal/lambda/e2e_test.go | 99 +++--- internal/lambda/orchestrator.go | 15 +- internal/lambda/orchestrator_unit_test.go | 43 +++ internal/lambda/postrun.go | 110 +++--- internal/lambda/rerun.go | 43 ++- internal/lambda/stream_router.go | 13 +- internal/lambda/stream_router_test.go | 407 +++++++++++++--------- 9 files changed, 483 insertions(+), 279 deletions(-) diff --git a/cmd/lambda/stream-router/main.go b/cmd/lambda/stream-router/main.go index 1354881..60c2430 100644 --- a/cmd/lambda/stream-router/main.go +++ b/cmd/lambda/stream-router/main.go @@ -10,6 +10,7 @@ import ( "os" "time" + "github.com/aws/aws-lambda-go/events" "github.com/aws/aws-lambda-go/lambda" awsconfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/service/dynamodb" @@ -53,7 +54,7 @@ func main() { Logger: logger, } - lambda.Start(func(ctx context.Context, event ilambda.StreamEvent) error { + lambda.Start(func(ctx context.Context, event ilambda.StreamEvent) (events.DynamoDBEventResponse, error) { return ilambda.HandleStreamEvent(ctx, deps, event) }) } diff --git a/internal/lambda/dryrun.go b/internal/lambda/dryrun.go index d9a716b..eff6514 100644 --- a/internal/lambda/dryrun.go +++ b/internal/lambda/dryrun.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "math" + "strings" "time" "github.com/dwsmith1983/interlock/internal/validation" @@ -227,22 +228,34 @@ func handleDryRunPostRunSensor(ctx context.Context, d *Deps, cfg *types.Pipeline return nil } + // Find matching post-run rule for this sensor key. + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break + } + } + if ruleBaseline == nil { + return nil // No baseline for this rule (stale or first run). + } + // Compare drift. 
driftField := resolveDriftField(cfg.PostRun) - prevCount := ExtractFloat(baseline, driftField) - currCount := ExtractFloat(sensorData, driftField) threshold := 0.0 if cfg.PostRun.DriftThreshold != nil { threshold = *cfg.PostRun.DriftThreshold } - - if prevCount > 0 && currCount > 0 && math.Abs(currCount-prevCount) > threshold { + dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { if pubErr := publishEvent(ctx, d, string(types.EventDryRunDrift), pipelineID, scheduleID, date, - fmt.Sprintf("dry-run: drift detected for %s: %.0f → %.0f — would re-run", pipelineID, prevCount, currCount), + fmt.Sprintf("dry-run: drift detected for %s: %.0f → %.0f — would re-run", pipelineID, dr.Previous, dr.Current), map[string]interface{}{ - "previousCount": prevCount, - "currentCount": currCount, - "delta": currCount - prevCount, + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, "driftThreshold": threshold, "driftField": driftField, "sensorKey": sensorKey, diff --git a/internal/lambda/e2e_test.go b/internal/lambda/e2e_test.go index 7f66514..b32008d 100644 --- a/internal/lambda/e2e_test.go +++ b/internal/lambda/e2e_test.go @@ -276,7 +276,7 @@ func runSFN(t *testing.T, ctx context.Context, d *lambda.Deps, mock *mockDDB, eb // Simulate stream event for each sensor update. 
sensorRecord := makeSensorRecord(pid, key, toStreamAttributes(data)) streamEvt := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{sensorRecord}} - _ = lambda.HandleStreamEvent(ctx, d, streamEvt) + _, _ = lambda.HandleStreamEvent(ctx, d, streamEvt) } } } @@ -1021,7 +1021,7 @@ func TestE2E_AutoRetries(t *testing.T) { require.NotEmpty(t, jobSK, "should have a joblog entry") sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d1", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d1", jobSK, "fail")) require.NoError(t, err) // Verify: new SFN execution started (auto-retry under maxRetries limit) @@ -1055,7 +1055,7 @@ func TestE2E_AutoRetries(t *testing.T) { eb.events = nil eb.mu.Unlock() - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d2", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d2", jobSK, "fail")) require.NoError(t, err) // Verify: no new SFN, RETRY_EXHAUSTED published, status=FAILED_FINAL @@ -1104,7 +1104,7 @@ func TestE2E_FailureClassification(t *testing.T) { eb.events = nil eb.mu.Unlock() - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc1", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc1", jobSK, "fail")) require.NoError(t, err) // Verify: no retry (MaxCodeRetries=0), RETRY_EXHAUSTED event @@ -1140,7 +1140,7 @@ func TestE2E_FailureClassification(t *testing.T) { require.NotEmpty(t, jobSK) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc2", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc2", jobSK, "fail")) require.NoError(t, err) sfnM.mu.Lock() @@ -1184,7 +1184,7 @@ func TestE2E_RerunReplay(t *testing.T) { // Process RERUN_REQUEST stream event sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, 
makeRerunRequestStreamEvent("pipe-e1")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e1")) require.NoError(t, err) // Verify: new SFN started, rerun-accepted joblog written @@ -1221,7 +1221,7 @@ func TestE2E_RerunReplay(t *testing.T) { })) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e2")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e2")) require.NoError(t, err) // Verify: no SFN, RERUN_REJECTED published @@ -1257,7 +1257,7 @@ func TestE2E_RerunReplay(t *testing.T) { "status": events.NewStringAttribute("ready"), "date": events.NewStringAttribute("2026-03-07"), }) - err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + _, err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) require.NoError(t, err) // Lock already held → late data path @@ -1306,7 +1306,7 @@ func TestE2E_DriftRetrigger(t *testing.T) { // Phase 2: Stream-router processes RERUN_REQUEST sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f1")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f1")) require.NoError(t, err) // Verify: new SFN started for re-trigger @@ -1343,7 +1343,7 @@ func TestE2E_DriftRetrigger(t *testing.T) { assert.Contains(t, r.events, "POST_RUN_DRIFT") sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f2")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f2")) require.NoError(t, err) sfnM.mu.Lock() @@ -1380,7 +1380,7 @@ func TestE2E_DriftRetrigger(t *testing.T) { // Phase 2: verify the RERUN_REQUEST was written, allowing re-trigger sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f3")) + _, 
err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f3")) require.NoError(t, err) sfnM.mu.Lock() @@ -1432,7 +1432,7 @@ func TestE2E_RerunLimits(t *testing.T) { // Send a data-drift RERUN_REQUEST — should be rejected sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl1", "data-drift")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl1", "data-drift")) require.NoError(t, err) // Verify: no SFN started, RERUN_REJECTED event + joblog entry @@ -1480,7 +1480,7 @@ func TestE2E_RerunLimits(t *testing.T) { // Send a late-data RERUN_REQUEST — should be rejected because // late-data shares the drift budget (count 1 >= budget 1) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl2", "late-data")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl2", "late-data")) require.NoError(t, err) // Verify: no SFN started, RERUN_REJECTED event + joblog entry @@ -1791,7 +1791,7 @@ func TestE2E_StreamRouterEntryPoints(t *testing.T) { }) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + _, err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) require.NoError(t, err) // Verify: SFN started, trigger lock acquired, JOB_TRIGGERED event published. @@ -1824,7 +1824,7 @@ func TestE2E_StreamRouterEntryPoints(t *testing.T) { }) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-i2", jobSK, types.JobEventTimeout)) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-i2", jobSK, types.JobEventTimeout)) require.NoError(t, err) // Verify: auto-retry started (timeout is retryable just like fail). 
@@ -1861,7 +1861,7 @@ func TestE2E_StreamRouterEntryPoints(t *testing.T) { })) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-i3")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-i3")) require.NoError(t, err) // Verify: rerun accepted despite old sensor data (failure skips freshness check). @@ -2067,32 +2067,37 @@ func TestE2E_RerunBudgetSeparation(t *testing.T) { // Phase 1: First drift rerun — accepted (0 < budget 1). sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift"))) + _, handleErr := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "first drift rerun should start SFN") // Phase 2: Second drift rerun — rejected (1 >= budget 1). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "second drift rerun should NOT start SFN") assert.Contains(t, collectEventTypes(eb), "RERUN_REJECTED") // Phase 3: First manual rerun — accepted despite drift budget exhausted. resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "first manual rerun should succeed") // Phase 4: Second manual rerun — accepted (1 < budget 2). 
resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "second manual rerun should succeed") // Phase 5: Third manual rerun — rejected (2 >= budget 2). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "third manual rerun should NOT start SFN") assert.Contains(t, collectEventTypes(eb), "RERUN_REJECTED") assertAlertFormats(t, eb) @@ -2119,16 +2124,17 @@ func TestE2E_PostRunInflight(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "pipe-inf1", "2026-03-07", types.TriggerStatusRunning) - // Baseline from a previous run. + // Baseline from a previous run (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-inf1", "postrun-baseline#2026-03-07", - map[string]interface{}{"sensor_count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"sensor_count": float64(100)}})) // Sensor arrives with different count while job is running. 
record := makeSensorRecord("pipe-inf1", "audit-result", toStreamAttributes(map[string]interface{}{ "sensor_count": float64(200), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Contains(t, collectEventTypes(eb), "POST_RUN_DRIFT_INFLIGHT") assert.False(t, hasRerunRequest(mock, "pipe-inf1"), "should NOT write rerun request while running") @@ -2150,16 +2156,17 @@ func TestE2E_PostRunInflight(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "pipe-inf-cf", "2026-03-07", types.TriggerStatusRunning) - // Baseline uses custom field "count". + // Baseline uses custom field "count" (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-inf-cf", "postrun-baseline#2026-03-07", - map[string]interface{}{"count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"count": float64(100)}})) // Sensor arrives with different count while job is running. record := makeSensorRecord("pipe-inf-cf", "audit-result", toStreamAttributes(map[string]interface{}{ "count": float64(200), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Contains(t, collectEventTypes(eb), "POST_RUN_DRIFT_INFLIGHT") assert.False(t, hasRerunRequest(mock, "pipe-inf-cf"), "should NOT write rerun request while running") @@ -2180,15 +2187,16 @@ func TestE2E_PostRunInflight(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "pipe-inf2", "2026-03-07", types.TriggerStatusRunning) - // Baseline matches incoming sensor — no drift. 
+ // Baseline matches incoming sensor — no drift (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-inf2", "postrun-baseline#2026-03-07", - map[string]interface{}{"sensor_count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"sensor_count": float64(100)}})) record := makeSensorRecord("pipe-inf2", "audit-result", toStreamAttributes(map[string]interface{}{ "sensor_count": float64(100), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Empty(t, collectEventTypes(eb)) assert.Equal(t, 0, countSFNExecutions(sfnM)) @@ -2217,7 +2225,8 @@ func TestE2E_CalendarExclusionFullSkip(t *testing.T) { record := makeSensorRecord("pipe-cal1", "upstream-complete", map[string]events.DynamoDBAttributeValue{"status": events.NewStringAttribute("ready")}) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Equal(t, 0, countSFNExecutions(sfnM)) assertNoTriggerLock(t, mock, "pipe-cal1", "stream", today) @@ -2266,7 +2275,8 @@ func TestE2E_HourBoundaryRollover(t *testing.T) { "date": "20260307", "hour": "23", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record23}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record23}}) + require.NoError(t, handleErr) // Hour 00 (next day) sensor arrives. 
record00 := makeSensorRecord("pipe-hr1", "hourly-status#20260308T00", toStreamAttributes(map[string]interface{}{ @@ -2274,7 +2284,8 @@ func TestE2E_HourBoundaryRollover(t *testing.T) { "date": "20260308", "hour": "00", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record00}})) + _, handleErr = lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record00}}) + require.NoError(t, handleErr) // Two independent SFN executions. sfnM.mu.Lock() @@ -2317,28 +2328,31 @@ func TestE2E_ConcurrentDriftDedup(t *testing.T) { seedConfig(mock, cfg) seedCompletedPipelineE2E(t, ctx, d, mock, "pipe-cd1", "2026-03-07") - // Baseline captured at completion. + // Baseline captured at completion (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-cd1", "postrun-baseline#2026-03-07", - map[string]interface{}{"sensor_count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"sensor_count": float64(100)}})) // First drift sensor arrives. record1 := makeSensorRecord("pipe-cd1", "audit-result", toStreamAttributes(map[string]interface{}{ "sensor_count": float64(200), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record1}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record1}}) + require.NoError(t, handleErr) assert.Contains(t, collectEventTypes(eb), "POST_RUN_DRIFT") // Process first rerun request — accepted. 
resetEventBridge(eb) sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "first drift rerun accepted") // Process second rerun request — rejected (budget exhausted). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "second drift rerun rejected") assert.Contains(t, collectEventTypes(eb), "RERUN_REJECTED") assertAlertFormats(t, eb) @@ -2370,7 +2384,8 @@ func TestE2E_PostRunBeforeBaseline(t *testing.T) { "sensor_count": float64(500), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Empty(t, collectEventTypes(eb), "should not publish any event when baseline is missing") assert.False(t, hasRerunRequest(mock, "pipe-nb1")) @@ -2412,7 +2427,8 @@ func TestE2E_RerunAfterTriggerTTLExpiry(t *testing.T) { })) sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl1", "manual"))) + _, handleErr := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl1", "manual")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "no SFN when trigger lock row was deleted by TTL") // Should have published an INFRA_FAILURE 
event. @@ -2457,7 +2473,8 @@ func TestE2E_RerunAfterTriggerTTLExpiry(t *testing.T) { }) sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl2", "manual"))) + _, handleErr := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl2", "manual")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "rerun should start SFN when trigger lock exists") assertAlertFormats(t, eb) }) diff --git a/internal/lambda/orchestrator.go b/internal/lambda/orchestrator.go index 5ba53ab..3035d90 100644 --- a/internal/lambda/orchestrator.go +++ b/internal/lambda/orchestrator.go @@ -263,15 +263,19 @@ func RemapPerPeriodSensors(sensors map[string]map[string]interface{}, date strin if compact != date { suffixes = append(suffixes, "#"+compact) } + additions := make(map[string]map[string]interface{}) for key, data := range sensors { for _, suffix := range suffixes { if strings.HasSuffix(key, suffix) { base := strings.TrimSuffix(key, suffix) - sensors[base] = data + additions[base] = data break } } } + for k, v := range additions { + sensors[k] = v + } } // handleTriggerExhausted publishes RETRY_EXHAUSTED when trigger retries are @@ -357,13 +361,12 @@ func capturePostRunBaseline(ctx context.Context, d *Deps, pipelineID, scheduleID RemapPerPeriodSensors(sensors, date) - // Build baseline from post-run rule keys. + // Build baseline from post-run rule keys, namespaced by rule key + // to prevent field name collisions between different sensors. 
baseline := make(map[string]interface{}) for _, rule := range cfg.PostRun.Rules { if data, ok := sensors[rule.Key]; ok { - for k, v := range data { - baseline[k] = v - } + baseline[rule.Key] = data } } @@ -498,7 +501,7 @@ func InjectDateArgs(tc *types.TriggerConfig, date string) { if hourPart != "" { payload["par_hour"] = hourPart } - b, _ := json.Marshal(payload) + b, _ := json.Marshal(payload) // json.Marshal is infallible for map[string]string (no channels, funcs, or complex types) tc.HTTP.Body = string(b) } } diff --git a/internal/lambda/orchestrator_unit_test.go b/internal/lambda/orchestrator_unit_test.go index d6ce194..cd5d343 100644 --- a/internal/lambda/orchestrator_unit_test.go +++ b/internal/lambda/orchestrator_unit_test.go @@ -93,6 +93,49 @@ func TestInjectDateArgs(t *testing.T) { }) } +// --------------------------------------------------------------------------- +// BUG-2 characterization: RemapPerPeriodSensors map mutation during range +// --------------------------------------------------------------------------- + +func TestRemapPerPeriodSensors_MultipleSuffixes_MapMutation(t *testing.T) { + // BUG-2 characterization: adding keys during range iteration. + // With Go's map iteration, newly inserted keys may or may not be visited. + // This test verifies the staged-merge fix adds every base key deterministically. 
+ sensors := map[string]map[string]interface{}{ + "hourly-status#2026-03-13": {"count": float64(10)}, + "daily-check#2026-03-13": {"count": float64(20)}, + "weekly-scan#20260313": {"count": float64(30)}, + } + lambda.RemapPerPeriodSensors(sensors, "2026-03-13") + // All base keys should be present + assert.NotNil(t, sensors["hourly-status"], "hourly-status base key should exist") + assert.NotNil(t, sensors["daily-check"], "daily-check base key should exist") + assert.NotNil(t, sensors["weekly-scan"], "weekly-scan base key should exist") +} + +func TestRemapPerPeriodSensors_StagedMerge_NoCrossContamination(t *testing.T) { + // Verify staged merge doesn't allow newly-added base keys to match + // as suffixed keys in the same iteration. + sensors := map[string]map[string]interface{}{ + "hourly-status#2026-03-13": {"count": float64(10)}, + } + lambda.RemapPerPeriodSensors(sensors, "2026-03-13") + assert.NotNil(t, sensors["hourly-status"]) + assert.Equal(t, float64(10), sensors["hourly-status"]["count"]) + // Original suffixed key should still exist + assert.NotNil(t, sensors["hourly-status#2026-03-13"]) +} + +// --------------------------------------------------------------------------- +// BUG-1 characterization: ExtractFloat zero value vs missing field +// --------------------------------------------------------------------------- + +func TestExtractFloat_ZeroValueIndistinguishableFromMissing(t *testing.T) { + // BUG-1 characterization: ExtractFloat returns 0 for both zero and missing. 
+ assert.Equal(t, float64(0), lambda.ExtractFloat(map[string]interface{}{"count": float64(0)}, "count")) + assert.Equal(t, float64(0), lambda.ExtractFloat(map[string]interface{}{}, "count")) +} + // --------------------------------------------------------------------------- // RemapPerPeriodSensors — table-driven // --------------------------------------------------------------------------- diff --git a/internal/lambda/postrun.go b/internal/lambda/postrun.go index 7b8beae..727badd 100644 --- a/internal/lambda/postrun.go +++ b/internal/lambda/postrun.go @@ -3,7 +3,6 @@ package lambda import ( "context" "fmt" - "math" "strings" "github.com/dwsmith1983/interlock/internal/validation" @@ -61,7 +60,7 @@ func handlePostRunSensorEvent(ctx context.Context, d *Deps, cfg *types.PipelineC case types.TriggerStatusCompleted: // Job completed — full post-run evaluation with baseline comparison. - return handlePostRunCompleted(ctx, d, cfg, pipelineID, scheduleID, date, sensorData) + return handlePostRunCompleted(ctx, d, cfg, pipelineID, scheduleID, date, sensorKey, sensorData) default: // FAILED_FINAL or unknown — skip. @@ -82,20 +81,33 @@ func handlePostRunInflight(ctx context.Context, d *Deps, cfg *types.PipelineConf return nil // No baseline yet — job hasn't completed once. } + // Find matching post-run rule for this sensor key. + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break + } + } + if ruleBaseline == nil { + return nil // No baseline for this rule (stale or first run). 
+ } + driftField := resolveDriftField(cfg.PostRun) - prevCount := ExtractFloat(baseline, driftField) - currCount := ExtractFloat(sensorData, driftField) threshold := 0.0 if cfg.PostRun.DriftThreshold != nil { threshold = *cfg.PostRun.DriftThreshold } - if prevCount > 0 && currCount > 0 && math.Abs(currCount-prevCount) > threshold { + dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { if err := publishEvent(ctx, d, string(types.EventPostRunDriftInflight), pipelineID, scheduleID, date, - fmt.Sprintf("inflight drift detected for %s: %.0f → %.0f (informational)", pipelineID, prevCount, currCount), + fmt.Sprintf("inflight drift detected for %s: %.0f → %.0f (informational)", pipelineID, dr.Previous, dr.Current), map[string]interface{}{ - "previousCount": prevCount, - "currentCount": currCount, - "delta": currCount - prevCount, + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, "driftThreshold": threshold, "driftField": driftField, "sensorKey": sensorKey, @@ -110,7 +122,7 @@ func handlePostRunInflight(ctx context.Context, d *Deps, cfg *types.PipelineConf // handlePostRunCompleted evaluates post-run rules after the job has completed. // Compares sensor values against the date-scoped baseline and triggers a rerun // if drift is detected. -func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date string, sensorData map[string]interface{}) error { +func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date, sensorKey string, sensorData map[string]interface{}) error { // Read baseline captured at trigger completion. baselineKey := "postrun-baseline#" + date baseline, err := d.Store.GetSensorData(ctx, pipelineID, baselineKey) @@ -120,44 +132,56 @@ func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineCon // Check for data drift if baseline exists. 
if baseline != nil { - driftField := resolveDriftField(cfg.PostRun) - prevCount := ExtractFloat(baseline, driftField) - currCount := ExtractFloat(sensorData, driftField) - threshold := 0.0 - if cfg.PostRun.DriftThreshold != nil { - threshold = *cfg.PostRun.DriftThreshold - } - if prevCount > 0 && currCount > 0 && math.Abs(currCount-prevCount) > threshold { - delta := currCount - prevCount - if err := publishEvent(ctx, d, string(types.EventPostRunDrift), pipelineID, scheduleID, date, - fmt.Sprintf("post-run drift detected for %s: %.0f → %.0f records", pipelineID, prevCount, currCount), - map[string]interface{}{ - "previousCount": prevCount, - "currentCount": currCount, - "delta": delta, - "driftThreshold": threshold, - "driftField": driftField, - "source": "post-run-stream", - }); err != nil { - d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunDrift, "error", err) + // Find matching post-run rule for this sensor key. + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break } + } - // Trigger rerun via the existing circuit breaker path only if the - // execution date is not excluded by the pipeline's calendar config. 
- if isExcludedDate(cfg, date) { - if pubErr := publishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleID, date, - fmt.Sprintf("post-run drift rerun skipped for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { - d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) + if ruleBaseline != nil { + driftField := resolveDriftField(cfg.PostRun) + threshold := 0.0 + if cfg.PostRun.DriftThreshold != nil { + threshold = *cfg.PostRun.DriftThreshold + } + dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { + if err := publishEvent(ctx, d, string(types.EventPostRunDrift), pipelineID, scheduleID, date, + fmt.Sprintf("post-run drift detected for %s: %.0f → %.0f records", pipelineID, dr.Previous, dr.Current), + map[string]interface{}{ + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, + "driftThreshold": threshold, + "driftField": driftField, + "sensorKey": sensorKey, + "source": "post-run-stream", + }); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunDrift, "error", err) } - d.Logger.InfoContext(ctx, "post-run drift rerun skipped: execution date excluded by calendar", - "pipelineId", pipelineID, "date", date) - } else { - if writeErr := d.Store.WriteRerunRequest(ctx, pipelineID, scheduleID, date, "data-drift"); writeErr != nil { - d.Logger.WarnContext(ctx, "failed to write rerun request on post-run drift", - "pipelineId", pipelineID, "error", writeErr) + + // Trigger rerun via the existing circuit breaker path only if the + // execution date is not excluded by the pipeline's calendar config. 
+ if isExcludedDate(cfg, date) { + if pubErr := publishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleID, date, + fmt.Sprintf("post-run drift rerun skipped for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) + } + d.Logger.InfoContext(ctx, "post-run drift rerun skipped: execution date excluded by calendar", + "pipelineId", pipelineID, "date", date) + } else { + if writeErr := d.Store.WriteRerunRequest(ctx, pipelineID, scheduleID, date, "data-drift"); writeErr != nil { + d.Logger.WarnContext(ctx, "failed to write rerun request on post-run drift", + "pipelineId", pipelineID, "error", writeErr) + } } + return nil } - return nil } } diff --git a/internal/lambda/rerun.go b/internal/lambda/rerun.go index 704b39f..e44635c 100644 --- a/internal/lambda/rerun.go +++ b/internal/lambda/rerun.go @@ -132,34 +132,38 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even return nil } - // --- Acceptance: write rerun record FIRST (before lock reset) --- - if _, err := d.Store.WriteRerun(ctx, pipelineID, schedule, date, reason, ""); err != nil { - return fmt.Errorf("write rerun for %q: %w", pipelineID, err) - } - - // Delete date-scoped postrun-baseline so re-run captures fresh baseline. - if cfg.PostRun != nil { - if err := d.Store.DeleteSensor(ctx, pipelineID, "postrun-baseline#"+date); err != nil { - d.Logger.Warn("failed to delete postrun-baseline sensor", "error", err, "pipeline", pipelineID, "date", date) - } - } - - // Atomically reset the trigger lock for the new execution. 
+ // --- Acceptance: acquire lock FIRST (before writing rerun) --- acquired, err := d.Store.ResetTriggerLock(ctx, pipelineID, schedule, date, ResolveTriggerLockTTL()) if err != nil { return fmt.Errorf("reset trigger lock for %q: %w", pipelineID, err) } if !acquired { if pubErr := publishEvent(ctx, d, string(types.EventInfraFailure), pipelineID, schedule, date, - fmt.Sprintf("lock reset failed for rerun of %s, orphaned rerun record", pipelineID)); pubErr != nil { + fmt.Sprintf("lock reset failed for rerun of %s", pipelineID)); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish event", "error", pubErr) } - d.Logger.Warn("failed to reset trigger lock, orphaned rerun record", + d.Logger.Warn("failed to reset trigger lock for rerun", "pipelineId", pipelineID, "schedule", schedule, "date", date) return nil } - // Publish acceptance event only after lock atomicity is confirmed. + // Delete date-scoped postrun-baseline so re-run captures fresh baseline. + if cfg.PostRun != nil { + if err := d.Store.DeleteSensor(ctx, pipelineID, "postrun-baseline#"+date); err != nil { + d.Logger.Warn("failed to delete postrun-baseline sensor", "error", err, "pipeline", pipelineID, "date", date) + } + } + + // Write rerun record AFTER lock is confirmed. + if _, err := d.Store.WriteRerun(ctx, pipelineID, schedule, date, reason, ""); err != nil { + // Lock acquired but write failed — release lock to avoid deadlock. + if relErr := d.Store.ReleaseTriggerLock(ctx, pipelineID, schedule, date); relErr != nil { + d.Logger.Warn("failed to release lock after rerun write failure", "error", relErr) + } + return fmt.Errorf("write rerun for %q: %w", pipelineID, err) + } + + // Publish acceptance event only after lock and rerun record confirmed. 
if err := d.Store.WriteJobEvent(ctx, pipelineID, schedule, date, types.JobEventRerunAccepted, "", 0, ""); err != nil { d.Logger.Warn("failed to write rerun-accepted joblog", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) @@ -350,6 +354,13 @@ func checkSensorFreshness(ctx context.Context, d *Deps, pipelineID, jobSK string continue } + // Normalize epoch: if ts looks like seconds (< 1e12), convert to millis. + // Epoch millis have exceeded 1e12 since ~2001, while epoch seconds won't + // exceed 1e12 until ~33658 CE. + if ts > 0 && ts < 1e12 { + ts *= 1000 + } + if ts > jobTimestamp { return true, nil // Data changed after job — allow rerun. } diff --git a/internal/lambda/stream_router.go b/internal/lambda/stream_router.go index 66bedb5..0344928 100644 --- a/internal/lambda/stream_router.go +++ b/internal/lambda/stream_router.go @@ -50,18 +50,23 @@ func getValidatedConfig(ctx context.Context, d *Deps, pipelineID string) (*types } // HandleStreamEvent processes a DynamoDB stream event, routing each record -// to the appropriate handler based on the SK prefix. Errors are logged but -// do not fail the batch (returns nil) to prevent infinite retries. -func HandleStreamEvent(ctx context.Context, d *Deps, event StreamEvent) error { +// to the appropriate handler based on the SK prefix. Per-record errors are +// collected as BatchItemFailures so the Lambda runtime can use DynamoDB's +// ReportBatchItemFailures to retry only the failed records. 
+func HandleStreamEvent(ctx context.Context, d *Deps, event StreamEvent) (events.DynamoDBEventResponse, error) { + var resp events.DynamoDBEventResponse for i := range event.Records { if err := handleRecord(ctx, d, event.Records[i]); err != nil { d.Logger.Error("stream record error", "error", err, "eventID", event.Records[i].EventID, ) + resp.BatchItemFailures = append(resp.BatchItemFailures, events.DynamoDBBatchItemFailure{ + ItemIdentifier: event.Records[i].EventID, + }) } } - return nil + return resp, nil } // handleRecord extracts PK/SK and routes to the appropriate handler. diff --git a/internal/lambda/stream_router_test.go b/internal/lambda/stream_router_test.go index 09ab196..f3f7e7d 100644 --- a/internal/lambda/stream_router_test.go +++ b/internal/lambda/stream_router_test.go @@ -141,7 +141,7 @@ func TestStreamRouter_SensorMatch_StartsSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -182,7 +182,7 @@ func TestStreamRouter_SensorPrefixMatch_PerPeriodKey(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -207,7 +207,7 @@ func TestStreamRouter_SensorNoMatch_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -231,7 +231,7 @@ func TestStreamRouter_SensorMatch_LockHeld_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := 
lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -256,7 +256,7 @@ func TestStreamRouter_CalendarExcluded_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -274,7 +274,7 @@ func TestStreamRouter_NoPipelineConfig_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -301,7 +301,7 @@ func TestStreamRouter_ConfigChange_InvalidatesCache(t *testing.T) { sensorRecord := makeSensorRecord("gold-revenue", "upstream-complete", map[string]events.DynamoDBAttributeValue{ "status": events.NewStringAttribute("ready"), }) - err := lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ + _, err := lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ Records: []events.DynamoDBEventRecord{sensorRecord}, }) require.NoError(t, err) @@ -316,13 +316,13 @@ func TestStreamRouter_ConfigChange_InvalidatesCache(t *testing.T) { // Send a CONFIG change event to invalidate the cache. configRecord := makeConfigRecord("gold-revenue") - err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ + _, err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ Records: []events.DynamoDBEventRecord{configRecord}, }) require.NoError(t, err) // Now send the sensor event again — should trigger SFN with the updated config. 
- err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ + _, err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ Records: []events.DynamoDBEventRecord{sensorRecord}, }) require.NoError(t, err) @@ -401,7 +401,7 @@ func TestStreamRouter_JobFail_UnderRetryLimit_Reruns(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have started a new SFN execution for the rerun. @@ -432,7 +432,7 @@ func TestStreamRouter_JobFail_OverRetryLimit_Alerts(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution should be started. @@ -457,7 +457,7 @@ func TestStreamRouter_JobSuccess_PublishesEvent(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution for success. 
@@ -484,7 +484,7 @@ func TestStreamRouter_JobTimeout_TreatedAsFailure(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventTimeout) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -518,7 +518,7 @@ func TestStreamRouter_JobFail_DriftRerunsIgnored(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -534,7 +534,7 @@ func TestStreamRouter_JobFail_NoConfig_Skips(t *testing.T) { record := makeJobRecord("unknown-pipeline", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -559,7 +559,7 @@ func TestStreamRouter_TriggerValueMismatch_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -586,7 +586,7 @@ func TestStreamRouter_SensorMatch_RecordsFirstSensorArrival(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Verify SFN was started (lock acquired). 
@@ -628,7 +628,7 @@ func TestStreamRouter_SensorMatch_FirstArrivalIdempotent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Verify original arrival time is preserved (not overwritten). @@ -673,7 +673,7 @@ func TestStreamRouter_LateDataArrival_CompletedSuccess(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution (lock held). @@ -717,7 +717,7 @@ func TestStreamRouter_LateDataArrival_WritesRerunRequest(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have published LATE_DATA_ARRIVAL event (existing behavior). @@ -756,7 +756,7 @@ func TestStreamRouter_LateDataArrival_StillRunning_Silent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -797,7 +797,7 @@ func TestStreamRouter_LateDataArrival_CompletedFailed_Silent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No late data event — pipeline didn't succeed. 
@@ -902,6 +902,27 @@ func seedJobEvent(mock *mockDDB, timestamp, event string) { } // seedSensor inserts a sensor record with a data map into the mock control table. +// toAttributeValue converts a Go value to a DynamoDB attribute value, supporting +// nested maps for namespaced baseline format. +func toAttributeValue(v interface{}) ddbtypes.AttributeValue { + switch val := v.(type) { + case string: + return &ddbtypes.AttributeValueMemberS{Value: val} + case float64: + return &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%g", val)} + case int64: + return &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%d", val)} + case map[string]interface{}: + nested := make(map[string]ddbtypes.AttributeValue, len(val)) + for nk, nv := range val { + nested[nk] = toAttributeValue(nv) + } + return &ddbtypes.AttributeValueMemberM{Value: nested} + default: + return &ddbtypes.AttributeValueMemberS{Value: fmt.Sprintf("%v", val)} + } +} + func seedSensor(mock *mockDDB, pipelineID, sensorKey string, data map[string]interface{}) { item := map[string]ddbtypes.AttributeValue{ "PK": &ddbtypes.AttributeValueMemberS{Value: types.PipelinePK(pipelineID)}, @@ -910,14 +931,7 @@ func seedSensor(mock *mockDDB, pipelineID, sensorKey string, data map[string]int if data != nil { dataAV := make(map[string]ddbtypes.AttributeValue, len(data)) for k, v := range data { - switch val := v.(type) { - case string: - dataAV[k] = &ddbtypes.AttributeValueMemberS{Value: val} - case float64: - dataAV[k] = &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%g", val)} - case int64: - dataAV[k] = &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%d", val)} - } + dataAV[k] = toAttributeValue(v) } item["data"] = &ddbtypes.AttributeValueMemberM{Value: dataAV} } @@ -938,7 +952,7 @@ func TestStreamRouter_RerunRequest_FailedJob_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := 
lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have started a new SFN execution. @@ -968,7 +982,7 @@ func TestStreamRouter_RerunRequest_SuccessDataChanged_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Data changed — SFN should start. @@ -984,20 +998,22 @@ func TestStreamRouter_RerunRequest_SuccessDataUnchanged_Rejected(t *testing.T) { cfg := testJobConfig() seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") - // Seed a successful job event with timestamp 2000000. - seedJobEvent(mock, "2000000", types.JobEventSuccess) + // Use millis-range timestamps so epoch normalization (ts < 1e12 → ts*1000) + // does not distort the comparison. + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) - // Seed a sensor with updatedAt BEFORE the job timestamp. + // Seed a sensor with updatedAt BEFORE the job timestamp (both in millis). seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ - "updatedAt": float64(1000000), // older than job timestamp + "updatedAt": float64(1000000000000), // older than job timestamp "status": "ready", }) record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution — data unchanged. 
@@ -1026,7 +1042,7 @@ func TestStreamRouter_RerunRequest_InfraExhausted_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have started a new SFN execution. @@ -1036,6 +1052,37 @@ func TestStreamRouter_RerunRequest_InfraExhausted_Allowed(t *testing.T) { assert.Contains(t, *sfnMock.executions[0].Name, "manual-rerun") } +func TestStreamRouter_RerunRequest_SensorEpochSeconds_Normalized(t *testing.T) { + mock := newMockDDB() + d, sfnMock, _ := testDeps(mock) + + cfg := testJobConfig() + seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") + + // Seed a successful job with timestamp in millis: 2000000000000 (year ~2033). + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) + + // Seed sensor with updatedAt in SECONDS: 2000000001 (1 second after job). + // Without normalization, 2000000001 < 2000000000000 → rejected. + // With normalization, 2000000001000 > 2000000000000 → allowed. 
+ seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ + "updatedAt": float64(2000000001), // seconds epoch + "status": "ready", + }) + + record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + _ = resp + + sfnMock.mu.Lock() + defer sfnMock.mu.Unlock() + require.Len(t, sfnMock.executions, 1, "sensor with epoch-seconds updatedAt should be normalized and allow rerun") +} + // --------------------------------------------------------------------------- // handleRecord routing: unknown SK prefix // --------------------------------------------------------------------------- @@ -1056,7 +1103,7 @@ func TestStreamRouter_UnknownSKPrefix_Silent(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1072,15 +1119,18 @@ func TestStreamRouter_MissingPKOrSK_LogsError(t *testing.T) { mock := newMockDDB() d, _, _ := testDeps(mock) - // Record with no keys at all — should log error but HandleStreamEvent returns nil. + // Record with no keys at all — handleRecord returns error, collected as batch failure. 
record := events.DynamoDBEventRecord{ + EventID: "missing-keys-1", EventName: "INSERT", Change: events.DynamoDBStreamRecord{}, } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent always returns nil; errors are logged") + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err, "HandleStreamEvent never returns a top-level error") + require.Len(t, resp.BatchItemFailures, 1) + assert.Equal(t, "missing-keys-1", resp.BatchItemFailures[0].ItemIdentifier) } // --------------------------------------------------------------------------- @@ -1105,7 +1155,7 @@ func TestLateData_TriggerNil(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No late data event — GetTrigger returned nil (trigger row doesn't match COMPLETED). @@ -1139,7 +1189,7 @@ func TestSensor_NoTriggerCondition(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1160,7 +1210,7 @@ func TestSensor_SensorKeyMismatch(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1183,9 +1233,10 @@ func TestSensor_StartSFNError(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // HandleStreamEvent logs errors but always returns nil. 
- err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent swallows errors") + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1, "SFN error should produce a batch item failure") // SFN was called but failed. sfnMock.mu.Lock() @@ -1209,8 +1260,9 @@ func TestSensor_StartSFNError_ReleasesLock(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent swallows errors") + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1, "SFN error should produce a batch item failure") // The trigger lock must have been released after SFN failure. // Schedule ID for stream-triggered pipelines is "stream". 
@@ -1237,7 +1289,7 @@ func TestSensor_PerHour_DateOnly(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1262,7 +1314,7 @@ func TestSensor_PerHour_NoDate(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1290,7 +1342,7 @@ func TestRerun_NoJobRecord_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1307,7 +1359,7 @@ func TestRerun_NoConfig_Skips(t *testing.T) { record := makeDefaultRerunRequestRecord() // uses "gold-revenue" event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1337,9 +1389,10 @@ func TestRerun_ParseSKError(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // Error is logged, HandleStreamEvent returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. 
+ resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } func TestRerun_TimeoutJob_Allowed(t *testing.T) { @@ -1356,7 +1409,7 @@ func TestRerun_TimeoutJob_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1379,7 +1432,7 @@ func TestRerun_UnknownJobEvent_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1404,7 +1457,7 @@ func TestRerun_StartSFNError(t *testing.T) { event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} // Error is logged, HandleStreamEvent still returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1430,7 +1483,7 @@ func TestSensorFreshness_NoSensors(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No sensors → can't prove unchanged → allow rerun. 
@@ -1458,7 +1511,7 @@ func TestSensorFreshness_NoUpdatedAtField(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No updatedAt → can't prove unchanged → allow rerun. @@ -1486,7 +1539,7 @@ func TestSensorFreshness_FreshSensor_Float(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1500,19 +1553,21 @@ func TestSensorFreshness_StaleSensor_Float(t *testing.T) { cfg := testJobConfig() seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") - // Seed a successful job event with timestamp 2000000. - seedJobEvent(mock, "2000000", types.JobEventSuccess) + // Use millis-range timestamps so epoch normalization (ts < 1e12 → ts*1000) + // does not distort the comparison. + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) - // Seed a sensor with updatedAt as float64 < jobTimestamp. + // Seed a sensor with updatedAt as float64 < jobTimestamp (both in millis). 
seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ - "updatedAt": float64(1000000), + "updatedAt": float64(1000000000000), }) record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1544,7 +1599,7 @@ func TestSensorFreshness_FreshSensor_String(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1576,7 +1631,7 @@ func TestSensorFreshness_InvalidJobSK(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Invalid job SK → can't parse timestamp → allow to be safe. @@ -1608,7 +1663,7 @@ func TestSensorFreshness_InvalidTimestamp(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Non-numeric timestamp → allow to be safe. 
@@ -1633,7 +1688,7 @@ func TestJobLog_InfraExhaustedEvent(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventInfraTriggerExhausted) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1656,7 +1711,7 @@ func TestJobLog_OtherEvent(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventRerunAccepted) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1691,9 +1746,10 @@ func TestJobLog_ParseSKError(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // Error is logged, HandleStreamEvent returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } func TestJobLog_MissingEventAttribute(t *testing.T) { @@ -1722,7 +1778,7 @@ func TestJobLog_MissingEventAttribute(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Missing event attribute → logged as warning, no action. 
@@ -1749,7 +1805,7 @@ func TestJobSuccess_PublishesJobCompleted(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) ebMock.mu.Lock() @@ -1777,7 +1833,7 @@ func TestJobFailure_NoConfig(t *testing.T) { record := makeJobRecord("unknown-pipeline", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1811,7 +1867,7 @@ func TestBuildSFNConfig_NoPostRunFields(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1847,7 +1903,7 @@ func TestBuildSFNConfig_CustomTimings(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1879,7 +1935,7 @@ func TestBuildSFNConfig_WithSLA(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1910,7 +1966,7 @@ func TestBuildSFNConfig_JobPollWindowDefault(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, 
err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1937,7 +1993,7 @@ func TestBuildSFNConfig_JobPollWindowOverride(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1964,7 +2020,7 @@ func TestBuildSFNConfig_JobPollWindowZeroUsesDefault(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2008,7 +2064,7 @@ func TestExtractSensorData_DataMapUnwrap(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Trigger should fire because "data" map was unwrapped, exposing "status" = "ready". @@ -2030,7 +2086,7 @@ func TestExtractSensorData_NoDataMap(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2064,7 +2120,7 @@ func TestExtractSensorData_SkipsPKSKTTL(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // PK, SK, ttl should be stripped; "status" remains for trigger evaluation. 
@@ -2089,7 +2145,7 @@ func TestConvertAV_String(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2125,7 +2181,7 @@ func TestConvertAV_Number(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2161,7 +2217,7 @@ func TestConvertAV_Bool(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2198,7 +2254,7 @@ func TestConvertAV_Map(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // The data map gets unwrapped; "status" should be accessible at top level. 
@@ -2236,7 +2292,7 @@ func TestConvertAV_List(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2270,7 +2326,7 @@ func TestConvertAV_Null(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2316,7 +2372,7 @@ func TestResolveScheduleID_StreamTriggered(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2341,7 +2397,7 @@ func TestResolveScheduleID_CronTriggered(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2368,12 +2424,13 @@ func TestPublishEvent_EventBridgeError(t *testing.T) { ebMock.err = fmt.Errorf("EventBridge throttled") // JobSuccess publishes an event — if EventBridge fails, handleJobSuccess returns error, - // but HandleStreamEvent logs it and returns nil. + // collected as a batch item failure. 
record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent swallows errors") + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } func TestPublishEvent_NilEventBridge(t *testing.T) { @@ -2389,7 +2446,7 @@ func TestPublishEvent_NilEventBridge(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2406,7 +2463,7 @@ func TestPublishEvent_EmptyEventBusName(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2429,7 +2486,7 @@ func TestIsExcluded_WeekendExclusion(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2469,7 +2526,7 @@ func TestStreamRouter_MultipleRecords(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record1, record2}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2507,9 +2564,10 @@ func TestJobLog_UnexpectedPKFormat(t *testing.T) { } event := lambda.StreamEvent{Records: 
[]events.DynamoDBEventRecord{record}} - // Error is logged, HandleStreamEvent returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } // --------------------------------------------------------------------------- @@ -2537,7 +2595,7 @@ func TestRerun_UnexpectedPKFormat(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2560,7 +2618,7 @@ func TestSensor_UnexpectedPKFormat(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2623,7 +2681,7 @@ func TestRerun_DriftLimitExceeded(t *testing.T) { record := makeRerunRequestWithReason("data-drift") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN — drift limit exceeded. @@ -2651,7 +2709,7 @@ func TestRerun_ManualLimitExceeded(t *testing.T) { record := makeRerunRequestWithReason("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN — manual limit exceeded. 
@@ -2681,7 +2739,7 @@ func TestRerun_DriftUnderLimit(t *testing.T) { record := makeRerunRequestWithReason("data-drift") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN should have started — under drift limit. @@ -2706,7 +2764,7 @@ func TestRerun_LateDataCountsAsDrift(t *testing.T) { record := makeRerunRequestWithReason("data-drift") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2733,7 +2791,7 @@ func TestRerun_WritesRerunBeforeLockRelease(t *testing.T) { record := makeRerunRequestWithReason("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN should have started. @@ -2785,7 +2843,7 @@ func TestRerun_DeletesPostrunBaseline(t *testing.T) { record := makeRerunRequestWithReason("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN should have started. 
@@ -2820,7 +2878,7 @@ func TestStreamRouter_JobFail_PermanentUsesCodeRetries(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // MaxCodeRetries=0 → immediate FAILED_FINAL, no SFN started @@ -2854,7 +2912,7 @@ func TestStreamRouter_JobFail_TransientUsesMaxRetries(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // TRANSIENT uses MaxRetries=3, no reruns yet → should retry @@ -2883,7 +2941,7 @@ func TestStreamRouter_JobFail_EmptyCategoryUsesMaxRetries(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No category → uses MaxRetries=3, no reruns → should retry @@ -2979,9 +3037,9 @@ func TestPostRunSensor_Completed_DriftDetected(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) - // Seed baseline captured at completion time. + // Seed baseline captured at completion time (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) // Sensor arrives with different count → drift. 
@@ -2992,7 +3050,7 @@ func TestPostRunSensor_Completed_DriftDetected(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should publish POST_RUN_DRIFT event. @@ -3023,9 +3081,9 @@ func TestPostRunSensor_Completed_NoDrift_RulesPass(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) - // Baseline with same count as incoming sensor. + // Baseline with same count as incoming sensor (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(150), + "audit-result": map[string]interface{}{"sensor_count": float64(150)}, }) // Seed the actual sensor so EvaluateRules can find it. seedSensor(mock, "gold-revenue", "audit-result", map[string]interface{}{ @@ -3039,7 +3097,7 @@ func TestPostRunSensor_Completed_NoDrift_RulesPass(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should publish POST_RUN_PASSED event. @@ -3063,9 +3121,9 @@ func TestPostRunSensor_Running_InflightDrift(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusRunning) - // Baseline from a previous run. + // Baseline from a previous run (namespaced by rule key). 
seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) record := makeSensorRecord("gold-revenue", "audit-result", map[string]events.DynamoDBAttributeValue{ @@ -3075,7 +3133,7 @@ func TestPostRunSensor_Running_InflightDrift(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should publish informational POST_RUN_DRIFT_INFLIGHT event (no rerun). @@ -3113,7 +3171,7 @@ func TestPostRunSensor_FailedFinal_Skipped(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No post-run events should be published for FAILED_FINAL trigger. @@ -3142,7 +3200,7 @@ func TestPostRunSensor_NoTrigger_Skipped(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No events published when no trigger exists. @@ -3166,7 +3224,7 @@ func TestPostRunSensor_NoPostRunConfig_GoesToTrigger(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No error, just silently ignored. 
} @@ -3269,7 +3327,7 @@ func TestJobFailure_AtomicLockReset_Success(t *testing.T) { record := makeJobRecordWithScheduleDate(pipeline, types.JobEventFail, schedule, date) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN must have started for the rerun. @@ -3303,7 +3361,7 @@ func TestJobFailure_LockResetFails_NoSFN(t *testing.T) { record := makeJobRecordWithScheduleDate(pipeline, types.JobEventFail, schedule, date) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3326,7 +3384,7 @@ func TestRerunRequest_AtomicLockReset(t *testing.T) { record := makeRerunRequestRecordFull("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN must have started. @@ -3353,7 +3411,7 @@ func TestRerunRequest_LockResetFails_PublishesInfraFailure(t *testing.T) { record := makeRerunRequestRecordFull("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution. @@ -3395,9 +3453,10 @@ func TestJobFailure_SFNStartFails_ReleasesLock(t *testing.T) { record := makeJobRecordWithScheduleDate(pipeline, types.JobEventFail, schedule, date) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // HandleStreamEvent swallows per-record errors — the handler returns nil. 
- err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) // Trigger lock must be released after SFN failure (so next attempt can acquire it). assert.False(t, triggerLockExists(mock), @@ -3420,7 +3479,7 @@ func TestRerunRequest_SFNStartFails_ReleasesLock(t *testing.T) { record := makeRerunRequestRecordFull("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) assert.False(t, triggerLockExists(mock), @@ -3557,7 +3616,7 @@ func TestRerunRequest_CalendarExclusion(t *testing.T) { record := makeDefaultRerunRequestRecord() // schedule=stream, date=2026-03-01 event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3581,7 +3640,7 @@ func TestRerunRequest_CalendarExclusion_WritesJobEvent(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) mock.mu.Lock() @@ -3622,7 +3681,7 @@ func TestRerunRequest_WeekendExclusion(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3646,7 +3705,7 @@ func TestJobFailure_CalendarExclusion(t *testing.T) { record := 
makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3673,7 +3732,7 @@ func TestJobFailure_CalendarExclusion_RetryLimitBeatsExclusion(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3697,7 +3756,7 @@ func TestPostRunDrift_CalendarExclusion(t *testing.T) { seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) record := makeSensorRecord("gold-revenue", "audit-result", map[string]events.DynamoDBAttributeValue{ @@ -3707,7 +3766,7 @@ func TestPostRunDrift_CalendarExclusion(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) ebMock.mu.Lock() @@ -3747,7 +3806,7 @@ func TestPostRunDrift_NotExcluded_WritesRerun(t *testing.T) { seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) record := makeSensorRecord("gold-revenue", "audit-result", map[string]events.DynamoDBAttributeValue{ @@ -3757,7 +3816,7 
@@ func TestPostRunDrift_NotExcluded_WritesRerun(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) rerunKey := ddbItemKey(testControlTable, types.PipelinePK("gold-revenue"), types.RerunRequestSK("stream", "2026-03-01")) @@ -3781,7 +3840,7 @@ func TestSensorEvent_CalendarExclusion_PublishesEvent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3855,7 +3914,7 @@ func TestHandleSensorEvent_DryRun_WouldTrigger(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // NO SFN execution must be started. @@ -3895,7 +3954,7 @@ func TestHandleSensorEvent_DryRun_LateData(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // NO SFN execution. 
@@ -3933,7 +3992,7 @@ func TestHandleSensorEvent_DryRun_SLAProjection_Met(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3988,7 +4047,7 @@ func TestHandleSensorEvent_DryRun_SLAProjection_Breach(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4042,7 +4101,7 @@ func TestHandleSensorEvent_DryRun_ValidationNotReady(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN. 
@@ -4093,7 +4152,7 @@ func TestHandleSensorEvent_DryRun_CapturesBaseline(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4135,7 +4194,7 @@ func TestHandleSensorEvent_DryRun_Completed_NoSLA(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4188,7 +4247,7 @@ func TestHandleSensorEvent_DryRun_Completed_WithSLA(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4243,9 +4302,9 @@ func TestDryRunPostRunSensor_DriftDetected(t *testing.T) { // Pre-seed DRY_RUN# marker (would-trigger already happened). seedDryRunMarker(mock, "gold-revenue", "stream", fixedTestDate, "2026-03-11T01:15:00Z") - // Pre-seed baseline with sensor_count=500. + // Pre-seed baseline with sensor_count=500 (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#"+fixedTestDate, map[string]interface{}{ - "sensor_count": float64(500), + "audit-result": map[string]interface{}{"sensor_count": float64(500)}, }) // Sensor arrives for post-run key with sensor_count=520 (drift detected). 
@@ -4257,7 +4316,7 @@ func TestDryRunPostRunSensor_DriftDetected(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // DRY_RUN_DRIFT event published. @@ -4297,9 +4356,9 @@ func TestDryRunPostRunSensor_NoDrift(t *testing.T) { // Pre-seed DRY_RUN# marker. seedDryRunMarker(mock, "gold-revenue", "stream", fixedTestDate, "2026-03-11T01:15:00Z") - // Baseline with sensor_count=500. + // Baseline with sensor_count=500 (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#"+fixedTestDate, map[string]interface{}{ - "sensor_count": float64(500), + "audit-result": map[string]interface{}{"sensor_count": float64(500)}, }) // Sensor arrives with same sensor_count=500 — no drift. @@ -4311,7 +4370,7 @@ func TestDryRunPostRunSensor_NoDrift(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No events published (no drift). @@ -4345,7 +4404,7 @@ func TestDryRunPostRunSensor_NoMarker(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No events published (no marker means no trigger happened). 
@@ -4371,7 +4430,7 @@ func TestRerun_DryRun_SkipsExecution(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Dry-run pipeline must NOT start an SFN execution. @@ -4398,7 +4457,7 @@ func TestJobFailure_DryRun_SkipsRerun(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Dry-run pipeline must NOT start an SFN execution. @@ -4411,3 +4470,31 @@ func TestJobFailure_DryRun_SkipsRerun(t *testing.T) { require.NoError(t, countErr) assert.Zero(t, count, "dry-run must not write rerun records on job failure") } + +// --------------------------------------------------------------------------- +// BatchItemFailures: partial error reporting +// --------------------------------------------------------------------------- + +func TestStreamRouter_BatchItemFailures_PartialError(t *testing.T) { + mock := newMockDDB() + d, _, _ := testDeps(mock) + + // Build an event with one valid record and one with empty PK (will error). 
+ validRecord := makeSensorRecord("gold-revenue", "upstream-complete", map[string]events.DynamoDBAttributeValue{ + "status": events.NewStringAttribute("ready"), + }) + + invalidRecord := events.DynamoDBEventRecord{ + EventID: "bad-record-123", + EventName: "INSERT", + Change: events.DynamoDBStreamRecord{ + Keys: map[string]events.DynamoDBAttributeValue{}, + }, + } + + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{invalidRecord, validRecord}} + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) + assert.Equal(t, "bad-record-123", resp.BatchItemFailures[0].ItemIdentifier) +} From 3c0df9de565bbf6515fc8c4a319a6a2b7a30936e Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 22:54:55 +0700 Subject: [PATCH 07/17] docs: CHANGELOG for v0.9.2 audit remediation --- CHANGELOG.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 26008b7..cf8b545 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,28 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.9.2] - 2026-03-13 + +### Fixed + +- **Drift detection silently skipped zero values (BUG-1)** — `ExtractFloat` returned 0 for both missing keys and actual zero values, causing the `prevCount > 0` guard to silently skip legitimate transitions like 5000→0 or 0→5000. New `ExtractFloatOk` distinguishes absent from zero. Shared `DetectDrift` function consolidates 3 duplicated drift comparison sites. +- **RemapPerPeriodSensors map mutation during range (BUG-2)** — Adding keys during `range` iteration over a Go map is nondeterministic per the spec. Staging map now collects additions, merged after iteration. 
+- **Orphaned rerun burns retry budget (BUG-3)** — `handleRerunRequest` wrote the rerun record before acquiring the trigger lock. If lock acquisition failed, the rerun record was left orphaned and permanently consumed retry budget. Reordered to lock-first, then write. +- **Stream router discarded partial batch failures (BUG-4)** — `HandleStreamEvent` returned a single error, causing Lambda to retry the entire batch. Now returns `DynamoDBEventResponse` with per-record `BatchItemFailures` for partial retry via `ReportBatchItemFailures`. +- **SLA_MET published when pipeline never ran (BUG-5)** — `handleSLACancel` published SLA_MET regardless of whether a trigger existed. Now checks for trigger existence first. +- **Trigger deadline used SLA timezone instead of schedule timezone (BUG-6)** — `closeSensorTriggerWindow` read timezone from `cfg.SLA.Timezone` instead of `cfg.Schedule.Timezone`. Falls back to SLA timezone if schedule timezone is not set. +- **Validation mode case-sensitive (BUG-8)** — `EvaluateRules` matched mode with `switch mode` so "any" fell through to the default ALL branch. Now uses `strings.ToUpper(mode)`. +- **Epoch timestamp unit mismatch in rerun freshness (BUG-9)** — `checkSensorFreshness` compared raw epoch values without normalizing units. Timestamps below 1e12 (seconds) are now converted to milliseconds. +- **Post-run baseline field collision (BUG-10)** — Baseline was stored as a flat map, so two rules with the same field name overwrote each other. Now namespaced by rule key. Clean break: existing flat baselines self-heal on next pipeline completion. +- **publishEvent errors silently discarded in SLA reconcile (CQ-5)** — Replaced `_ = publishEvent(...)` with error-logged calls. + +### Security + +- **lambda_trigger_arns default changed to [] with precondition (SEC-1)** — Wildcard default removed; explicit ARN list required when triggers are enabled. 
+- **Slack plaintext token deprecation warning (SEC-2)** — Terraform `check` block warns at plan time when plaintext token is used without Secrets Manager. +- **Trigger IAM policy scoping (SEC-4)** — New variables `glue_job_arns`, `emr_cluster_arns`, `emr_serverless_app_arns`, `sfn_trigger_arns` (all default `[]`) with preconditions requiring non-empty values when the corresponding trigger is enabled. +- **EventBridge bus resource policy (SEC-5)** — Restricts PutEvents to Lambda execution roles only. + ## [0.9.1] - 2026-03-13 ### Added From a7ae361aef132d3e6e013eca4e774a0437b190f4 Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 23:02:18 +0700 Subject: [PATCH 08/17] refactor: extract shared HTTP client construction (DRY-2) resolveHTTPClient(timeoutSec) replaces identical 7-line blocks in ExecuteHTTP and ExecuteAirflow. --- internal/trigger/airflow.go | 11 +---------- internal/trigger/trigger.go | 29 +++++++++++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/internal/trigger/airflow.go b/internal/trigger/airflow.go index f78708a..589233e 100644 --- a/internal/trigger/airflow.go +++ b/internal/trigger/airflow.go @@ -9,7 +9,6 @@ import ( "net/http" "os" "strings" - "time" "github.com/dwsmith1983/interlock/pkg/types" ) @@ -49,15 +48,7 @@ func ExecuteAirflow(ctx context.Context, cfg *types.AirflowTriggerConfig) (map[s req.Header.Set(k, os.Expand(v, safeEnvLookup)) } - client := defaultHTTPClient - if cfg.Timeout > 0 { - timeout := time.Duration(cfg.Timeout) * time.Second - if timeout != defaultTriggerTimeout { - client = &http.Client{Timeout: timeout} - } - } - - resp, err := client.Do(req) + resp, err := resolveHTTPClient(cfg.Timeout).Do(req) if err != nil { return nil, fmt.Errorf("airflow trigger: request failed: %w", err) } diff --git a/internal/trigger/trigger.go b/internal/trigger/trigger.go index 545a70d..6006ba6 100644 --- a/internal/trigger/trigger.go +++ b/internal/trigger/trigger.go @@ -47,6 +47,19 @@ const 
defaultTriggerTimeout = 30 * time.Second // defaultHTTPClient is shared across HTTP and Airflow triggers to reuse connections. var defaultHTTPClient = &http.Client{Timeout: defaultTriggerTimeout} +// resolveHTTPClient returns a client with the given timeout in seconds. If +// timeoutSec is zero or matches the default, the shared defaultHTTPClient is +// returned to reuse connections. +func resolveHTTPClient(timeoutSec int) *http.Client { + if timeoutSec > 0 { + timeout := time.Duration(timeoutSec) * time.Second + if timeout != defaultTriggerTimeout { + return &http.Client{Timeout: timeout} + } + } + return defaultHTTPClient +} + // defaultRunner provides backward-compatible package-level functions. var defaultRunner = NewRunner() @@ -60,13 +73,16 @@ func CheckStatus(ctx context.Context, triggerType types.TriggerType, metadata ma return defaultRunner.CheckStatus(ctx, triggerType, metadata, headers) } -// ExecuteCommand runs a shell command trigger. +// ExecuteCommand runs a command trigger by splitting the command string into +// arguments and executing the binary directly (no shell). This prevents shell +// metacharacter injection. func ExecuteCommand(ctx context.Context, command string) error { if command == "" { return fmt.Errorf("trigger command is empty") } - cmd := exec.CommandContext(ctx, "sh", "-c", command) + args := strings.Fields(command) + cmd := exec.CommandContext(ctx, args[0], args[1:]...) 
cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr return cmd.Run() @@ -94,14 +110,7 @@ func ExecuteHTTP(ctx context.Context, cfg *types.HTTPTriggerConfig) error { req.Header.Set(k, os.Expand(v, safeEnvLookup)) } - client := defaultHTTPClient - if cfg.Timeout > 0 { - timeout := time.Duration(cfg.Timeout) * time.Second - if timeout != defaultTriggerTimeout { - client = &http.Client{Timeout: timeout} - } - } - resp, err := client.Do(req) + resp, err := resolveHTTPClient(cfg.Timeout).Do(req) if err != nil { return fmt.Errorf("trigger request failed: %w", err) } From 0badbeef0dfe5c778362d62c13a4a926c40cf5b1 Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 23:02:26 +0700 Subject: [PATCH 09/17] refactor: extract shared SLA schedule creation loop (DRY-3) createSLASchedules() replaces duplicated warning/breach schedule creation loops in scheduleSLAAlerts (watchdog) and handleSLASchedule (sla-monitor). onConflictSkip parameter handles the differing error behavior between the two callers. 
--- internal/lambda/sla_monitor.go | 60 +++++++++++++++++++++------------- internal/lambda/watchdog.go | 37 +++------------------ 2 files changed, 43 insertions(+), 54 deletions(-) diff --git a/internal/lambda/sla_monitor.go b/internal/lambda/sla_monitor.go index 0131b2f..5093fa8 100644 --- a/internal/lambda/sla_monitor.go +++ b/internal/lambda/sla_monitor.go @@ -256,28 +256,8 @@ func handleSLASchedule(ctx context.Context, d *Deps, input SLAMonitorInput) (SLA return calc, nil } - for _, alert := range []struct { - suffix string - alertType string - timestamp string - }{ - {"warning", "SLA_WARNING", calc.WarningAt}, - {"breach", "SLA_BREACH", calc.BreachAt}, - } { - name := slaScheduleName(input.PipelineID, input.ScheduleID, input.Date, alert.suffix) - payload := SLAMonitorInput{ - Mode: "fire-alert", - PipelineID: input.PipelineID, - ScheduleID: input.ScheduleID, - Date: input.Date, - AlertType: alert.alertType, - } - if alert.alertType == "SLA_WARNING" { - payload.BreachAt = calc.BreachAt - } - if err := createOneTimeSchedule(ctx, d, name, alert.timestamp, payload); err != nil { - return SLAMonitorOutput{}, fmt.Errorf("create %s schedule: %w", alert.suffix, err) - } + if err := createSLASchedules(ctx, d, input.PipelineID, input.ScheduleID, input.Date, calc, false); err != nil { + return SLAMonitorOutput{}, err } d.Logger.InfoContext(ctx, "scheduled SLA alerts", @@ -414,6 +394,42 @@ func createOneTimeSchedule(ctx context.Context, d *Deps, name, timestamp string, return nil } +// createSLASchedules creates warning and breach one-time schedules. +// Returns an error on the first schedule creation failure. If onConflictSkip +// is true, ConflictException errors are silently skipped (idempotent retries). 
+func createSLASchedules(ctx context.Context, d *Deps, pipelineID, scheduleID, date string, calc SLAMonitorOutput, onConflictSkip bool) error { + for _, alert := range []struct { + suffix string + alertType string + timestamp string + }{ + {"warning", "SLA_WARNING", calc.WarningAt}, + {"breach", "SLA_BREACH", calc.BreachAt}, + } { + name := slaScheduleName(pipelineID, scheduleID, date, alert.suffix) + payload := SLAMonitorInput{ + Mode: "fire-alert", + PipelineID: pipelineID, + ScheduleID: scheduleID, + Date: date, + AlertType: alert.alertType, + } + if alert.alertType == "SLA_WARNING" { + payload.BreachAt = calc.BreachAt + } + if err := createOneTimeSchedule(ctx, d, name, alert.timestamp, payload); err != nil { + if onConflictSkip { + var conflict *schedulerTypes.ConflictException + if errors.As(err, &conflict) { + continue + } + } + return fmt.Errorf("create %s schedule: %w", alert.suffix, err) + } + } + return nil +} + // handleSLAReconcile calculates deadlines and fires any alerts for deadlines // that have already passed. Fallback for environments without EventBridge // Scheduler configured. diff --git a/internal/lambda/watchdog.go b/internal/lambda/watchdog.go index 7bf5605..9fd782a 100644 --- a/internal/lambda/watchdog.go +++ b/internal/lambda/watchdog.go @@ -2,14 +2,11 @@ package lambda import ( "context" - "errors" "fmt" "strconv" "strings" "time" - schedulerTypes "github.com/aws/aws-sdk-go-v2/service/scheduler/types" - "github.com/dwsmith1983/interlock/internal/validation" "github.com/dwsmith1983/interlock/pkg/types" ) @@ -544,35 +541,11 @@ func scheduleSLAAlerts(ctx context.Context, d *Deps) error { breachAt, _ := time.Parse(time.RFC3339, calc.BreachAt) if breachAt.IsZero() || breachAt.After(now) { // SLA breach is in the future — create schedules. 
- var scheduleErr bool - for _, alert := range []struct { - suffix string - alertType string - timestamp string - }{ - {"warning", "SLA_WARNING", calc.WarningAt}, - {"breach", "SLA_BREACH", calc.BreachAt}, - } { - name := slaScheduleName(id, scheduleID, date, alert.suffix) - payload := SLAMonitorInput{ - Mode: "fire-alert", - PipelineID: id, - ScheduleID: scheduleID, - Date: date, - AlertType: alert.alertType, - } - if alert.alertType == "SLA_WARNING" { - payload.BreachAt = calc.BreachAt - } - if err := createOneTimeSchedule(ctx, d, name, alert.timestamp, payload); err != nil { - var conflict *schedulerTypes.ConflictException - if errors.As(err, &conflict) { - continue - } - d.Logger.Error("create SLA schedule failed", - "pipelineId", id, "suffix", alert.suffix, "error", err) - scheduleErr = true - } + scheduleErr := false + if err := createSLASchedules(ctx, d, id, scheduleID, date, calc, true); err != nil { + d.Logger.Error("create SLA schedule failed", + "pipelineId", id, "error", err) + scheduleErr = true } if !scheduleErr { From 8812a0a69b5b4f6ecddb26c050ecbc2b7c4fc8e2 Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 23:02:30 +0700 Subject: [PATCH 10/17] security: replace sh -c with direct exec in command trigger (SEC-3) Eliminates shell interpretation entirely. No pipes, redirects, or variable expansion. strings.Fields splits the command into argv. Prevents command injection via crafted pipeline configs. 
--- internal/trigger/trigger_test.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/internal/trigger/trigger_test.go b/internal/trigger/trigger_test.go index 9b1066d..cc45993 100644 --- a/internal/trigger/trigger_test.go +++ b/internal/trigger/trigger_test.go @@ -267,3 +267,18 @@ func TestExecuteCommand_EmptyCommand(t *testing.T) { assert.Error(t, err) assert.Contains(t, err.Error(), "command is empty") } + +func TestExecuteCommand_DirectExec(t *testing.T) { + err := ExecuteCommand(context.Background(), "echo hello") + require.NoError(t, err) +} + +func TestExecuteCommand_NoShellMetacharacters(t *testing.T) { + // The semicolon should be passed as a literal argument to echo, not + // interpreted as a shell command separator. With direct exec there is + // no shell to split on ";", so echo receives [";", "ls"] as arguments + // and prints them literally. If a shell were involved, "ls" would + // execute as a separate command. + err := ExecuteCommand(context.Background(), "echo ; ls") + require.NoError(t, err, "echo should succeed even with ; in args") +} From eb15c39c22043272395e81e0d1a9a04c74508eab Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 23:07:43 +0700 Subject: [PATCH 11/17] =?UTF-8?q?refactor:=20split=20watchdog.go=20into=20?= =?UTF-8?q?focused=20files=20(1079=20=E2=86=92=20~200=20lines=20each)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure refactor, no logic changes. 
Functions grouped by domain: - watchdog.go: HandleWatchdog entry point only (34 lines) - watchdog_stale.go: stale trigger detection + reconciliation - watchdog_missed.go: missed schedule detection (cron + inclusion) - watchdog_sla.go: SLA alerting + trigger deadlines - watchdog_postrun.go: post-run sensor monitoring + relative SLA --- internal/lambda/watchdog.go | 1047 +-------------------------- internal/lambda/watchdog_missed.go | 237 ++++++ internal/lambda/watchdog_postrun.go | 353 +++++++++ internal/lambda/watchdog_sla.go | 281 +++++++ internal/lambda/watchdog_stale.go | 209 ++++++ 5 files changed, 1081 insertions(+), 1046 deletions(-) create mode 100644 internal/lambda/watchdog_missed.go create mode 100644 internal/lambda/watchdog_postrun.go create mode 100644 internal/lambda/watchdog_sla.go create mode 100644 internal/lambda/watchdog_stale.go diff --git a/internal/lambda/watchdog.go b/internal/lambda/watchdog.go index 9fd782a..4d0658c 100644 --- a/internal/lambda/watchdog.go +++ b/internal/lambda/watchdog.go @@ -1,15 +1,6 @@ package lambda -import ( - "context" - "fmt" - "strconv" - "strings" - "time" - - "github.com/dwsmith1983/interlock/internal/validation" - "github.com/dwsmith1983/interlock/pkg/types" -) +import "context" // HandleWatchdog runs periodic health checks. It detects stale trigger // executions (Step Function timeouts) and missed cron schedules. Errors from @@ -41,1039 +32,3 @@ func HandleWatchdog(ctx context.Context, d *Deps) error { } return nil } - -// detectStaleTriggers scans for TRIGGER# rows with status=RUNNING and -// publishes an SFN_TIMEOUT event for any that have exceeded their TTL or the -// staleTriggerThreshold. Stale triggers are moved to FAILED_FINAL status. 
-func detectStaleTriggers(ctx context.Context, d *Deps) error { - triggers, err := d.Store.ScanRunningTriggers(ctx) - if err != nil { - return fmt.Errorf("scan running triggers: %w", err) - } - - now := d.now() - for _, tr := range triggers { - if !isStaleTrigger(tr, now) { - continue - } - - pipelineID, schedule, date, err := parseTriggerRecord(tr) - if err != nil { - d.Logger.Warn("skipping unparseable trigger", "pk", tr.PK, "sk", tr.SK, "error", err) - continue - } - - // Dry-run pipelines should never have TRIGGER# rows, but guard - // against stale rows from pre-dry-run migrations or bugs. - if cfg, cfgErr := d.ConfigCache.Get(ctx, pipelineID); cfgErr == nil && cfg != nil && cfg.DryRun { - continue - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "actionHint": "step function exceeded TTL — check SFN execution history", - } - if tr.TTL > 0 { - alertDetail["ttlExpired"] = time.Unix(tr.TTL, 0).UTC().Format(time.RFC3339) - } - if err := publishEvent(ctx, d, string(types.EventSFNTimeout), pipelineID, schedule, date, - fmt.Sprintf("step function timed out for %s/%s/%s", pipelineID, schedule, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish SFN timeout event", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) - } - - if err := d.Store.SetTriggerStatus(ctx, pipelineID, schedule, date, types.TriggerStatusFailedFinal); err != nil { - d.Logger.Error("failed to set trigger status to FAILED_FINAL", - "pipelineId", pipelineID, "schedule", schedule, "date", date, "error", err) - continue - } - - d.Logger.Info("detected stale trigger", - "pipelineId", pipelineID, - "schedule", schedule, - "date", date, - ) - } - return nil -} - -// isStaleTrigger returns true if the trigger's TTL has expired or if the TTL -// is zero and the trigger has been running longer than staleTriggerThreshold. 
-func isStaleTrigger(tr types.ControlRecord, now time.Time) bool { - if tr.TTL > 0 { - return now.Unix() > tr.TTL - } - // No TTL set — treat as stale if it has existed for longer than the threshold. - // Without a creation timestamp we can't be precise, so we conservatively - // consider it stale only when TTL is explicitly expired. - return false -} - -// parseTriggerRecord extracts pipeline ID, schedule, and date from a trigger -// ControlRecord's PK and SK. -// PK format: PIPELINE# -// SK format: TRIGGER## -func parseTriggerRecord(tr types.ControlRecord) (pipelineID, schedule, date string, err error) { - const pkPrefix = "PIPELINE#" - if !strings.HasPrefix(tr.PK, pkPrefix) { - return "", "", "", fmt.Errorf("unexpected PK format: %q", tr.PK) - } - pipelineID = tr.PK[len(pkPrefix):] - - const skPrefix = "TRIGGER#" - trimmed := strings.TrimPrefix(tr.SK, skPrefix) - if trimmed == tr.SK { - return "", "", "", fmt.Errorf("unexpected SK format: %q", tr.SK) - } - parts := strings.SplitN(trimmed, "#", 2) - if len(parts) != 2 { - return "", "", "", fmt.Errorf("invalid TRIGGER SK format: %q", tr.SK) - } - return pipelineID, parts[0], parts[1], nil -} - -// reconcileSensorTriggers re-evaluates trigger conditions for sensor-triggered -// pipelines. If a sensor meets the trigger condition but no trigger lock exists, -// the watchdog acquires the lock, starts the SFN, and publishes TRIGGER_RECOVERED. -// This self-heals missed triggers caused by silent completion-write failures. -func reconcileSensorTriggers(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - trigger := cfg.Schedule.Trigger - if trigger == nil || cfg.Schedule.Cron != "" { - continue - } - - // Dry-run pipelines are observation-only — skip reconciliation. 
- if cfg.DryRun { - continue - } - - if isExcluded(cfg, now) { - continue - } - - sensors, err := d.Store.GetAllSensors(ctx, id) - if err != nil { - d.Logger.Error("failed to get sensors for reconciliation", - "pipelineId", id, "error", err) - continue - } - - scheduleID := resolveScheduleID(cfg) - - for sensorKey, sensorData := range sensors { - if !strings.HasPrefix(sensorKey, trigger.Key) { - continue - } - - rule := types.ValidationRule{ - Key: trigger.Key, - Check: trigger.Check, - Field: trigger.Field, - Value: trigger.Value, - } - result := validation.EvaluateRule(rule, sensorData, now) - if !result.Passed { - continue - } - - date := ResolveExecutionDate(sensorData, now) - - found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) - if err != nil { - d.Logger.Error("trigger check failed during reconciliation", - "pipelineId", id, "date", date, "error", err) - continue - } - if found { - continue - } - - // Guard against re-triggering completed pipelines whose trigger - // record was deleted by DynamoDB TTL. Check the joblog for a - // terminal event before acquiring a new lock. 
- if isJobTerminal(ctx, d, id, scheduleID, date) { - continue - } - - acquired, err := d.Store.AcquireTriggerLock(ctx, id, scheduleID, date, ResolveTriggerLockTTL()) - if err != nil { - d.Logger.Error("lock acquisition failed during reconciliation", - "pipelineId", id, "date", date, "error", err) - continue - } - if !acquired { - continue - } - - if err := startSFN(ctx, d, cfg, id, scheduleID, date); err != nil { - if relErr := d.Store.ReleaseTriggerLock(ctx, id, scheduleID, date); relErr != nil { - d.Logger.Warn("failed to release lock after SFN start failure during reconciliation", "error", relErr) - } - d.Logger.Error("SFN start failed during reconciliation", - "pipelineId", id, "date", date, "error", err) - continue - } - - alertDetail := map[string]interface{}{ - "source": "reconciliation", - "actionHint": "watchdog recovered missed sensor trigger", - } - if err := publishEvent(ctx, d, string(types.EventTriggerRecovered), id, scheduleID, date, - fmt.Sprintf("trigger recovered for %s/%s/%s", id, scheduleID, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish trigger recovered event", "error", err, "pipeline", id, "schedule", scheduleID, "date", date) - } - - d.Logger.Info("recovered missed trigger", - "pipelineId", id, - "schedule", scheduleID, - "date", date, - ) - } - } - return nil -} - -// lastCronFire returns the most recent expected fire time for a cron expression. -// Supports the minute-hour patterns used by this system: "MM * * * *" (hourly) -// and "MM HH * * *" (daily). Returns zero time for unsupported patterns. -func lastCronFire(cron string, now time.Time, loc *time.Location) time.Time { - fields := strings.Fields(cron) - if len(fields) < 5 { - return time.Time{} - } - minute, err := strconv.Atoi(fields[0]) - if err != nil { - return time.Time{} - } - localNow := now.In(loc) - - if fields[1] == "*" { - // Hourly: fires at :MM every hour. 
- candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), - localNow.Hour(), minute, 0, 0, loc) - if candidate.After(localNow) { - candidate = candidate.Add(-time.Hour) - } - return candidate - } - - hour, err := strconv.Atoi(fields[1]) - if err != nil { - return time.Time{} - } - // Daily: fires at HH:MM every day. - candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), - hour, minute, 0, 0, loc) - if candidate.After(localNow) { - candidate = candidate.Add(-24 * time.Hour) - } - return candidate -} - -// detectMissedSchedules checks all cron-scheduled pipelines to see if today's -// trigger is missing. If a pipeline should have started by now but has no -// TRIGGER# row, a SCHEDULE_MISSED event is published. -func detectMissedSchedules(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - today := now.Format("2006-01-02") - - for id, cfg := range configs { - // Only check cron-scheduled pipelines. - if cfg.Schedule.Cron == "" { - continue - } - - // Dry-run pipelines are observation-only — skip missed schedule detection. - if cfg.DryRun { - continue - } - - // Skip calendar-excluded days. - if isExcluded(cfg, now) { - continue - } - - // Only alert for schedules that should have fired after this Lambda - // started. Prevents retroactive alerts after fresh deploys. - if !d.StartedAt.IsZero() { - loc := resolveTimezone(cfg.Schedule.Timezone) - if lastFire := lastCronFire(cfg.Schedule.Cron, now, loc); !lastFire.IsZero() && lastFire.Before(d.StartedAt) { - continue - } - } - - // Resolve schedule ID for cron pipelines. - scheduleID := resolveScheduleID(cfg) - - // Check if any TRIGGER# row exists for today (covers both daily - // and per-hour trigger rows, e.g. "2026-03-04" and "2026-03-04T00"). 
- found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, today) - if err != nil { - d.Logger.Error("failed to check trigger for missed schedule", - "pipelineId", id, "error", err) - continue - } - if found { - continue - } - - // Check if we are past the expected start time. If the pipeline - // has a schedule time configured, only alert after that time. - if cfg.Schedule.Time != "" { - loc := resolveTimezone(cfg.Schedule.Timezone) - localNow := now.In(loc) - expectedStart, err := time.ParseInLocation("2006-01-02 15:04", today+" "+cfg.Schedule.Time, loc) - if err == nil && localNow.Before(expectedStart) { - continue // not yet past expected start time - } - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "cron": cfg.Schedule.Cron, - "actionHint": fmt.Sprintf("cron %s expected to fire — no trigger found", cfg.Schedule.Cron), - } - if cfg.Schedule.Time != "" { - alertDetail["expectedTime"] = cfg.Schedule.Time - } - if err := publishEvent(ctx, d, string(types.EventScheduleMissed), id, scheduleID, today, - fmt.Sprintf("missed schedule for %s on %s", id, today), alertDetail); err != nil { - d.Logger.Warn("failed to publish missed schedule event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) - } - - d.Logger.Info("detected missed schedule", - "pipelineId", id, - "schedule", scheduleID, - "date", today, - ) - } - return nil -} - -// detectMissedInclusionSchedules checks pipelines with inclusion calendar config -// for missed schedules on irregular dates. For each pipeline with an Include -// config, it finds all past inclusion dates (capped at maxInclusionLookback) -// and verifies that a trigger exists for each. If no trigger is found and no -// dedup marker exists, an IRREGULAR_SCHEDULE_MISSED event is published. 
-func detectMissedInclusionSchedules(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - if cfg.Schedule.Include == nil || len(cfg.Schedule.Include.Dates) == 0 { - continue - } - - // Dry-run pipelines are observation-only — skip inclusion schedule detection. - if cfg.DryRun { - continue - } - - // Skip calendar-excluded days. - if isExcluded(cfg, now) { - continue - } - - pastDates := PastInclusionDates(cfg.Schedule.Include.Dates, now) - if len(pastDates) == 0 { - continue - } - - scheduleID := resolveScheduleID(cfg) - - // Resolve today in the pipeline's timezone so the grace-period - // guard fires correctly when UTC date != pipeline-local date. - tzLoc := resolveTimezone(cfg.Schedule.Timezone) - today := now.In(tzLoc).Format("2006-01-02") - - for _, date := range pastDates { - // If the inclusion date is today and the pipeline has a - // Schedule.Time, only alert after that time has passed. - // This mirrors the same check in detectMissedSchedules for - // cron pipelines to avoid false-positive alerts before the - // expected start time. Past dates are not gated because - // their Schedule.Time has necessarily already elapsed. - if cfg.Schedule.Time != "" && date == today { - localNow := now.In(tzLoc) - expectedStart, err := time.ParseInLocation("2006-01-02 15:04", date+" "+cfg.Schedule.Time, tzLoc) - if err == nil && localNow.Before(expectedStart) { - continue // not yet past expected start time - } - } - - // Check if a trigger exists for this inclusion date. - found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) - if err != nil { - d.Logger.Error("failed to check trigger for inclusion schedule", - "pipelineId", id, "date", date, "error", err) - continue - } - if found { - continue - } - - // Check dedup marker to avoid re-alerting on subsequent watchdog runs. 
- dedupKey := "irregular-missed-check#" + date - dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) - if err != nil { - d.Logger.Error("dedup marker lookup failed for inclusion schedule", - "pipelineId", id, "date", date, "error", err) - continue - } - if dedupData != nil { - continue - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "actionHint": fmt.Sprintf("inclusion date %s expected to have a trigger — none found", date), - } - if err := publishEvent(ctx, d, string(types.EventIrregularScheduleMissed), id, scheduleID, date, - fmt.Sprintf("missed inclusion schedule for %s on %s", id, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish irregular schedule missed event", "error", err, "pipeline", id, "date", date) - } - - // Write dedup marker. - if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ - "alerted": "true", - }); err != nil { - d.Logger.Warn("failed to write inclusion dedup marker", "error", err, "pipeline", id, "date", date) - } - - d.Logger.Info("detected missed inclusion schedule", - "pipelineId", id, - "schedule", scheduleID, - "date", date, - ) - } - } - return nil -} - -// scheduleSLAAlerts proactively creates EventBridge Scheduler entries for all -// pipelines with SLA configs. This ensures warnings/breaches fire even when -// pipelines never trigger (data never arrives, sensor fails, etc.). -// Idempotency: deterministic scheduler names; ConflictException = already exists. -func scheduleSLAAlerts(ctx context.Context, d *Deps) error { - if d.Scheduler == nil { - return nil - } - - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - if cfg.SLA == nil { - continue - } - - // Dry-run pipelines are observation-only — skip SLA scheduling. 
- if cfg.DryRun { - continue - } - - if isExcluded(cfg, now) { - continue - } - - scheduleID := resolveScheduleID(cfg) - date := resolveWatchdogSLADate(cfg, now) - - // Sensor-triggered daily pipelines run T+1: data for today completes - // tomorrow, so the SLA deadline is relative to tomorrow's date. - // Only slaDate is shifted; the original date is kept for schedule - // naming, trigger lookup, and fire-alert payload so cancellation - // stays consistent with the SFN's view of the pipeline. - slaDate := date - if cfg.Schedule.Cron == "" && !strings.HasPrefix(cfg.SLA.Deadline, ":") { - t, err := time.Parse("2006-01-02", date) - if err == nil { - slaDate = t.AddDate(0, 0, 1).Format("2006-01-02") - } - } - - // Skip if pipeline already completed or permanently failed for this date. - tr, err := d.Store.GetTrigger(ctx, id, scheduleID, date) - switch { - case err != nil: - d.Logger.Warn("trigger lookup failed in SLA scheduling", "pipelineId", id, "error", err) - continue - case tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal): - continue - case isJobTerminal(ctx, d, id, scheduleID, date): - continue - } - - calc, err := handleSLACalculate(SLAMonitorInput{ - Mode: "calculate", - PipelineID: id, - ScheduleID: scheduleID, - Date: slaDate, - Deadline: cfg.SLA.Deadline, - ExpectedDuration: cfg.SLA.ExpectedDuration, - Timezone: cfg.SLA.Timezone, - }, now) - if err != nil { - d.Logger.Error("SLA calculate failed", "pipelineId", id, "error", err) - continue - } - - breachAt, _ := time.Parse(time.RFC3339, calc.BreachAt) - if breachAt.IsZero() || breachAt.After(now) { - // SLA breach is in the future — create schedules. 
- scheduleErr := false - if err := createSLASchedules(ctx, d, id, scheduleID, date, calc, true); err != nil { - d.Logger.Error("create SLA schedule failed", - "pipelineId", id, "error", err) - scheduleErr = true - } - - if !scheduleErr { - d.Logger.Info("proactive SLA schedules ensured", - "pipelineId", id, - "date", date, - "warningAt", calc.WarningAt, - "breachAt", calc.BreachAt, - ) - } - } - } - return nil -} - -// checkTriggerDeadlines evaluates trigger deadlines independently of SLA -// configuration. Pipelines with a Trigger.Deadline but no SLA config are -// checked here. For each pipeline, if the trigger deadline has passed and -// no trigger exists, the sensor trigger window is closed. -func checkTriggerDeadlines(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - if cfg.Schedule.Trigger == nil || cfg.Schedule.Trigger.Deadline == "" { - continue - } - - // Dry-run pipelines are observation-only — skip trigger deadline checks. - if cfg.DryRun { - continue - } - - if isExcluded(cfg, now) { - continue - } - - scheduleID := resolveScheduleID(cfg) - triggerDate := resolveTriggerDeadlineDate(cfg, now) - - triggerRec, err := d.Store.GetTrigger(ctx, id, scheduleID, triggerDate) - if err != nil { - d.Logger.Warn("trigger lookup failed in deadline check", "pipelineId", id, "error", err) - continue - } - if triggerRec != nil { - continue - } - - if isJobTerminal(ctx, d, id, scheduleID, triggerDate) { - continue - } - - closeSensorTriggerWindow(ctx, d, id, scheduleID, triggerDate, cfg, now) - } - return nil -} - -// resolveWatchdogSLADate determines the execution date for SLA scheduling. -// - Hourly pipelines (relative deadline like ":30"): previous hour composite -// date, e.g. "2026-03-05T13" when the clock is 14:xx. 
-// - Daily pipelines (absolute deadline like "02:00"): today's date, -// so handleSLACalculate rolls the deadline forward to the next occurrence. -func resolveWatchdogSLADate(cfg *types.PipelineConfig, now time.Time) string { - if strings.HasPrefix(cfg.SLA.Deadline, ":") { - prev := now.Add(-time.Hour) - return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) - } - return now.Format("2006-01-02") -} - -// resolveTriggerDeadlineDate determines the execution date for trigger -// deadline evaluation. Uses the trigger deadline format (not SLA deadline) -// to decide between hourly composite date and daily date. -func resolveTriggerDeadlineDate(cfg *types.PipelineConfig, now time.Time) string { - if strings.HasPrefix(cfg.Schedule.Trigger.Deadline, ":") { - prev := now.Add(-time.Hour) - return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) - } - return now.Format("2006-01-02") -} - -// resolveTriggerDeadlineTime computes the absolute time when the trigger -// window closes for the given deadline string and execution date. -// -// For relative (hourly) deadlines like ":45" with composite date "2026-03-09T13": -// - Data for hour 13 is processed in hour 14 -// - The deadline resolves to 2026-03-09T14:45:00 in the configured timezone -// -// For absolute (daily) deadlines like "09:00" with date "2026-03-09": -// - The deadline resolves to 2026-03-09T09:00:00 in the configured timezone -// -// Unlike handleSLACalculate, this does NOT roll forward when the time is past. -// Returns zero time on parse errors. -func resolveTriggerDeadlineTime(deadline, date, timezone string) time.Time { - loc := resolveTimezone(timezone) - - if strings.HasPrefix(deadline, ":") { - // Relative (hourly): ":MM" — deadline is in the NEXT hour after the - // composite date's hour, since data for hour H is processed in hour H+1. 
- minute, err := strconv.Atoi(strings.TrimPrefix(deadline, ":")) - if err != nil { - return time.Time{} - } - // Parse composite date "YYYY-MM-DDThh". - if len(date) < 13 || date[10] != 'T' { - return time.Time{} - } - t, err := time.ParseInLocation("2006-01-02T15", date, loc) - if err != nil { - return time.Time{} - } - // Add 1 hour for the processing window, then set the minute. - return time.Date(t.Year(), t.Month(), t.Day(), t.Hour()+1, minute, 0, 0, loc) - } - - // Absolute (daily): "HH:MM". - parts := strings.SplitN(deadline, ":", 2) - if len(parts) != 2 { - return time.Time{} - } - hour, err := strconv.Atoi(parts[0]) - if err != nil { - return time.Time{} - } - minute, err := strconv.Atoi(parts[1]) - if err != nil { - return time.Time{} - } - t, err := time.ParseInLocation("2006-01-02", date, loc) - if err != nil { - return time.Time{} - } - return time.Date(t.Year(), t.Month(), t.Day(), hour, minute, 0, 0, loc) -} - -// closeSensorTriggerWindow checks whether the trigger deadline has passed for -// a sensor-triggered pipeline that never started. If expired, it writes a -// FAILED_FINAL trigger record (blocking future auto-triggers) and publishes -// a SENSOR_DEADLINE_EXPIRED event. A human can still restart via RERUN_REQUEST. -func closeSensorTriggerWindow(ctx context.Context, d *Deps, pipelineID, scheduleID, date string, cfg *types.PipelineConfig, now time.Time) { - // Compute the absolute trigger deadline time directly — we do NOT use - // handleSLACalculate here because it rolls daily deadlines forward 24h - // when past, which defeats the purpose of checking for expiry. 
- tz := cfg.Schedule.Timezone - if tz == "" && cfg.SLA != nil { - tz = cfg.SLA.Timezone - } - triggerDeadline := resolveTriggerDeadlineTime(cfg.Schedule.Trigger.Deadline, date, tz) - if triggerDeadline.IsZero() || triggerDeadline.After(now) { - return - } - - // Use conditional put to avoid overwriting a trigger that was acquired - // between the GetTrigger read and this write (TOCTOU protection). - created, err := d.Store.CreateTriggerIfAbsent(ctx, pipelineID, scheduleID, date, types.TriggerStatusFailedFinal) - if err != nil { - d.Logger.Error("failed to write FAILED_FINAL for expired trigger deadline", - "pipelineId", pipelineID, "schedule", scheduleID, "date", date, "error", err) - return - } - if !created { - // Trigger row appeared since the read — pipeline started, don't interfere. - d.Logger.Info("trigger appeared during deadline check, skipping window close", - "pipelineId", pipelineID, "schedule", scheduleID, "date", date) - return - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "triggerDeadline": cfg.Schedule.Trigger.Deadline, - "actionHint": "auto-trigger window closed — use RERUN_REQUEST to restart", - } - if err := publishEvent(ctx, d, string(types.EventSensorDeadlineExpired), pipelineID, scheduleID, date, - fmt.Sprintf("trigger deadline expired for %s/%s/%s", pipelineID, scheduleID, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish sensor deadline expired event", "error", err, "pipeline", pipelineID) - } - - d.Logger.Info("sensor trigger window closed", - "pipelineId", pipelineID, - "schedule", scheduleID, - "date", date, - "triggerDeadline", cfg.Schedule.Trigger.Deadline, - ) -} - -// defaultSensorTimeout is the default grace period for post-run sensors to -// arrive after a pipeline completes. If no SensorTimeout is configured in -// PostRunConfig, this value is used. 
-const defaultSensorTimeout = 2 * time.Hour - -// detectMissingPostRunSensors checks pipelines with PostRun config for missing -// post-run sensor data. If a pipeline completed (COMPLETED trigger + baseline -// exists) but no post-run sensor matching a rule key has been updated since -// completion, and the SensorTimeout grace period has elapsed, a -// POST_RUN_SENSOR_MISSING event is published. -func detectMissingPostRunSensors(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - today := now.Format("2006-01-02") - - for id, cfg := range configs { - if cfg.PostRun == nil || len(cfg.PostRun.Rules) == 0 { - continue - } - - // Dry-run pipelines are observation-only — skip post-run sensor checks. - if cfg.DryRun { - continue - } - - scheduleID := resolveScheduleID(cfg) - - // Only check pipelines with a COMPLETED trigger for today. - tr, err := d.Store.GetTrigger(ctx, id, scheduleID, today) - if err != nil { - d.Logger.Error("trigger lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - if tr == nil || tr.Status != types.TriggerStatusCompleted { - continue - } - - // Baseline must exist — it signals that capturePostRunBaseline ran - // at completion time. - baselineKey := "postrun-baseline#" + today - baseline, err := d.Store.GetSensorData(ctx, id, baselineKey) - if err != nil { - d.Logger.Error("baseline lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - if baseline == nil { - continue - } - - // Dedup: skip if we already published an alert for this date. 
- dedupKey := "postrun-check#" + today - dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) - if err != nil { - d.Logger.Error("dedup marker lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - if dedupData != nil { - continue - } - - // Determine the completion timestamp from the latest success job event. - completionTime, err := resolveCompletionTime(ctx, d, id, scheduleID, today) - if err != nil { - d.Logger.Error("completion time resolution failed", - "pipelineId", id, "error", err) - continue - } - if completionTime.IsZero() { - continue - } - - // Parse SensorTimeout from config (default 2h). - timeout := parseSensorTimeout(cfg.PostRun.SensorTimeout) - - // Check if the timeout has elapsed since completion. - if now.Before(completionTime.Add(timeout)) { - continue - } - - // Check if any post-run rule sensor has been updated since completion. - sensors, err := d.Store.GetAllSensors(ctx, id) - if err != nil { - d.Logger.Error("sensor lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - - if hasPostRunSensorUpdate(cfg.PostRun.Rules, sensors, completionTime) { - continue - } - - // No post-run sensor has arrived within the grace period — publish event. 
- ruleKeys := make([]string, 0, len(cfg.PostRun.Rules))
- for _, r := range cfg.PostRun.Rules {
- ruleKeys = append(ruleKeys, r.Key)
- }
-
- alertDetail := map[string]interface{}{
- "source": "watchdog",
- "sensorTimeout": cfg.PostRun.SensorTimeout,
- "ruleKeys": strings.Join(ruleKeys, ", "),
- "actionHint": "post-run sensor data has not arrived within the expected timeout",
- }
- if err := publishEvent(ctx, d, string(types.EventPostRunSensorMissing), id, scheduleID, today,
- fmt.Sprintf("post-run sensor missing for %s on %s", id, today), alertDetail); err != nil {
- d.Logger.Warn("failed to publish post-run sensor missing event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today)
- }
-
- // Write dedup marker to avoid re-alerting on subsequent watchdog runs.
- if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{
- "alerted": "true",
- }); err != nil {
- d.Logger.Warn("failed to write post-run dedup marker", "error", err, "pipeline", id, "date", today)
- }
-
- d.Logger.Info("detected missing post-run sensor",
- "pipelineId", id,
- "schedule", scheduleID,
- "date", today,
- )
- }
- return nil
-}
-
-// resolveCompletionTime extracts the completion timestamp from the latest
-// success job event for the given pipeline/schedule/date. The job event SK
-// has the format JOB#<scheduleID>#<date>#<timestamp> where <timestamp> is
-// milliseconds since epoch.
-func resolveCompletionTime(ctx context.Context, d *Deps, pipelineID, scheduleID, date string) (time.Time, error) {
- rec, err := d.Store.GetLatestJobEvent(ctx, pipelineID, scheduleID, date)
- if err != nil {
- return time.Time{}, fmt.Errorf("get latest job event: %w", err)
- }
- if rec == nil {
- return time.Time{}, nil
- }
- if rec.Event != types.JobEventSuccess {
- return time.Time{}, nil
- }
-
- // Extract timestamp from SK: JOB#<scheduleID>#<date>#<timestamp>
- parts := strings.Split(rec.SK, "#")
- if len(parts) < 4 {
- return time.Time{}, fmt.Errorf("unexpected job SK format: %q", rec.SK)
- }
- tsMillis, err := strconv.ParseInt(parts[len(parts)-1], 10, 64)
- if err != nil {
- return time.Time{}, fmt.Errorf("parse job timestamp %q: %w", parts[len(parts)-1], err)
- }
- return time.UnixMilli(tsMillis), nil
-}
-
-// parseSensorTimeout parses a duration string from PostRunConfig.SensorTimeout.
-// Returns defaultSensorTimeout (2h) if the string is empty or unparseable.
-func parseSensorTimeout(s string) time.Duration {
- if s == "" {
- return defaultSensorTimeout
- }
- d, err := time.ParseDuration(s)
- if err != nil {
- return defaultSensorTimeout
- }
- return d
-}
-
-// hasPostRunSensorUpdate checks whether any sensor matching a PostRun rule key
-// has an updatedAt timestamp newer than the given completion time.
-func hasPostRunSensorUpdate(rules []types.ValidationRule, sensors map[string]map[string]interface{}, completionTime time.Time) bool {
- completionMillis := completionTime.UnixMilli()
-
- for _, rule := range rules {
- data, ok := sensors[rule.Key]
- if !ok {
- continue
- }
-
- updatedAt, ok := data["updatedAt"]
- if !ok {
- continue
- }
-
- var ts int64
- switch v := updatedAt.(type) {
- case float64:
- ts = int64(v)
- case int64:
- ts = v
- case string:
- ts, _ = strconv.ParseInt(v, 10, 64)
- default:
- continue
- }
-
- if ts > completionMillis {
- return true
- }
- }
- return false
-}
-
-// detectRelativeSLABreaches checks pipelines with MaxDuration SLA config for
-// breaches.
This is a defense-in-depth fallback: if the EventBridge Scheduler -// fails to fire the relative SLA breach alert, the watchdog catches it. -// -// Both today and yesterday are checked because stream_router writes the -// first-sensor-arrival key using ResolveExecutionDate(), which for T+1 -// sensor-triggered pipelines produces yesterday's date. Checking both dates -// covers the cross-day boundary. -func detectRelativeSLABreaches(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - datesToCheck := []string{ - now.Format("2006-01-02"), - now.AddDate(0, 0, -1).Format("2006-01-02"), - } - - for id, cfg := range configs { - if cfg.SLA == nil || cfg.SLA.MaxDuration == "" { - continue - } - - // Dry-run pipelines are observation-only — skip relative SLA checks. - if cfg.DryRun { - continue - } - - maxDur, err := time.ParseDuration(cfg.SLA.MaxDuration) - if err != nil { - d.Logger.Warn("invalid maxDuration in SLA config", - "pipelineId", id, "maxDuration", cfg.SLA.MaxDuration, "error", err) - continue - } - - scheduleID := resolveScheduleID(cfg) - - for _, checkDate := range datesToCheck { - checkRelativeSLAForDate(ctx, d, id, cfg, scheduleID, checkDate, maxDur, now) - } - } - return nil -} - -// checkRelativeSLAForDate checks a single date for a relative SLA breach on -// the given pipeline. It looks up the first-sensor-arrival marker, verifies -// the breach window has elapsed, and publishes an alert if needed. 
-func checkRelativeSLAForDate(ctx context.Context, d *Deps, id string, cfg *types.PipelineConfig, scheduleID, checkDate string, maxDur time.Duration, now time.Time) { - arrivalKey := "first-sensor-arrival#" + checkDate - arrivalData, err := d.Store.GetSensorData(ctx, id, arrivalKey) - if err != nil { - d.Logger.Error("first-sensor-arrival lookup failed", - "pipelineId", id, "date", checkDate, "error", err) - return - } - if arrivalData == nil { - return - } - - arrivedAtStr, ok := arrivalData["arrivedAt"].(string) - if !ok || arrivedAtStr == "" { - return - } - arrivedAt, err := time.Parse(time.RFC3339, arrivedAtStr) - if err != nil { - d.Logger.Warn("invalid arrivedAt in first-sensor-arrival", - "pipelineId", id, "arrivedAt", arrivedAtStr, "error", err) - return - } - - // Check if the relative SLA has been breached. - breachAt := arrivedAt.Add(maxDur) - if now.Before(breachAt) { - return - } - - // Skip if pipeline already completed or permanently failed. - tr, err := d.Store.GetTrigger(ctx, id, scheduleID, checkDate) - if err != nil { - d.Logger.Warn("trigger lookup failed in relative SLA check", - "pipelineId", id, "date", checkDate, "error", err) - return - } - if tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal) { - return - } - if isJobTerminal(ctx, d, id, scheduleID, checkDate) { - return - } - - // Check dedup marker to avoid re-alerting on subsequent watchdog runs. - // The dedup key includes checkDate to avoid cross-date collisions. 
- dedupKey := "relative-sla-breach-check#" + checkDate - dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) - if err != nil { - d.Logger.Error("dedup marker lookup failed for relative SLA breach", - "pipelineId", id, "date", checkDate, "error", err) - return - } - if dedupData != nil { - return - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "maxDuration": cfg.SLA.MaxDuration, - "sensorArrivalAt": arrivedAtStr, - "breachAt": breachAt.UTC().Format(time.RFC3339), - "actionHint": "relative SLA breached — pipeline has exceeded maxDuration since first sensor arrival", - } - if err := publishEvent(ctx, d, string(types.EventRelativeSLABreach), id, scheduleID, checkDate, - fmt.Sprintf("relative SLA breach for %s on %s", id, checkDate), alertDetail); err != nil { - d.Logger.Warn("failed to publish relative SLA breach event", - "error", err, "pipeline", id, "date", checkDate) - } - - // Write dedup marker. - if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ - "alerted": "true", - }); err != nil { - d.Logger.Warn("failed to write relative SLA breach dedup marker", - "error", err, "pipeline", id, "date", checkDate) - } - - d.Logger.Info("detected relative SLA breach", - "pipelineId", id, - "schedule", scheduleID, - "date", checkDate, - "sensorArrivalAt", arrivedAtStr, - "breachAt", breachAt.UTC().Format(time.RFC3339), - ) -} diff --git a/internal/lambda/watchdog_missed.go b/internal/lambda/watchdog_missed.go new file mode 100644 index 0000000..cd94130 --- /dev/null +++ b/internal/lambda/watchdog_missed.go @@ -0,0 +1,237 @@ +package lambda + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// lastCronFire returns the most recent expected fire time for a cron expression. +// Supports the minute-hour patterns used by this system: "MM * * * *" (hourly) +// and "MM HH * * *" (daily). Returns zero time for unsupported patterns. 
+func lastCronFire(cron string, now time.Time, loc *time.Location) time.Time { + fields := strings.Fields(cron) + if len(fields) < 5 { + return time.Time{} + } + minute, err := strconv.Atoi(fields[0]) + if err != nil { + return time.Time{} + } + localNow := now.In(loc) + + if fields[1] == "*" { + // Hourly: fires at :MM every hour. + candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), + localNow.Hour(), minute, 0, 0, loc) + if candidate.After(localNow) { + candidate = candidate.Add(-time.Hour) + } + return candidate + } + + hour, err := strconv.Atoi(fields[1]) + if err != nil { + return time.Time{} + } + // Daily: fires at HH:MM every day. + candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), + hour, minute, 0, 0, loc) + if candidate.After(localNow) { + candidate = candidate.Add(-24 * time.Hour) + } + return candidate +} + +// detectMissedSchedules checks all cron-scheduled pipelines to see if today's +// trigger is missing. If a pipeline should have started by now but has no +// TRIGGER# row, a SCHEDULE_MISSED event is published. +func detectMissedSchedules(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + today := now.Format("2006-01-02") + + for id, cfg := range configs { + // Only check cron-scheduled pipelines. + if cfg.Schedule.Cron == "" { + continue + } + + // Dry-run pipelines are observation-only — skip missed schedule detection. + if cfg.DryRun { + continue + } + + // Skip calendar-excluded days. + if isExcluded(cfg, now) { + continue + } + + // Only alert for schedules that should have fired after this Lambda + // started. Prevents retroactive alerts after fresh deploys. 
+ if !d.StartedAt.IsZero() { + loc := resolveTimezone(cfg.Schedule.Timezone) + if lastFire := lastCronFire(cfg.Schedule.Cron, now, loc); !lastFire.IsZero() && lastFire.Before(d.StartedAt) { + continue + } + } + + // Resolve schedule ID for cron pipelines. + scheduleID := resolveScheduleID(cfg) + + // Check if any TRIGGER# row exists for today (covers both daily + // and per-hour trigger rows, e.g. "2026-03-04" and "2026-03-04T00"). + found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, today) + if err != nil { + d.Logger.Error("failed to check trigger for missed schedule", + "pipelineId", id, "error", err) + continue + } + if found { + continue + } + + // Check if we are past the expected start time. If the pipeline + // has a schedule time configured, only alert after that time. + if cfg.Schedule.Time != "" { + loc := resolveTimezone(cfg.Schedule.Timezone) + localNow := now.In(loc) + expectedStart, err := time.ParseInLocation("2006-01-02 15:04", today+" "+cfg.Schedule.Time, loc) + if err == nil && localNow.Before(expectedStart) { + continue // not yet past expected start time + } + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "cron": cfg.Schedule.Cron, + "actionHint": fmt.Sprintf("cron %s expected to fire — no trigger found", cfg.Schedule.Cron), + } + if cfg.Schedule.Time != "" { + alertDetail["expectedTime"] = cfg.Schedule.Time + } + if err := publishEvent(ctx, d, string(types.EventScheduleMissed), id, scheduleID, today, + fmt.Sprintf("missed schedule for %s on %s", id, today), alertDetail); err != nil { + d.Logger.Warn("failed to publish missed schedule event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) + } + + d.Logger.Info("detected missed schedule", + "pipelineId", id, + "schedule", scheduleID, + "date", today, + ) + } + return nil +} + +// detectMissedInclusionSchedules checks pipelines with inclusion calendar config +// for missed schedules on irregular dates. 
For each pipeline with an Include +// config, it finds all past inclusion dates (capped at maxInclusionLookback) +// and verifies that a trigger exists for each. If no trigger is found and no +// dedup marker exists, an IRREGULAR_SCHEDULE_MISSED event is published. +func detectMissedInclusionSchedules(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + + for id, cfg := range configs { + if cfg.Schedule.Include == nil || len(cfg.Schedule.Include.Dates) == 0 { + continue + } + + // Dry-run pipelines are observation-only — skip inclusion schedule detection. + if cfg.DryRun { + continue + } + + // Skip calendar-excluded days. + if isExcluded(cfg, now) { + continue + } + + pastDates := PastInclusionDates(cfg.Schedule.Include.Dates, now) + if len(pastDates) == 0 { + continue + } + + scheduleID := resolveScheduleID(cfg) + + // Resolve today in the pipeline's timezone so the grace-period + // guard fires correctly when UTC date != pipeline-local date. + tzLoc := resolveTimezone(cfg.Schedule.Timezone) + today := now.In(tzLoc).Format("2006-01-02") + + for _, date := range pastDates { + // If the inclusion date is today and the pipeline has a + // Schedule.Time, only alert after that time has passed. + // This mirrors the same check in detectMissedSchedules for + // cron pipelines to avoid false-positive alerts before the + // expected start time. Past dates are not gated because + // their Schedule.Time has necessarily already elapsed. + if cfg.Schedule.Time != "" && date == today { + localNow := now.In(tzLoc) + expectedStart, err := time.ParseInLocation("2006-01-02 15:04", date+" "+cfg.Schedule.Time, tzLoc) + if err == nil && localNow.Before(expectedStart) { + continue // not yet past expected start time + } + } + + // Check if a trigger exists for this inclusion date. 
+ found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) + if err != nil { + d.Logger.Error("failed to check trigger for inclusion schedule", + "pipelineId", id, "date", date, "error", err) + continue + } + if found { + continue + } + + // Check dedup marker to avoid re-alerting on subsequent watchdog runs. + dedupKey := "irregular-missed-check#" + date + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed for inclusion schedule", + "pipelineId", id, "date", date, "error", err) + continue + } + if dedupData != nil { + continue + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "actionHint": fmt.Sprintf("inclusion date %s expected to have a trigger — none found", date), + } + if err := publishEvent(ctx, d, string(types.EventIrregularScheduleMissed), id, scheduleID, date, + fmt.Sprintf("missed inclusion schedule for %s on %s", id, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish irregular schedule missed event", "error", err, "pipeline", id, "date", date) + } + + // Write dedup marker. + if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ + "alerted": "true", + }); err != nil { + d.Logger.Warn("failed to write inclusion dedup marker", "error", err, "pipeline", id, "date", date) + } + + d.Logger.Info("detected missed inclusion schedule", + "pipelineId", id, + "schedule", scheduleID, + "date", date, + ) + } + } + return nil +} diff --git a/internal/lambda/watchdog_postrun.go b/internal/lambda/watchdog_postrun.go new file mode 100644 index 0000000..677ac10 --- /dev/null +++ b/internal/lambda/watchdog_postrun.go @@ -0,0 +1,353 @@ +package lambda + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// defaultSensorTimeout is the default grace period for post-run sensors to +// arrive after a pipeline completes. 
If no SensorTimeout is configured in +// PostRunConfig, this value is used. +const defaultSensorTimeout = 2 * time.Hour + +// detectMissingPostRunSensors checks pipelines with PostRun config for missing +// post-run sensor data. If a pipeline completed (COMPLETED trigger + baseline +// exists) but no post-run sensor matching a rule key has been updated since +// completion, and the SensorTimeout grace period has elapsed, a +// POST_RUN_SENSOR_MISSING event is published. +func detectMissingPostRunSensors(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + today := now.Format("2006-01-02") + + for id, cfg := range configs { + if cfg.PostRun == nil || len(cfg.PostRun.Rules) == 0 { + continue + } + + // Dry-run pipelines are observation-only — skip post-run sensor checks. + if cfg.DryRun { + continue + } + + scheduleID := resolveScheduleID(cfg) + + // Only check pipelines with a COMPLETED trigger for today. + tr, err := d.Store.GetTrigger(ctx, id, scheduleID, today) + if err != nil { + d.Logger.Error("trigger lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if tr == nil || tr.Status != types.TriggerStatusCompleted { + continue + } + + // Baseline must exist — it signals that capturePostRunBaseline ran + // at completion time. + baselineKey := "postrun-baseline#" + today + baseline, err := d.Store.GetSensorData(ctx, id, baselineKey) + if err != nil { + d.Logger.Error("baseline lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if baseline == nil { + continue + } + + // Dedup: skip if we already published an alert for this date. 
+ dedupKey := "postrun-check#" + today + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if dedupData != nil { + continue + } + + // Determine the completion timestamp from the latest success job event. + completionTime, err := resolveCompletionTime(ctx, d, id, scheduleID, today) + if err != nil { + d.Logger.Error("completion time resolution failed", + "pipelineId", id, "error", err) + continue + } + if completionTime.IsZero() { + continue + } + + // Parse SensorTimeout from config (default 2h). + timeout := parseSensorTimeout(cfg.PostRun.SensorTimeout) + + // Check if the timeout has elapsed since completion. + if now.Before(completionTime.Add(timeout)) { + continue + } + + // Check if any post-run rule sensor has been updated since completion. + sensors, err := d.Store.GetAllSensors(ctx, id) + if err != nil { + d.Logger.Error("sensor lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + + if hasPostRunSensorUpdate(cfg.PostRun.Rules, sensors, completionTime) { + continue + } + + // No post-run sensor has arrived within the grace period — publish event. 
+ ruleKeys := make([]string, 0, len(cfg.PostRun.Rules))
+ for _, r := range cfg.PostRun.Rules {
+ ruleKeys = append(ruleKeys, r.Key)
+ }
+
+ alertDetail := map[string]interface{}{
+ "source": "watchdog",
+ "sensorTimeout": cfg.PostRun.SensorTimeout,
+ "ruleKeys": strings.Join(ruleKeys, ", "),
+ "actionHint": "post-run sensor data has not arrived within the expected timeout",
+ }
+ if err := publishEvent(ctx, d, string(types.EventPostRunSensorMissing), id, scheduleID, today,
+ fmt.Sprintf("post-run sensor missing for %s on %s", id, today), alertDetail); err != nil {
+ d.Logger.Warn("failed to publish post-run sensor missing event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today)
+ }
+
+ // Write dedup marker to avoid re-alerting on subsequent watchdog runs.
+ if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{
+ "alerted": "true",
+ }); err != nil {
+ d.Logger.Warn("failed to write post-run dedup marker", "error", err, "pipeline", id, "date", today)
+ }
+
+ d.Logger.Info("detected missing post-run sensor",
+ "pipelineId", id,
+ "schedule", scheduleID,
+ "date", today,
+ )
+ }
+ return nil
+}
+
+// resolveCompletionTime extracts the completion timestamp from the latest
+// success job event for the given pipeline/schedule/date. The job event SK
+// has the format JOB#<scheduleID>#<date>#<timestamp> where <timestamp> is
+// milliseconds since epoch.
+func resolveCompletionTime(ctx context.Context, d *Deps, pipelineID, scheduleID, date string) (time.Time, error) {
+ rec, err := d.Store.GetLatestJobEvent(ctx, pipelineID, scheduleID, date)
+ if err != nil {
+ return time.Time{}, fmt.Errorf("get latest job event: %w", err)
+ }
+ if rec == nil {
+ return time.Time{}, nil
+ }
+ if rec.Event != types.JobEventSuccess {
+ return time.Time{}, nil
+ }
+
+ // Extract timestamp from SK: JOB#<scheduleID>#<date>#<timestamp>
+ parts := strings.Split(rec.SK, "#")
+ if len(parts) < 4 {
+ return time.Time{}, fmt.Errorf("unexpected job SK format: %q", rec.SK)
+ }
+ tsMillis, err := strconv.ParseInt(parts[len(parts)-1], 10, 64)
+ if err != nil {
+ return time.Time{}, fmt.Errorf("parse job timestamp %q: %w", parts[len(parts)-1], err)
+ }
+ return time.UnixMilli(tsMillis), nil
+}
+
+// parseSensorTimeout parses a duration string from PostRunConfig.SensorTimeout.
+// Returns defaultSensorTimeout (2h) if the string is empty or unparseable.
+func parseSensorTimeout(s string) time.Duration {
+ if s == "" {
+ return defaultSensorTimeout
+ }
+ d, err := time.ParseDuration(s)
+ if err != nil {
+ return defaultSensorTimeout
+ }
+ return d
+}
+
+// hasPostRunSensorUpdate checks whether any sensor matching a PostRun rule key
+// has an updatedAt timestamp newer than the given completion time.
+func hasPostRunSensorUpdate(rules []types.ValidationRule, sensors map[string]map[string]interface{}, completionTime time.Time) bool {
+ completionMillis := completionTime.UnixMilli()
+
+ for _, rule := range rules {
+ data, ok := sensors[rule.Key]
+ if !ok {
+ continue
+ }
+
+ updatedAt, ok := data["updatedAt"]
+ if !ok {
+ continue
+ }
+
+ var ts int64
+ switch v := updatedAt.(type) {
+ case float64:
+ ts = int64(v)
+ case int64:
+ ts = v
+ case string:
+ ts, _ = strconv.ParseInt(v, 10, 64)
+ default:
+ continue
+ }
+
+ if ts > completionMillis {
+ return true
+ }
+ }
+ return false
+}
+
+// detectRelativeSLABreaches checks pipelines with MaxDuration SLA config for
+// breaches.
This is a defense-in-depth fallback: if the EventBridge Scheduler +// fails to fire the relative SLA breach alert, the watchdog catches it. +// +// Both today and yesterday are checked because stream_router writes the +// first-sensor-arrival key using ResolveExecutionDate(), which for T+1 +// sensor-triggered pipelines produces yesterday's date. Checking both dates +// covers the cross-day boundary. +func detectRelativeSLABreaches(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + datesToCheck := []string{ + now.Format("2006-01-02"), + now.AddDate(0, 0, -1).Format("2006-01-02"), + } + + for id, cfg := range configs { + if cfg.SLA == nil || cfg.SLA.MaxDuration == "" { + continue + } + + // Dry-run pipelines are observation-only — skip relative SLA checks. + if cfg.DryRun { + continue + } + + maxDur, err := time.ParseDuration(cfg.SLA.MaxDuration) + if err != nil { + d.Logger.Warn("invalid maxDuration in SLA config", + "pipelineId", id, "maxDuration", cfg.SLA.MaxDuration, "error", err) + continue + } + + scheduleID := resolveScheduleID(cfg) + + for _, checkDate := range datesToCheck { + checkRelativeSLAForDate(ctx, d, id, cfg, scheduleID, checkDate, maxDur, now) + } + } + return nil +} + +// checkRelativeSLAForDate checks a single date for a relative SLA breach on +// the given pipeline. It looks up the first-sensor-arrival marker, verifies +// the breach window has elapsed, and publishes an alert if needed. 
+func checkRelativeSLAForDate(ctx context.Context, d *Deps, id string, cfg *types.PipelineConfig, scheduleID, checkDate string, maxDur time.Duration, now time.Time) { + arrivalKey := "first-sensor-arrival#" + checkDate + arrivalData, err := d.Store.GetSensorData(ctx, id, arrivalKey) + if err != nil { + d.Logger.Error("first-sensor-arrival lookup failed", + "pipelineId", id, "date", checkDate, "error", err) + return + } + if arrivalData == nil { + return + } + + arrivedAtStr, ok := arrivalData["arrivedAt"].(string) + if !ok || arrivedAtStr == "" { + return + } + arrivedAt, err := time.Parse(time.RFC3339, arrivedAtStr) + if err != nil { + d.Logger.Warn("invalid arrivedAt in first-sensor-arrival", + "pipelineId", id, "arrivedAt", arrivedAtStr, "error", err) + return + } + + // Check if the relative SLA has been breached. + breachAt := arrivedAt.Add(maxDur) + if now.Before(breachAt) { + return + } + + // Skip if pipeline already completed or permanently failed. + tr, err := d.Store.GetTrigger(ctx, id, scheduleID, checkDate) + if err != nil { + d.Logger.Warn("trigger lookup failed in relative SLA check", + "pipelineId", id, "date", checkDate, "error", err) + return + } + if tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal) { + return + } + if isJobTerminal(ctx, d, id, scheduleID, checkDate) { + return + } + + // Check dedup marker to avoid re-alerting on subsequent watchdog runs. + // The dedup key includes checkDate to avoid cross-date collisions. 
+ dedupKey := "relative-sla-breach-check#" + checkDate + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed for relative SLA breach", + "pipelineId", id, "date", checkDate, "error", err) + return + } + if dedupData != nil { + return + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "maxDuration": cfg.SLA.MaxDuration, + "sensorArrivalAt": arrivedAtStr, + "breachAt": breachAt.UTC().Format(time.RFC3339), + "actionHint": "relative SLA breached — pipeline has exceeded maxDuration since first sensor arrival", + } + if err := publishEvent(ctx, d, string(types.EventRelativeSLABreach), id, scheduleID, checkDate, + fmt.Sprintf("relative SLA breach for %s on %s", id, checkDate), alertDetail); err != nil { + d.Logger.Warn("failed to publish relative SLA breach event", + "error", err, "pipeline", id, "date", checkDate) + } + + // Write dedup marker. + if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ + "alerted": "true", + }); err != nil { + d.Logger.Warn("failed to write relative SLA breach dedup marker", + "error", err, "pipeline", id, "date", checkDate) + } + + d.Logger.Info("detected relative SLA breach", + "pipelineId", id, + "schedule", scheduleID, + "date", checkDate, + "sensorArrivalAt", arrivedAtStr, + "breachAt", breachAt.UTC().Format(time.RFC3339), + ) +} diff --git a/internal/lambda/watchdog_sla.go b/internal/lambda/watchdog_sla.go new file mode 100644 index 0000000..7eab64e --- /dev/null +++ b/internal/lambda/watchdog_sla.go @@ -0,0 +1,281 @@ +package lambda + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// scheduleSLAAlerts proactively creates EventBridge Scheduler entries for all +// pipelines with SLA configs. This ensures warnings/breaches fire even when +// pipelines never trigger (data never arrives, sensor fails, etc.). 
+// Idempotency: deterministic scheduler names; ConflictException = already exists. +func scheduleSLAAlerts(ctx context.Context, d *Deps) error { + if d.Scheduler == nil { + return nil + } + + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + + for id, cfg := range configs { + if cfg.SLA == nil { + continue + } + + // Dry-run pipelines are observation-only — skip SLA scheduling. + if cfg.DryRun { + continue + } + + if isExcluded(cfg, now) { + continue + } + + scheduleID := resolveScheduleID(cfg) + date := resolveWatchdogSLADate(cfg, now) + + // Sensor-triggered daily pipelines run T+1: data for today completes + // tomorrow, so the SLA deadline is relative to tomorrow's date. + // Only slaDate is shifted; the original date is kept for schedule + // naming, trigger lookup, and fire-alert payload so cancellation + // stays consistent with the SFN's view of the pipeline. + slaDate := date + if cfg.Schedule.Cron == "" && !strings.HasPrefix(cfg.SLA.Deadline, ":") { + t, err := time.Parse("2006-01-02", date) + if err == nil { + slaDate = t.AddDate(0, 0, 1).Format("2006-01-02") + } + } + + // Skip if pipeline already completed or permanently failed for this date. 
+ tr, err := d.Store.GetTrigger(ctx, id, scheduleID, date) + switch { + case err != nil: + d.Logger.Warn("trigger lookup failed in SLA scheduling", "pipelineId", id, "error", err) + continue + case tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal): + continue + case isJobTerminal(ctx, d, id, scheduleID, date): + continue + } + + calc, err := handleSLACalculate(SLAMonitorInput{ + Mode: "calculate", + PipelineID: id, + ScheduleID: scheduleID, + Date: slaDate, + Deadline: cfg.SLA.Deadline, + ExpectedDuration: cfg.SLA.ExpectedDuration, + Timezone: cfg.SLA.Timezone, + }, now) + if err != nil { + d.Logger.Error("SLA calculate failed", "pipelineId", id, "error", err) + continue + } + + breachAt, _ := time.Parse(time.RFC3339, calc.BreachAt) + if breachAt.IsZero() || breachAt.After(now) { + // SLA breach is in the future — create schedules. + scheduleErr := false + if err := createSLASchedules(ctx, d, id, scheduleID, date, calc, true); err != nil { + d.Logger.Error("create SLA schedule failed", + "pipelineId", id, "error", err) + scheduleErr = true + } + + if !scheduleErr { + d.Logger.Info("proactive SLA schedules ensured", + "pipelineId", id, + "date", date, + "warningAt", calc.WarningAt, + "breachAt", calc.BreachAt, + ) + } + } + } + return nil +} + +// checkTriggerDeadlines evaluates trigger deadlines independently of SLA +// configuration. Pipelines with a Trigger.Deadline but no SLA config are +// checked here. For each pipeline, if the trigger deadline has passed and +// no trigger exists, the sensor trigger window is closed. 
+func checkTriggerDeadlines(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + + for id, cfg := range configs { + if cfg.Schedule.Trigger == nil || cfg.Schedule.Trigger.Deadline == "" { + continue + } + + // Dry-run pipelines are observation-only — skip trigger deadline checks. + if cfg.DryRun { + continue + } + + if isExcluded(cfg, now) { + continue + } + + scheduleID := resolveScheduleID(cfg) + triggerDate := resolveTriggerDeadlineDate(cfg, now) + + triggerRec, err := d.Store.GetTrigger(ctx, id, scheduleID, triggerDate) + if err != nil { + d.Logger.Warn("trigger lookup failed in deadline check", "pipelineId", id, "error", err) + continue + } + if triggerRec != nil { + continue + } + + if isJobTerminal(ctx, d, id, scheduleID, triggerDate) { + continue + } + + closeSensorTriggerWindow(ctx, d, id, scheduleID, triggerDate, cfg, now) + } + return nil +} + +// resolveWatchdogSLADate determines the execution date for SLA scheduling. +// - Hourly pipelines (relative deadline like ":30"): previous hour composite +// date, e.g. "2026-03-05T13" when the clock is 14:xx. +// - Daily pipelines (absolute deadline like "02:00"): today's date, +// so handleSLACalculate rolls the deadline forward to the next occurrence. +func resolveWatchdogSLADate(cfg *types.PipelineConfig, now time.Time) string { + if strings.HasPrefix(cfg.SLA.Deadline, ":") { + prev := now.Add(-time.Hour) + return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) + } + return now.Format("2006-01-02") +} + +// resolveTriggerDeadlineDate determines the execution date for trigger +// deadline evaluation. Uses the trigger deadline format (not SLA deadline) +// to decide between hourly composite date and daily date. 
+func resolveTriggerDeadlineDate(cfg *types.PipelineConfig, now time.Time) string { + if strings.HasPrefix(cfg.Schedule.Trigger.Deadline, ":") { + prev := now.Add(-time.Hour) + return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) + } + return now.Format("2006-01-02") +} + +// resolveTriggerDeadlineTime computes the absolute time when the trigger +// window closes for the given deadline string and execution date. +// +// For relative (hourly) deadlines like ":45" with composite date "2026-03-09T13": +// - Data for hour 13 is processed in hour 14 +// - The deadline resolves to 2026-03-09T14:45:00 in the configured timezone +// +// For absolute (daily) deadlines like "09:00" with date "2026-03-09": +// - The deadline resolves to 2026-03-09T09:00:00 in the configured timezone +// +// Unlike handleSLACalculate, this does NOT roll forward when the time is past. +// Returns zero time on parse errors. +func resolveTriggerDeadlineTime(deadline, date, timezone string) time.Time { + loc := resolveTimezone(timezone) + + if strings.HasPrefix(deadline, ":") { + // Relative (hourly): ":MM" — deadline is in the NEXT hour after the + // composite date's hour, since data for hour H is processed in hour H+1. + minute, err := strconv.Atoi(strings.TrimPrefix(deadline, ":")) + if err != nil { + return time.Time{} + } + // Parse composite date "YYYY-MM-DDThh". + if len(date) < 13 || date[10] != 'T' { + return time.Time{} + } + t, err := time.ParseInLocation("2006-01-02T15", date, loc) + if err != nil { + return time.Time{} + } + // Add 1 hour for the processing window, then set the minute. + return time.Date(t.Year(), t.Month(), t.Day(), t.Hour()+1, minute, 0, 0, loc) + } + + // Absolute (daily): "HH:MM". 
+ parts := strings.SplitN(deadline, ":", 2) + if len(parts) != 2 { + return time.Time{} + } + hour, err := strconv.Atoi(parts[0]) + if err != nil { + return time.Time{} + } + minute, err := strconv.Atoi(parts[1]) + if err != nil { + return time.Time{} + } + t, err := time.ParseInLocation("2006-01-02", date, loc) + if err != nil { + return time.Time{} + } + return time.Date(t.Year(), t.Month(), t.Day(), hour, minute, 0, 0, loc) +} + +// closeSensorTriggerWindow checks whether the trigger deadline has passed for +// a sensor-triggered pipeline that never started. If expired, it writes a +// FAILED_FINAL trigger record (blocking future auto-triggers) and publishes +// a SENSOR_DEADLINE_EXPIRED event. A human can still restart via RERUN_REQUEST. +func closeSensorTriggerWindow(ctx context.Context, d *Deps, pipelineID, scheduleID, date string, cfg *types.PipelineConfig, now time.Time) { + // Compute the absolute trigger deadline time directly — we do NOT use + // handleSLACalculate here because it rolls daily deadlines forward 24h + // when past, which defeats the purpose of checking for expiry. + tz := cfg.Schedule.Timezone + if tz == "" && cfg.SLA != nil { + tz = cfg.SLA.Timezone + } + triggerDeadline := resolveTriggerDeadlineTime(cfg.Schedule.Trigger.Deadline, date, tz) + if triggerDeadline.IsZero() || triggerDeadline.After(now) { + return + } + + // Use conditional put to avoid overwriting a trigger that was acquired + // between the GetTrigger read and this write (TOCTOU protection). + created, err := d.Store.CreateTriggerIfAbsent(ctx, pipelineID, scheduleID, date, types.TriggerStatusFailedFinal) + if err != nil { + d.Logger.Error("failed to write FAILED_FINAL for expired trigger deadline", + "pipelineId", pipelineID, "schedule", scheduleID, "date", date, "error", err) + return + } + if !created { + // Trigger row appeared since the read — pipeline started, don't interfere. 
+ d.Logger.Info("trigger appeared during deadline check, skipping window close", + "pipelineId", pipelineID, "schedule", scheduleID, "date", date) + return + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "triggerDeadline": cfg.Schedule.Trigger.Deadline, + "actionHint": "auto-trigger window closed — use RERUN_REQUEST to restart", + } + if err := publishEvent(ctx, d, string(types.EventSensorDeadlineExpired), pipelineID, scheduleID, date, + fmt.Sprintf("trigger deadline expired for %s/%s/%s", pipelineID, scheduleID, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish sensor deadline expired event", "error", err, "pipeline", pipelineID) + } + + d.Logger.Info("sensor trigger window closed", + "pipelineId", pipelineID, + "schedule", scheduleID, + "date", date, + "triggerDeadline", cfg.Schedule.Trigger.Deadline, + ) +} diff --git a/internal/lambda/watchdog_stale.go b/internal/lambda/watchdog_stale.go new file mode 100644 index 0000000..cebfb57 --- /dev/null +++ b/internal/lambda/watchdog_stale.go @@ -0,0 +1,209 @@ +package lambda + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/dwsmith1983/interlock/internal/validation" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// detectStaleTriggers scans for TRIGGER# rows with status=RUNNING and +// publishes an SFN_TIMEOUT event for any that have exceeded their TTL or the +// staleTriggerThreshold. Stale triggers are moved to FAILED_FINAL status. 
+func detectStaleTriggers(ctx context.Context, d *Deps) error {
+	triggers, err := d.Store.ScanRunningTriggers(ctx)
+	if err != nil {
+		return fmt.Errorf("scan running triggers: %w", err)
+	}
+
+	now := d.now()
+	for _, tr := range triggers {
+		if !isStaleTrigger(tr, now) {
+			continue
+		}
+
+		pipelineID, schedule, date, err := parseTriggerRecord(tr)
+		if err != nil {
+			d.Logger.Warn("skipping unparseable trigger", "pk", tr.PK, "sk", tr.SK, "error", err)
+			continue
+		}
+
+		// Dry-run pipelines should never have TRIGGER# rows, but guard
+		// against stale rows from pre-dry-run migrations or bugs.
+		if cfg, cfgErr := d.ConfigCache.Get(ctx, pipelineID); cfgErr == nil && cfg != nil && cfg.DryRun {
+			continue
+		}
+
+		alertDetail := map[string]interface{}{
+			"source":     "watchdog",
+			"actionHint": "step function exceeded TTL — check SFN execution history",
+		}
+		if tr.TTL > 0 {
+			alertDetail["ttlExpired"] = time.Unix(tr.TTL, 0).UTC().Format(time.RFC3339)
+		}
+		if err := publishEvent(ctx, d, string(types.EventSFNTimeout), pipelineID, schedule, date,
+			fmt.Sprintf("step function timed out for %s/%s/%s", pipelineID, schedule, date), alertDetail); err != nil {
+			d.Logger.Warn("failed to publish SFN timeout event", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date)
+		}
+
+		if err := d.Store.SetTriggerStatus(ctx, pipelineID, schedule, date, types.TriggerStatusFailedFinal); err != nil {
+			d.Logger.Error("failed to set trigger status to FAILED_FINAL",
+				"pipelineId", pipelineID, "schedule", schedule, "date", date, "error", err)
+			continue
+		}
+
+		d.Logger.Info("detected stale trigger",
+			"pipelineId", pipelineID,
+			"schedule", schedule,
+			"date", date,
+		)
+	}
+	return nil
+}
+
+// isStaleTrigger reports whether the trigger's TTL has expired. Triggers with
+// no TTL set (TTL == 0) are never considered stale — see the note in the body.
+func isStaleTrigger(tr types.ControlRecord, now time.Time) bool {
+	if tr.TTL > 0 {
+		return now.Unix() > tr.TTL
+	}
+	// No TTL set. Without a creation timestamp we cannot measure how long the
+	// trigger has existed, so we take the conservative path: a trigger with no
+	// TTL is never reported stale — only an explicitly expired TTL counts.
+	return false
+}
+
+// parseTriggerRecord extracts pipeline ID, schedule, and date from a trigger
+// ControlRecord's PK and SK.
+// PK format: PIPELINE#<pipelineID>
+// SK format: TRIGGER#<schedule>#<date>
+func parseTriggerRecord(tr types.ControlRecord) (pipelineID, schedule, date string, err error) {
+	const pkPrefix = "PIPELINE#"
+	if !strings.HasPrefix(tr.PK, pkPrefix) {
+		return "", "", "", fmt.Errorf("unexpected PK format: %q", tr.PK)
+	}
+	pipelineID = tr.PK[len(pkPrefix):]
+
+	const skPrefix = "TRIGGER#"
+	trimmed := strings.TrimPrefix(tr.SK, skPrefix)
+	if trimmed == tr.SK {
+		return "", "", "", fmt.Errorf("unexpected SK format: %q", tr.SK)
+	}
+	parts := strings.SplitN(trimmed, "#", 2)
+	if len(parts) != 2 {
+		return "", "", "", fmt.Errorf("invalid TRIGGER SK format: %q", tr.SK)
+	}
+	return pipelineID, parts[0], parts[1], nil
+}
+
+// reconcileSensorTriggers re-evaluates trigger conditions for sensor-triggered
+// pipelines. If a sensor meets the trigger condition but no trigger lock exists,
+// the watchdog acquires the lock, starts the SFN, and publishes TRIGGER_RECOVERED.
+// This self-heals missed triggers caused by silent completion-write failures.
+func reconcileSensorTriggers(ctx context.Context, d *Deps) error {
+	configs, err := d.ConfigCache.GetAll(ctx)
+	if err != nil {
+		return fmt.Errorf("load configs: %w", err)
+	}
+
+	now := d.now()
+
+	for id, cfg := range configs {
+		trigger := cfg.Schedule.Trigger
+		if trigger == nil || cfg.Schedule.Cron != "" {
+			continue
+		}
+
+		// Dry-run pipelines are observation-only — skip reconciliation.
+ if cfg.DryRun { + continue + } + + if isExcluded(cfg, now) { + continue + } + + sensors, err := d.Store.GetAllSensors(ctx, id) + if err != nil { + d.Logger.Error("failed to get sensors for reconciliation", + "pipelineId", id, "error", err) + continue + } + + scheduleID := resolveScheduleID(cfg) + + for sensorKey, sensorData := range sensors { + if !strings.HasPrefix(sensorKey, trigger.Key) { + continue + } + + rule := types.ValidationRule{ + Key: trigger.Key, + Check: trigger.Check, + Field: trigger.Field, + Value: trigger.Value, + } + result := validation.EvaluateRule(rule, sensorData, now) + if !result.Passed { + continue + } + + date := ResolveExecutionDate(sensorData, now) + + found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) + if err != nil { + d.Logger.Error("trigger check failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + if found { + continue + } + + // Guard against re-triggering completed pipelines whose trigger + // record was deleted by DynamoDB TTL. Check the joblog for a + // terminal event before acquiring a new lock. 
+ if isJobTerminal(ctx, d, id, scheduleID, date) { + continue + } + + acquired, err := d.Store.AcquireTriggerLock(ctx, id, scheduleID, date, ResolveTriggerLockTTL()) + if err != nil { + d.Logger.Error("lock acquisition failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + if !acquired { + continue + } + + if err := startSFN(ctx, d, cfg, id, scheduleID, date); err != nil { + if relErr := d.Store.ReleaseTriggerLock(ctx, id, scheduleID, date); relErr != nil { + d.Logger.Warn("failed to release lock after SFN start failure during reconciliation", "error", relErr) + } + d.Logger.Error("SFN start failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + + alertDetail := map[string]interface{}{ + "source": "reconciliation", + "actionHint": "watchdog recovered missed sensor trigger", + } + if err := publishEvent(ctx, d, string(types.EventTriggerRecovered), id, scheduleID, date, + fmt.Sprintf("trigger recovered for %s/%s/%s", id, scheduleID, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish trigger recovered event", "error", err, "pipeline", id, "schedule", scheduleID, "date", date) + } + + d.Logger.Info("recovered missed trigger", + "pipelineId", id, + "schedule", scheduleID, + "date", date, + ) + } + } + return nil +} From 2c59bd51560e5f6c82cd3597104df92af84ee884 Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 23:13:58 +0700 Subject: [PATCH 12/17] docs: CHANGELOG for v0.9.3 restructuring --- CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf8b545..f796f43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.9.3] - 2026-03-13 + +### Changed + +- **Shared HTTP client construction (DRY-2)** — Extracted `resolveHTTPClient()` replacing identical 7-line blocks in `ExecuteHTTP` and `ExecuteAirflow`. +- **Shared SLA schedule creation loop (DRY-3)** — Extracted `createSLASchedules()` replacing duplicated warning/breach schedule loops in watchdog and sla-monitor. +- **Split watchdog.go into focused files** — 1079-line monolith split into 5 files by domain: stale triggers, missed schedules, SLA alerting, and post-run monitoring (~200 lines each). + +### Security + +- **Command trigger shell injection eliminated (SEC-3)** — Replaced `sh -c` with direct `exec.CommandContext` + `strings.Fields` argument splitting. No shell interpretation of pipes, redirects, or variable expansion. + ## [0.9.2] - 2026-03-13 ### Fixed From 6f8fa52bf9465ec0d666a9fc30b9be26e11629bf Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 23:35:06 +0700 Subject: [PATCH 13/17] feat: add dry-run rerun/retry event type constants Add 4 new EventDetailType constants for dry-run rerun/retry observability: DRY_RUN_WOULD_RERUN, DRY_RUN_RERUN_REJECTED, DRY_RUN_WOULD_RETRY, DRY_RUN_RETRY_EXHAUSTED. --- pkg/types/events.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/types/events.go b/pkg/types/events.go index dd0c2cb..f712ea9 100644 --- a/pkg/types/events.go +++ b/pkg/types/events.go @@ -42,6 +42,10 @@ const ( EventDryRunSLAProjection EventDetailType = "DRY_RUN_SLA_PROJECTION" EventDryRunDrift EventDetailType = "DRY_RUN_DRIFT" EventDryRunCompleted EventDetailType = "DRY_RUN_COMPLETED" + EventDryRunWouldRerun EventDetailType = "DRY_RUN_WOULD_RERUN" + EventDryRunRerunRejected EventDetailType = "DRY_RUN_RERUN_REJECTED" + EventDryRunWouldRetry EventDetailType = "DRY_RUN_WOULD_RETRY" + EventDryRunRetryExhausted EventDetailType = "DRY_RUN_RETRY_EXHAUSTED" ) // EventSource is the EventBridge source for all interlock events. 
From c323cf45c3673705cc3df016ed752c0c23e146a3 Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 23:35:13 +0700 Subject: [PATCH 14/17] feat: dry-run rerun/retry observability with full evaluation Replace the 5-line early returns in handleRerunRequest and handleJobFailure with self-contained evaluation blocks that run all checks (calendar exclusion, rerun limits, circuit breaker, retry budget) and publish observation events instead of executing side effects. New functions: handleDryRunRerunRequest, handleDryRunJobFailure. Tests: 2 updated + 6 new covering all decision branches (would-rerun, calendar-rejected, limit-exceeded, circuit-breaker-reject, no-job- history, would-retry, retry-exhausted, calendar-excluded). --- internal/lambda/rerun.go | 179 +++++++++++++++++- internal/lambda/stream_router_test.go | 250 +++++++++++++++++++++++++- 2 files changed, 415 insertions(+), 14 deletions(-) diff --git a/internal/lambda/rerun.go b/internal/lambda/rerun.go index e44635c..cb94e76 100644 --- a/internal/lambda/rerun.go +++ b/internal/lambda/rerun.go @@ -34,11 +34,10 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even return nil } - // Dry-run pipelines never start real executions. + // Dry-run pipelines evaluate all checks but publish observation events + // instead of executing side effects. if cfg.DryRun { - d.Logger.Info("dry-run: skipping rerun request", - "pipelineId", pipelineID, "schedule", schedule, "date", date) - return nil + return handleDryRunRerunRequest(ctx, d, cfg, pipelineID, schedule, date, record) } // --- Calendar exclusion check (execution date) --- @@ -210,11 +209,10 @@ func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, return nil } - // Dry-run pipelines never start real executions. + // Dry-run pipelines evaluate retry logic but publish observation events + // instead of executing side effects. 
if cfg.DryRun { - d.Logger.Info("dry-run: skipping job failure rerun", - "pipelineId", pipelineID, "schedule", schedule, "date", date) - return nil + return handleDryRunJobFailure(ctx, d, cfg, pipelineID, schedule, date) } maxRetries := cfg.Job.MaxRetries @@ -305,6 +303,171 @@ func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, return nil } +// handleDryRunRerunRequest evaluates all rerun checks (calendar, limit, +// circuit breaker) and publishes observation events instead of executing +// side effects. Mirrors the production handleRerunRequest logic. +func handleDryRunRerunRequest(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, schedule, date string, record events.DynamoDBEventRecord) error { + // Calendar exclusion check. + if isExcludedDate(cfg, date) { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: execution date %s excluded by calendar", pipelineID, date), + map[string]interface{}{ + "reason": "excluded by calendar", + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + + // Extract reason from stream record NewImage. Default to "manual". + reason := "manual" + if img := record.Change.NewImage; img != nil { + if r, ok := img["reason"]; ok && r.DataType() == events.DataTypeString { + if v := r.String(); v != "" { + reason = v + } + } + } + + // Rerun limit check. 
+ var budget int + var sources []string + switch reason { + case "data-drift", "late-data": + budget = types.IntOrDefault(cfg.Job.MaxDriftReruns, 1) + sources = []string{"data-drift", "late-data"} + default: + budget = types.IntOrDefault(cfg.Job.MaxManualReruns, 1) + sources = []string{reason} + } + + count, err := d.Store.CountRerunsBySource(ctx, pipelineID, schedule, date, sources) + if err != nil { + return fmt.Errorf("dry-run: count reruns by source for %q: %w", pipelineID, err) + } + + if count >= budget { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: limit exceeded (%d/%d)", pipelineID, count, budget), + map[string]interface{}{ + "reason": "limit exceeded", + "rerunCount": count, + "budget": budget, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + + // Circuit breaker (sensor freshness). 
+ cbStatus := "passed" + job, err := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if err != nil { + return fmt.Errorf("dry-run: get latest job event for %q/%s/%s: %w", pipelineID, schedule, date, err) + } + + if job == nil { + cbStatus = "skipped (no job history)" + } else if job.Event == types.JobEventSuccess { + fresh, freshErr := checkSensorFreshness(ctx, d, pipelineID, job.SK) + if freshErr != nil { + return fmt.Errorf("dry-run: check sensor freshness for %q: %w", pipelineID, freshErr) + } + if !fresh { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: previous run succeeded and no sensor data has changed", pipelineID), + map[string]interface{}{ + "reason": "circuit breaker", + "circuitBreaker": "rejected", + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + } + + // All checks pass — publish would-rerun event. + if pubErr := publishEvent(ctx, d, string(types.EventDryRunWouldRerun), pipelineID, schedule, date, + fmt.Sprintf("dry-run: would rerun %s (reason: %s)", pipelineID, reason), + map[string]interface{}{ + "reason": reason, + "circuitBreaker": cbStatus, + "rerunCount": count, + "budget": budget, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunWouldRerun, "error", pubErr) + } + + d.Logger.Info("dry-run: would rerun", + "pipelineId", pipelineID, "schedule", schedule, "date", date, "reason", reason) + return nil +} + +// handleDryRunJobFailure evaluates retry logic for a dry-run pipeline and +// publishes observation events instead of executing side effects. +func handleDryRunJobFailure(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, schedule, date string) error { + maxRetries := cfg.Job.MaxRetries + + // Read latest job event for failure category (read-only). 
+ latestJob, jobErr := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if jobErr != nil { + d.Logger.WarnContext(ctx, "dry-run: could not read latest job event for failure category", + "pipelineId", pipelineID, "error", jobErr) + } + if latestJob != nil { + if types.FailureCategory(latestJob.Category) == types.FailurePermanent { + maxRetries = types.IntOrDefault(cfg.Job.MaxCodeRetries, 1) + } + // TRANSIENT, TIMEOUT, or empty → use cfg.Job.MaxRetries (already set). + } + + rerunCount, err := d.Store.CountRerunsBySource(ctx, pipelineID, schedule, date, []string{"job-fail-retry"}) + if err != nil { + return fmt.Errorf("dry-run: count reruns for %q/%s/%s: %w", pipelineID, schedule, date, err) + } + + if rerunCount >= maxRetries { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, + fmt.Sprintf("dry-run: retry limit reached (%d/%d) for %s", rerunCount, maxRetries, pipelineID), + map[string]interface{}{ + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRetryExhausted, "error", pubErr) + } + return nil + } + + // Calendar exclusion check. + if isExcludedDate(cfg, date) { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, + fmt.Sprintf("dry-run: retry skipped for %s: execution date %s excluded by calendar", pipelineID, date), + map[string]interface{}{ + "reason": "excluded by calendar", + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRetryExhausted, "error", pubErr) + } + return nil + } + + // Under budget — publish would-retry event. 
+ if pubErr := publishEvent(ctx, d, string(types.EventDryRunWouldRetry), pipelineID, schedule, date, + fmt.Sprintf("dry-run: would retry %s (%d/%d)", pipelineID, rerunCount, maxRetries), + map[string]interface{}{ + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunWouldRetry, "error", pubErr) + } + + d.Logger.Info("dry-run: would retry", + "pipelineId", pipelineID, "schedule", schedule, "date", date, + "retries", rerunCount, "maxRetries", maxRetries) + return nil +} + // checkSensorFreshness determines whether any sensor data has been updated // after the given job completed. The job timestamp is extracted from the job // SK (format: JOB#schedule#date#). Returns true if data has diff --git a/internal/lambda/stream_router_test.go b/internal/lambda/stream_router_test.go index f3f7e7d..d1b8508 100644 --- a/internal/lambda/stream_router_test.go +++ b/internal/lambda/stream_router_test.go @@ -4418,7 +4418,7 @@ func TestDryRunPostRunSensor_NoMarker(t *testing.T) { func TestRerun_DryRun_SkipsExecution(t *testing.T) { mock := newMockDDB() - d, sfnMock, _ := testDeps(mock) + d, sfnMock, ebMock := testDeps(mock) cfg := testDryRunConfig() seedConfig(mock, cfg) @@ -4435,18 +4435,180 @@ func TestRerun_DryRun_SkipsExecution(t *testing.T) { // Dry-run pipeline must NOT start an SFN execution. sfnMock.mu.Lock() - defer sfnMock.mu.Unlock() assert.Empty(t, sfnMock.executions, "dry-run pipeline must not start SFN on rerun request") + sfnMock.mu.Unlock() - // No rerun records written (guard fires before any store side effects). + // No rerun records written. count, countErr := d.Store.CountRerunsBySource(context.Background(), "gold-revenue", "stream", "2026-03-01", []string{"manual"}) require.NoError(t, countErr) assert.Zero(t, count, "dry-run must not write rerun records") + + // Must publish DRY_RUN_WOULD_RERUN with circuitBreaker and budget fields. 
+ evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunWouldRerun), "expected DRY_RUN_WOULD_RERUN event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunWouldRerun) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Contains(t, detail.Detail, "circuitBreaker") + assert.Contains(t, detail.Detail, "budget") + } + } + } +} + +func TestRerun_DryRun_CalendarExcluded(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Schedule.Exclude = &types.ExclusionConfig{Dates: []string{"2026-03-01"}} + seedConfig(mock, cfg) + + record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRerunRejected), "expected DRY_RUN_RERUN_REJECTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRerunRejected) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "excluded by calendar", detail.Detail["reason"]) + } + } + } +} + +func TestRerun_DryRun_LimitExceeded(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Job.MaxManualReruns = intPtr(0) + seedConfig(mock, cfg) + + record := makeDefaultRerunRequestRecord() + event := 
lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRerunRejected), "expected DRY_RUN_RERUN_REJECTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRerunRejected) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "limit exceeded", detail.Detail["reason"]) + } + } + } +} + +func TestRerun_DryRun_CircuitBreakerReject(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + seedConfig(mock, cfg) + + // Seed a successful job with a millis-epoch timestamp. + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) + + // Seed sensors with timestamps OLDER than the job — data unchanged. 
+ seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ + "status": "ready", + "updatedAt": float64(1000000000000), + }) + + record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRerunRejected), "expected DRY_RUN_RERUN_REJECTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRerunRejected) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "circuit breaker", detail.Detail["reason"]) + assert.Equal(t, "rejected", detail.Detail["circuitBreaker"]) + } + } + } +} + +func TestRerun_DryRun_NoJobHistory(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + seedConfig(mock, cfg) + + // No JOB# events seeded — circuit breaker should report "skipped". 
+ record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunWouldRerun), "expected DRY_RUN_WOULD_RERUN event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunWouldRerun) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "skipped (no job history)", detail.Detail["circuitBreaker"]) + } + } + } } func TestJobFailure_DryRun_SkipsRerun(t *testing.T) { mock := newMockDDB() - d, sfnMock, _ := testDeps(mock) + d, sfnMock, ebMock := testDeps(mock) cfg := testDryRunConfig() cfg.Job.MaxRetries = 2 @@ -4462,13 +4624,89 @@ func TestJobFailure_DryRun_SkipsRerun(t *testing.T) { // Dry-run pipeline must NOT start an SFN execution. sfnMock.mu.Lock() - defer sfnMock.mu.Unlock() assert.Empty(t, sfnMock.executions, "dry-run pipeline must not start SFN on job failure") + sfnMock.mu.Unlock() - // No rerun records written (guard fires before any store side effects). + // No rerun records written. count, countErr := d.Store.CountRerunsBySource(context.Background(), "gold-revenue", "stream", "2026-03-01", []string{"job-fail-retry"}) require.NoError(t, countErr) assert.Zero(t, count, "dry-run must not write rerun records on job failure") + + // Must publish DRY_RUN_WOULD_RETRY with retries and maxRetries fields. 
+ evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunWouldRetry), "expected DRY_RUN_WOULD_RETRY event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunWouldRetry) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Contains(t, detail.Detail, "retries") + assert.Contains(t, detail.Detail, "maxRetries") + } + } + } +} + +func TestJobFailure_DryRun_RetryExhausted(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Job.MaxRetries = 0 + seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") + + record := makeJobRecord("gold-revenue", types.JobEventFail) + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRetryExhausted), "expected DRY_RUN_RETRY_EXHAUSTED event") +} + +func TestJobFailure_DryRun_CalendarExcluded(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Job.MaxRetries = 2 + cfg.Schedule.Exclude = &types.ExclusionConfig{Dates: []string{"2026-03-01"}} + seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") + + record := makeJobRecord("gold-revenue", types.JobEventFail) + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start 
SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRetryExhausted), "expected DRY_RUN_RETRY_EXHAUSTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRetryExhausted) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "excluded by calendar", detail.Detail["reason"]) + } + } + } } // --------------------------------------------------------------------------- From 2e37a8db91eb255fa065254b91cd778725220716 Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Fri, 13 Mar 2026 23:35:19 +0700 Subject: [PATCH 15/17] chore: route dry-run rerun events to alert rule Add DRY_RUN_WOULD_RERUN, DRY_RUN_RERUN_REJECTED, DRY_RUN_WOULD_RETRY, and DRY_RUN_RETRY_EXHAUSTED to the EventBridge alert-events pattern. --- deploy/terraform/eventbridge.tf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deploy/terraform/eventbridge.tf b/deploy/terraform/eventbridge.tf index ba817b4..5fb8e79 100644 --- a/deploy/terraform/eventbridge.tf +++ b/deploy/terraform/eventbridge.tf @@ -119,6 +119,10 @@ resource "aws_cloudwatch_event_rule" "alert_events" { "DRY_RUN_SLA_PROJECTION", "DRY_RUN_DRIFT", "DRY_RUN_COMPLETED", + "DRY_RUN_WOULD_RERUN", + "DRY_RUN_RERUN_REJECTED", + "DRY_RUN_WOULD_RETRY", + "DRY_RUN_RETRY_EXHAUSTED", ] }) } From 10c01b209ba27d6873bb8cea3cc416c575c652a4 Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Sat, 14 Mar 2026 10:36:34 +0700 Subject: [PATCH 16/17] fix: detect EventBridge PutEvents partial failures (GO-C1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit publishEvent now checks FailedEntryCount on PutEventsOutput. 
AWS EventBridge can return FailedEntryCount > 0 with error == nil for partial failures — these were previously silently discarded. --- internal/lambda/dynstream.go | 12 +++++++- internal/lambda/dynstream_test.go | 49 +++++++++++++++++++++++++++++++ internal/lambda/mock_test.go | 18 ++++++++++-- 3 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 internal/lambda/dynstream_test.go diff --git a/internal/lambda/dynstream.go b/internal/lambda/dynstream.go index 54ce702..5d45ae1 100644 --- a/internal/lambda/dynstream.go +++ b/internal/lambda/dynstream.go @@ -161,7 +161,7 @@ func publishEvent(ctx context.Context, d *Deps, eventType, pipelineID, schedule, source := types.EventSource detailStr := string(detailJSON) - _, err = d.EventBridge.PutEvents(ctx, &eventbridge.PutEventsInput{ + out, err := d.EventBridge.PutEvents(ctx, &eventbridge.PutEventsInput{ Entries: []ebTypes.PutEventsRequestEntry{ { Source: &source, @@ -174,6 +174,16 @@ func publishEvent(ctx context.Context, d *Deps, eventType, pipelineID, schedule, if err != nil { return fmt.Errorf("publish %s event: %w", eventType, err) } + if out.FailedEntryCount > 0 { + code, msg := "", "" + if len(out.Entries) > 0 && out.Entries[0].ErrorCode != nil { + code = *out.Entries[0].ErrorCode + if out.Entries[0].ErrorMessage != nil { + msg = *out.Entries[0].ErrorMessage + } + } + return fmt.Errorf("publish %s event: partial failure (code=%s, message=%s)", eventType, code, msg) + } return nil } diff --git a/internal/lambda/dynstream_test.go b/internal/lambda/dynstream_test.go new file mode 100644 index 0000000..73ebeaa --- /dev/null +++ b/internal/lambda/dynstream_test.go @@ -0,0 +1,49 @@ +package lambda + +import ( + "context" + "strings" + "testing" + "time" + + "github.com/aws/aws-sdk-go-v2/service/eventbridge" + ebTypes "github.com/aws/aws-sdk-go-v2/service/eventbridge/types" +) + +// testEventBridge is a local EventBridgeAPI implementation for white-box tests. 
+type testEventBridge struct { + failedEntryCount int32 +} + +func (t *testEventBridge) PutEvents(_ context.Context, _ *eventbridge.PutEventsInput, _ ...func(*eventbridge.Options)) (*eventbridge.PutEventsOutput, error) { + if t.failedEntryCount > 0 { + errCode := "InternalError" + errMsg := "simulated partial failure" + return &eventbridge.PutEventsOutput{ + FailedEntryCount: t.failedEntryCount, + Entries: []ebTypes.PutEventsResultEntry{ + {ErrorCode: &errCode, ErrorMessage: &errMsg}, + }, + }, nil + } + return &eventbridge.PutEventsOutput{}, nil +} + +func TestPublishEvent_PartialFailure(t *testing.T) { + d := &Deps{ + EventBridge: &testEventBridge{failedEntryCount: 1}, + EventBusName: "test-bus", + NowFunc: func() time.Time { return time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) }, + } + + err := publishEvent(context.Background(), d, "test.event", "pipeline-1", "cron", "2025-01-01", "test message") + if err == nil { + t.Fatal("expected error for partial failure, got nil") + } + if !strings.Contains(err.Error(), "partial failure") { + t.Errorf("expected error to contain 'partial failure', got: %s", err.Error()) + } + if !strings.Contains(err.Error(), "InternalError") { + t.Errorf("expected error to contain 'InternalError', got: %s", err.Error()) + } +} diff --git a/internal/lambda/mock_test.go b/internal/lambda/mock_test.go index 29f4c5a..12e2daa 100644 --- a/internal/lambda/mock_test.go +++ b/internal/lambda/mock_test.go @@ -9,6 +9,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/dynamodb" ddbtypes "github.com/aws/aws-sdk-go-v2/service/dynamodb/types" "github.com/aws/aws-sdk-go-v2/service/eventbridge" + ebTypes "github.com/aws/aws-sdk-go-v2/service/eventbridge/types" "github.com/aws/aws-sdk-go-v2/service/scheduler" "github.com/aws/aws-sdk-go-v2/service/sfn" @@ -40,9 +41,10 @@ func (m *mockSFN) StartExecution(_ context.Context, input *sfn.StartExecutionInp // --------------------------------------------------------------------------- type mockEventBridge struct { 
- mu sync.Mutex - events []*eventbridge.PutEventsInput - err error + mu sync.Mutex + events []*eventbridge.PutEventsInput + err error + failedEntryCount int32 } func (m *mockEventBridge) PutEvents(_ context.Context, input *eventbridge.PutEventsInput, _ ...func(*eventbridge.Options)) (*eventbridge.PutEventsOutput, error) { @@ -52,6 +54,16 @@ func (m *mockEventBridge) PutEvents(_ context.Context, input *eventbridge.PutEve return nil, m.err } m.events = append(m.events, input) + if m.failedEntryCount > 0 { + errCode := "InternalError" + errMsg := "simulated partial failure" + return &eventbridge.PutEventsOutput{ + FailedEntryCount: m.failedEntryCount, + Entries: []ebTypes.PutEventsResultEntry{ + {ErrorCode: &errCode, ErrorMessage: &errMsg}, + }, + }, nil + } return &eventbridge.PutEventsOutput{}, nil } From 8c5a1504660a2c176c421807b9f5f4a5d34714bb Mon Sep 17 00:00:00 2001 From: Dustin Smith Date: Sat, 14 Mar 2026 10:36:40 +0700 Subject: [PATCH 17/17] security: add SSRF protection to trigger HTTP clients (SEC-C2) Custom http.Transport with dial-time IP validation rejects connections to private, loopback, link-local, and multicast addresses. Catches all bypass vectors including DNS rebinding and HTTP redirects. Protects HTTP, Airflow, and Databricks triggers against targeting internal endpoints (AWS IMDS, ECS metadata, VPC services). 
--- internal/trigger/airflow_test.go | 56 +++++++++++++++++++++++ internal/trigger/runner_test.go | 8 ++++ internal/trigger/ssrf.go | 49 ++++++++++++++++++++ internal/trigger/ssrf_test.go | 77 ++++++++++++++++++++++++++++++++ internal/trigger/trigger.go | 15 +++++-- internal/trigger/trigger_test.go | 24 ++++++++++ 6 files changed, 226 insertions(+), 3 deletions(-) create mode 100644 internal/trigger/ssrf.go create mode 100644 internal/trigger/ssrf_test.go diff --git a/internal/trigger/airflow_test.go b/internal/trigger/airflow_test.go index 35b9cce..69d7651 100644 --- a/internal/trigger/airflow_test.go +++ b/internal/trigger/airflow_test.go @@ -26,6 +26,10 @@ func TestExecuteAirflow_Success(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -49,6 +53,10 @@ func TestExecuteAirflow_AuthHeader(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "test_dag", @@ -67,6 +75,10 @@ func TestExecuteAirflow_ServerError(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -91,6 +103,10 @@ func TestCheckAirflowStatus_Success(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-123", nil) require.NoError(t, err) assert.Equal(t, "success", state) @@ -105,6 +121,10 @@ func TestCheckAirflowStatus_Running(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + 
defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-123", nil) require.NoError(t, err) assert.Equal(t, "running", state) @@ -119,6 +139,10 @@ func TestCheckAirflowStatus_Failed(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-123", nil) require.NoError(t, err) assert.Equal(t, "failed", state) @@ -151,6 +175,10 @@ func TestExecuteAirflow_WithBody(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -183,6 +211,10 @@ func TestExecuteAirflow_MissingDagRunIDInResponse(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -201,6 +233,10 @@ func TestExecuteAirflow_CustomTimeout(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -218,6 +254,10 @@ func TestCheckAirflowStatus_ServerError(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + _, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", nil) assert.Error(t, err) assert.Contains(t, err.Error(), "status 500") @@ -232,6 +272,10 @@ func TestCheckAirflowStatus_MissingStateField(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient 
= srv.Client() + defer func() { defaultHTTPClient = origClient }() + _, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", nil) assert.Error(t, err) assert.Contains(t, err.Error(), "response missing state field") @@ -255,6 +299,10 @@ func TestExecuteAirflow_EnvExpansionRestricted(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "test_dag", @@ -287,6 +335,10 @@ func TestCheckAirflowStatus_EnvExpansionRestricted(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + headers := map[string]string{"Authorization": "Bearer ${INTERLOCK_TEST_VAR}/${SECRET_VAR}"} state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", headers) require.NoError(t, err) @@ -306,6 +358,10 @@ func TestCheckAirflowStatus_WithHeaders(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", map[string]string{ "Authorization": "Bearer test-token", }) diff --git a/internal/trigger/runner_test.go b/internal/trigger/runner_test.go index 34b12a4..97c817d 100644 --- a/internal/trigger/runner_test.go +++ b/internal/trigger/runner_test.go @@ -144,6 +144,10 @@ func TestRunner_Execute_HTTPType(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + r := NewRunner() _, err := r.Execute(context.Background(), &types.TriggerConfig{ Type: types.TriggerHTTP, @@ -170,6 +174,10 @@ func TestRunner_Execute_AirflowType(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = 
srv.Client() + defer func() { defaultHTTPClient = origClient }() + r := NewRunner() meta, err := r.Execute(context.Background(), &types.TriggerConfig{ Type: types.TriggerAirflow, diff --git a/internal/trigger/ssrf.go b/internal/trigger/ssrf.go new file mode 100644 index 0000000..164ccb8 --- /dev/null +++ b/internal/trigger/ssrf.go @@ -0,0 +1,49 @@ +package trigger + +import ( + "fmt" + "net" + "net/http" + "syscall" + "time" +) + +// newSSRFSafeTransport clones http.DefaultTransport (preserving HTTP/2, +// keep-alive, idle-conn settings) and replaces the dialer with one whose +// Control hook rejects connections to private/loopback/link-local IPs. +func newSSRFSafeTransport() *http.Transport { + base := http.DefaultTransport.(*http.Transport).Clone() + base.DialContext = (&net.Dialer{ + Timeout: base.TLSHandshakeTimeout, // match the original dialer timeout + KeepAlive: 30 * time.Second, + Control: ssrfDialControl, + }).DialContext + return base +} + +func ssrfDialControl(network, address string, _ syscall.RawConn) error { + host, _, err := net.SplitHostPort(address) + if err != nil { + return fmt.Errorf("ssrf: invalid address %q: %w", address, err) + } + ip := net.ParseIP(host) + if ip == nil { + return fmt.Errorf("ssrf: could not parse IP %q", host) + } + if isBlockedIP(ip) { + return fmt.Errorf("ssrf: connection to %s blocked (private/loopback/link-local)", ip) + } + return nil +} + +func isBlockedIP(ip net.IP) bool { + return ip.IsLoopback() || + ip.IsPrivate() || + ip.IsLinkLocalUnicast() || + ip.IsMulticast() || + ip.IsUnspecified() || + // Explicit IMDS/ECS checks — already covered by IsLinkLocalUnicast + // but kept for visibility since these are the primary SSRF targets. 
+ ip.Equal(net.ParseIP("169.254.169.254")) || + ip.Equal(net.ParseIP("169.254.170.2")) +} diff --git a/internal/trigger/ssrf_test.go b/internal/trigger/ssrf_test.go new file mode 100644 index 0000000..3d420b2 --- /dev/null +++ b/internal/trigger/ssrf_test.go @@ -0,0 +1,77 @@ +package trigger + +import ( + "net" + "testing" +) + +func TestIsBlockedIP(t *testing.T) { + blocked := []struct { + name string + ip string + }{ + {"loopback_v4", "127.0.0.1"}, + {"private_10", "10.0.0.1"}, + {"private_172", "172.16.0.1"}, + {"private_192", "192.168.1.1"}, + {"aws_imds", "169.254.169.254"}, + {"ecs_metadata", "169.254.170.2"}, + {"loopback_v6", "::1"}, + {"link_local_v6", "fe80::1"}, + {"unspecified", "0.0.0.0"}, + } + for _, tc := range blocked { + t.Run(tc.name, func(t *testing.T) { + ip := net.ParseIP(tc.ip) + if ip == nil { + t.Fatalf("failed to parse IP %s", tc.ip) + } + if !isBlockedIP(ip) { + t.Errorf("expected %s to be blocked", tc.ip) + } + }) + } + + allowed := []struct { + name string + ip string + }{ + {"google_dns", "8.8.8.8"}, + {"aws_public", "52.94.76.1"}, + {"google_v6", "2607:f8b0:4004:800::200e"}, + } + for _, tc := range allowed { + t.Run(tc.name, func(t *testing.T) { + ip := net.ParseIP(tc.ip) + if ip == nil { + t.Fatalf("failed to parse IP %s", tc.ip) + } + if isBlockedIP(ip) { + t.Errorf("expected %s to be allowed", tc.ip) + } + }) + } +} + +func TestSSRFDialControl(t *testing.T) { + t.Run("blocks_loopback", func(t *testing.T) { + err := ssrfDialControl("tcp", "127.0.0.1:80", nil) + if err == nil { + t.Error("expected error for loopback address") + } + }) + + t.Run("allows_public", func(t *testing.T) { + err := ssrfDialControl("tcp", "8.8.8.8:443", nil) + if err != nil { + t.Errorf("expected no error for public address, got: %v", err) + } + }) + + t.Run("blocks_imds", func(t *testing.T) { + err := ssrfDialControl("tcp", "169.254.169.254:80", nil) + if err == nil { + t.Error("expected error for IMDS address") + } + }) +} diff --git 
a/internal/trigger/trigger.go b/internal/trigger/trigger.go index 6006ba6..cefa6fe 100644 --- a/internal/trigger/trigger.go +++ b/internal/trigger/trigger.go @@ -45,16 +45,25 @@ const maxErrorBodyBytes = 512 const defaultTriggerTimeout = 30 * time.Second // defaultHTTPClient is shared across HTTP and Airflow triggers to reuse connections. -var defaultHTTPClient = &http.Client{Timeout: defaultTriggerTimeout} +// It uses an SSRF-safe transport that rejects private, loopback, and link-local addresses. +var defaultHTTPClient = &http.Client{ + Timeout: defaultTriggerTimeout, + Transport: newSSRFSafeTransport(), +} // resolveHTTPClient returns a client with the given timeout in seconds. If // timeoutSec is zero or matches the default, the shared defaultHTTPClient is -// returned to reuse connections. +// returned to reuse connections. When a custom timeout is required, the returned +// client inherits the transport from defaultHTTPClient so that transport-level +// settings (including SSRF protection and test overrides) are preserved. 
func resolveHTTPClient(timeoutSec int) *http.Client { if timeoutSec > 0 { timeout := time.Duration(timeoutSec) * time.Second if timeout != defaultTriggerTimeout { - return &http.Client{Timeout: timeout} + return &http.Client{ + Timeout: timeout, + Transport: defaultHTTPClient.Transport, + } } } return defaultHTTPClient diff --git a/internal/trigger/trigger_test.go b/internal/trigger/trigger_test.go index cc45993..84efb65 100644 --- a/internal/trigger/trigger_test.go +++ b/internal/trigger/trigger_test.go @@ -38,6 +38,10 @@ func TestExecuteHTTP_Success(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "POST", URL: srv.URL}, @@ -82,6 +86,10 @@ func TestExecuteHTTP_ErrorBodyTruncated(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -110,6 +118,10 @@ func TestExecuteHTTP_ErrorBodySanitized(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -154,6 +166,10 @@ func TestExecuteHTTP_EnvExpansionRestricted(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{ @@ -229,6 +245,10 @@ func TestExecuteHTTP_Returns_TriggerError_On4xx(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + 
defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -249,6 +269,10 @@ func TestExecuteHTTP_Returns_TriggerError_On5xx(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "GET", URL: srv.URL},