diff --git a/CHANGELOG.md b/CHANGELOG.md index 26008b7..f796f43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,40 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.9.3] - 2026-03-13 + +### Changed + +- **Shared HTTP client construction (DRY-2)** — Extracted `resolveHTTPClient()` replacing identical 7-line blocks in `ExecuteHTTP` and `ExecuteAirflow`. +- **Shared SLA schedule creation loop (DRY-3)** — Extracted `createSLASchedules()` replacing duplicated warning/breach schedule loops in watchdog and sla-monitor. +- **Split watchdog.go into focused files** — 1079-line monolith split into 5 files by domain: stale triggers, missed schedules, SLA alerting, and post-run monitoring (~200 lines each). + +### Security + +- **Command trigger shell injection eliminated (SEC-3)** — Replaced `sh -c` with direct `exec.CommandContext` + `strings.Fields` argument splitting. No shell interpretation of pipes, redirects, or variable expansion. + +## [0.9.2] - 2026-03-13 + +### Fixed + +- **Drift detection silently skipped zero values (BUG-1)** — `ExtractFloat` returned 0 for both missing keys and actual zero values, causing the `prevCount > 0` guard to silently skip legitimate transitions like 5000→0 or 0→5000. New `ExtractFloatOk` distinguishes absent from zero. Shared `DetectDrift` function consolidates 3 duplicated drift comparison sites. +- **RemapPerPeriodSensors map mutation during range (BUG-2)** — Adding keys during `range` iteration over a Go map is nondeterministic per the spec. Staging map now collects additions, merged after iteration. +- **Orphaned rerun burns retry budget (BUG-3)** — `handleRerunRequest` wrote the rerun record before acquiring the trigger lock. If lock acquisition failed, the rerun record was left orphaned and permanently consumed retry budget. 
The fix reorders the flow to lock-first, then write. +- **Stream router discarded partial batch failures (BUG-4)** — `HandleStreamEvent` returned a single error, causing Lambda to retry the entire batch. Now returns `DynamoDBEventResponse` with per-record `BatchItemFailures` for partial retry via `ReportBatchItemFailures`. +- **SLA_MET published when pipeline never ran (BUG-5)** — `handleSLACancel` published SLA_MET regardless of whether a trigger existed. Now checks for trigger existence first. +- **Trigger deadline used SLA timezone instead of schedule timezone (BUG-6)** — `closeSensorTriggerWindow` read timezone from `cfg.SLA.Timezone` instead of `cfg.Schedule.Timezone`. Falls back to SLA timezone if schedule timezone is not set. +- **Validation mode case-sensitive (BUG-8)** — `EvaluateRules` matched mode with `switch mode` so "any" fell through to the default ALL branch. Now uses `strings.ToUpper(mode)`. +- **Epoch timestamp unit mismatch in rerun freshness (BUG-9)** — `checkSensorFreshness` compared raw epoch values without normalizing units. Timestamps below 1e12 (seconds) are now converted to milliseconds. +- **Post-run baseline field collision (BUG-10)** — Baseline was stored as a flat map, so two rules with the same field name overwrote each other. Now namespaced by rule key. Clean break: existing flat baselines self-heal on next pipeline completion. +- **publishEvent errors silently discarded in SLA reconcile (CQ-5)** — Replaced `_ = publishEvent(...)` with error-logged calls. + +### Security + +- **lambda_trigger_arns default changed to [] with precondition (SEC-1)** — Wildcard default removed; explicit ARN list required when triggers are enabled. +- **Slack plaintext token deprecation warning (SEC-2)** — Terraform `check` block warns at plan time when plaintext token is used without Secrets Manager. 
+- **Trigger IAM policy scoping (SEC-4)** — New variables `glue_job_arns`, `emr_cluster_arns`, `emr_serverless_app_arns`, `sfn_trigger_arns` (all default `[]`) with preconditions requiring non-empty values when the corresponding trigger is enabled. +- **EventBridge bus resource policy (SEC-5)** — Restricts PutEvents to Lambda execution roles only. + ## [0.9.1] - 2026-03-13 ### Added diff --git a/cmd/lambda/stream-router/main.go b/cmd/lambda/stream-router/main.go index 1354881..60c2430 100644 --- a/cmd/lambda/stream-router/main.go +++ b/cmd/lambda/stream-router/main.go @@ -10,6 +10,7 @@ import ( "os" "time" + "github.com/aws/aws-lambda-go/events" "github.com/aws/aws-lambda-go/lambda" awsconfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/service/dynamodb" @@ -53,7 +54,7 @@ func main() { Logger: logger, } - lambda.Start(func(ctx context.Context, event ilambda.StreamEvent) error { + lambda.Start(func(ctx context.Context, event ilambda.StreamEvent) (events.DynamoDBEventResponse, error) { return ilambda.HandleStreamEvent(ctx, deps, event) }) } diff --git a/deploy/terraform/eventbridge.tf b/deploy/terraform/eventbridge.tf index c55a45a..5fb8e79 100644 --- a/deploy/terraform/eventbridge.tf +++ b/deploy/terraform/eventbridge.tf @@ -3,6 +3,25 @@ resource "aws_cloudwatch_event_bus" "interlock" { tags = var.tags } +resource "aws_cloudwatch_event_bus_policy" "interlock_bus" { + event_bus_name = aws_cloudwatch_event_bus.interlock.name + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "AllowInterlockLambdas" + Effect = "Allow" + Principal = { + AWS = [for name in local.lambda_names : aws_iam_role.lambda[name].arn] + } + Action = "events:PutEvents" + Resource = aws_cloudwatch_event_bus.interlock.arn + } + ] + }) +} + # Watchdog schedule resource "aws_cloudwatch_event_rule" "watchdog" { name = "${var.environment}-interlock-watchdog" @@ -100,6 +119,10 @@ resource "aws_cloudwatch_event_rule" "alert_events" { 
"DRY_RUN_SLA_PROJECTION", "DRY_RUN_DRIFT", "DRY_RUN_COMPLETED", + "DRY_RUN_WOULD_RERUN", + "DRY_RUN_RERUN_REJECTED", + "DRY_RUN_WOULD_RETRY", + "DRY_RUN_RETRY_EXHAUSTED", ] }) } diff --git a/deploy/terraform/lambda.tf b/deploy/terraform/lambda.tf index 0d636c1..d3cf012 100644 --- a/deploy/terraform/lambda.tf +++ b/deploy/terraform/lambda.tf @@ -541,6 +541,17 @@ resource "aws_lambda_event_source_mapping" "joblog_stream" { } } +# ============================================================================= +# Security checks +# ============================================================================= + +check "slack_token_deprecation" { + assert { + condition = var.slack_bot_token == "" || var.slack_secret_arn != "" + error_message = "DEPRECATED: Passing a plaintext Slack bot token is deprecated. Use var.slack_secret_arn with an AWS Secrets Manager ARN instead. Plaintext path still works but will be removed in a future version." + } +} + # ============================================================================= # Conditional trigger permissions for orchestrator (opt-in per trigger type) # ============================================================================= @@ -552,13 +563,20 @@ resource "aws_iam_role_policy" "glue_trigger" { name = "glue-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.glue_job_arns) > 0 + error_message = "glue_job_arns must be non-empty when enable_glue_trigger is true." 
+ } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [ { Effect = "Allow" Action = ["glue:StartJobRun", "glue:GetJobRun"] - Resource = "*" + Resource = var.glue_job_arns }, { Sid = "GlueLogVerification" @@ -580,12 +598,19 @@ resource "aws_iam_role_policy" "emr_trigger" { name = "emr-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.emr_cluster_arns) > 0 + error_message = "emr_cluster_arns must be non-empty when enable_emr_trigger is true." + } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ Effect = "Allow" Action = ["elasticmapreduce:AddJobFlowSteps", "elasticmapreduce:DescribeStep"] - Resource = "*" + Resource = var.emr_cluster_arns }] }) } @@ -597,12 +622,19 @@ resource "aws_iam_role_policy" "emr_serverless_trigger" { name = "emr-serverless-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.emr_serverless_app_arns) > 0 + error_message = "emr_serverless_app_arns must be non-empty when enable_emr_serverless_trigger is true." + } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ Effect = "Allow" Action = ["emr-serverless:StartJobRun", "emr-serverless:GetJobRun"] - Resource = "*" + Resource = var.emr_serverless_app_arns }] }) } @@ -614,12 +646,19 @@ resource "aws_iam_role_policy" "sfn_trigger" { name = "sfn-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.sfn_trigger_arns) > 0 + error_message = "sfn_trigger_arns must be non-empty when enable_sfn_trigger is true." 
+ } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ Effect = "Allow" Action = ["states:StartExecution", "states:DescribeExecution"] - Resource = "*" + Resource = var.sfn_trigger_arns }] }) } @@ -631,6 +670,13 @@ resource "aws_iam_role_policy" "lambda_trigger" { name = "lambda-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.lambda_trigger_arns) > 0 + error_message = "lambda_trigger_arns must be non-empty when enable_lambda_trigger is true." + } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ diff --git a/deploy/terraform/variables.tf b/deploy/terraform/variables.tf index 8853cac..eb7615a 100644 --- a/deploy/terraform/variables.tf +++ b/deploy/terraform/variables.tf @@ -139,5 +139,29 @@ variable "enable_lambda_trigger" { variable "lambda_trigger_arns" { description = "ARNs of Lambda functions the orchestrator may invoke as pipeline triggers" type = list(string) - default = ["*"] + default = [] +} + +variable "glue_job_arns" { + description = "ARNs of Glue jobs that the orchestrator Lambda can start. Required when enable_glue_trigger is true." + type = list(string) + default = [] +} + +variable "emr_cluster_arns" { + description = "ARNs of EMR clusters the orchestrator can submit steps to. Required when enable_emr_trigger is true." + type = list(string) + default = [] +} + +variable "emr_serverless_app_arns" { + description = "ARNs of EMR Serverless applications. Required when enable_emr_serverless_trigger is true." + type = list(string) + default = [] +} + +variable "sfn_trigger_arns" { + description = "ARNs of Step Functions the orchestrator can start. Required when enable_sfn_trigger is true." 
+ type = list(string) + default = [] } diff --git a/internal/lambda/drift.go b/internal/lambda/drift.go new file mode 100644 index 0000000..7088aa6 --- /dev/null +++ b/internal/lambda/drift.go @@ -0,0 +1,63 @@ +package lambda + +import ( + "math" + "strconv" +) + +// ExtractFloatOk retrieves a numeric value from a sensor data map. +// Returns (value, true) if the key exists and is numeric, (0, false) otherwise. +// Unlike ExtractFloat, this distinguishes zero values from missing keys. +func ExtractFloatOk(data map[string]interface{}, key string) (float64, bool) { + if data == nil { + return 0, false + } + v, ok := data[key] + if !ok { + return 0, false + } + switch n := v.(type) { + case float64: + return n, true + case string: + f, err := strconv.ParseFloat(n, 64) + if err != nil { + return 0, false + } + return f, true + default: + return 0, false + } +} + +// DriftResult holds the outcome of a drift comparison. +type DriftResult struct { + Drifted bool + Previous float64 + Current float64 + Delta float64 + PrevFound bool + CurrFound bool +} + +// DetectDrift compares baseline and current sensor data for a drift field. +// Both values must be present for drift to be detected. Returns whether +// the absolute delta exceeds the threshold. 
+func DetectDrift(baseline, current map[string]interface{}, driftField string, threshold float64) DriftResult { + prev, prevOk := ExtractFloatOk(baseline, driftField) + curr, currOk := ExtractFloatOk(current, driftField) + + result := DriftResult{ + Previous: prev, + Current: curr, + PrevFound: prevOk, + CurrFound: currOk, + } + + if prevOk && currOk { + result.Delta = curr - prev + result.Drifted = math.Abs(result.Delta) > threshold + } + + return result +} diff --git a/internal/lambda/drift_test.go b/internal/lambda/drift_test.go new file mode 100644 index 0000000..936234e --- /dev/null +++ b/internal/lambda/drift_test.go @@ -0,0 +1,62 @@ +package lambda + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestExtractFloatOk(t *testing.T) { + tests := []struct { + name string + data map[string]interface{} + key string + wantVal float64 + wantOk bool + }{ + {"present float", map[string]interface{}{"count": float64(42)}, "count", 42, true}, + {"present zero", map[string]interface{}{"count": float64(0)}, "count", 0, true}, + {"present string", map[string]interface{}{"count": "123.5"}, "count", 123.5, true}, + {"missing key", map[string]interface{}{}, "count", 0, false}, + {"nil map", nil, "count", 0, false}, + {"wrong type", map[string]interface{}{"count": true}, "count", 0, false}, + {"invalid string", map[string]interface{}{"count": "abc"}, "count", 0, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + val, ok := ExtractFloatOk(tt.data, tt.key) + assert.Equal(t, tt.wantOk, ok) + assert.InDelta(t, tt.wantVal, val, 0.001) + }) + } +} + +func TestDetectDrift(t *testing.T) { + m := func(k string, v float64) map[string]interface{} { + return map[string]interface{}{k: v} + } + tests := []struct { + name string + baseline map[string]interface{} + current map[string]interface{} + field string + threshold float64 + wantDrift bool + }{ + {"5000→0 drifts", m("count", 5000.0), m("count", 0.0), "count", 0, true}, + {"0→5000 
drifts", m("count", 0.0), m("count", 5000.0), "count", 0, true}, + {"same value no drift", m("count", 100.0), m("count", 100.0), "count", 0, false}, + {"within threshold", m("count", 100.0), m("count", 150.0), "count", 100, false}, + {"exceeds threshold", m("count", 100.0), m("count", 250.0), "count", 100, true}, + {"prev missing no drift", map[string]interface{}{}, m("count", 100.0), "count", 0, false}, + {"curr missing no drift", m("count", 100.0), map[string]interface{}{}, "count", 0, false}, + {"both missing no drift", map[string]interface{}{}, map[string]interface{}{}, "count", 0, false}, + {"negative drift", m("count", 100.0), m("count", 50.0), "count", 0, true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := DetectDrift(tt.baseline, tt.current, tt.field, tt.threshold) + assert.Equal(t, tt.wantDrift, result.Drifted) + }) + } +} diff --git a/internal/lambda/dryrun.go b/internal/lambda/dryrun.go index d9a716b..eff6514 100644 --- a/internal/lambda/dryrun.go +++ b/internal/lambda/dryrun.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "math" + "strings" "time" "github.com/dwsmith1983/interlock/internal/validation" @@ -227,22 +228,34 @@ func handleDryRunPostRunSensor(ctx context.Context, d *Deps, cfg *types.Pipeline return nil } + // Find matching post-run rule for this sensor key. + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break + } + } + if ruleBaseline == nil { + return nil // No baseline for this rule (stale or first run). + } + // Compare drift. 
driftField := resolveDriftField(cfg.PostRun) - prevCount := ExtractFloat(baseline, driftField) - currCount := ExtractFloat(sensorData, driftField) threshold := 0.0 if cfg.PostRun.DriftThreshold != nil { threshold = *cfg.PostRun.DriftThreshold } - - if prevCount > 0 && currCount > 0 && math.Abs(currCount-prevCount) > threshold { + dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { if pubErr := publishEvent(ctx, d, string(types.EventDryRunDrift), pipelineID, scheduleID, date, - fmt.Sprintf("dry-run: drift detected for %s: %.0f → %.0f — would re-run", pipelineID, prevCount, currCount), + fmt.Sprintf("dry-run: drift detected for %s: %.0f → %.0f — would re-run", pipelineID, dr.Previous, dr.Current), map[string]interface{}{ - "previousCount": prevCount, - "currentCount": currCount, - "delta": currCount - prevCount, + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, "driftThreshold": threshold, "driftField": driftField, "sensorKey": sensorKey, diff --git a/internal/lambda/dynstream.go b/internal/lambda/dynstream.go index 54ce702..5d45ae1 100644 --- a/internal/lambda/dynstream.go +++ b/internal/lambda/dynstream.go @@ -161,7 +161,7 @@ func publishEvent(ctx context.Context, d *Deps, eventType, pipelineID, schedule, source := types.EventSource detailStr := string(detailJSON) - _, err = d.EventBridge.PutEvents(ctx, &eventbridge.PutEventsInput{ + out, err := d.EventBridge.PutEvents(ctx, &eventbridge.PutEventsInput{ Entries: []ebTypes.PutEventsRequestEntry{ { Source: &source, @@ -174,6 +174,16 @@ func publishEvent(ctx context.Context, d *Deps, eventType, pipelineID, schedule, if err != nil { return fmt.Errorf("publish %s event: %w", eventType, err) } + if out.FailedEntryCount > 0 { + code, msg := "", "" + if len(out.Entries) > 0 && out.Entries[0].ErrorCode != nil { + code = *out.Entries[0].ErrorCode + if out.Entries[0].ErrorMessage != nil { + msg = *out.Entries[0].ErrorMessage + } + } + return fmt.Errorf("publish 
%s event: partial failure (code=%s, message=%s)", eventType, code, msg) + } return nil } diff --git a/internal/lambda/dynstream_test.go b/internal/lambda/dynstream_test.go new file mode 100644 index 0000000..73ebeaa --- /dev/null +++ b/internal/lambda/dynstream_test.go @@ -0,0 +1,49 @@ +package lambda + +import ( + "context" + "strings" + "testing" + "time" + + "github.com/aws/aws-sdk-go-v2/service/eventbridge" + ebTypes "github.com/aws/aws-sdk-go-v2/service/eventbridge/types" +) + +// testEventBridge is a local EventBridgeAPI implementation for white-box tests. +type testEventBridge struct { + failedEntryCount int32 +} + +func (t *testEventBridge) PutEvents(_ context.Context, _ *eventbridge.PutEventsInput, _ ...func(*eventbridge.Options)) (*eventbridge.PutEventsOutput, error) { + if t.failedEntryCount > 0 { + errCode := "InternalError" + errMsg := "simulated partial failure" + return &eventbridge.PutEventsOutput{ + FailedEntryCount: t.failedEntryCount, + Entries: []ebTypes.PutEventsResultEntry{ + {ErrorCode: &errCode, ErrorMessage: &errMsg}, + }, + }, nil + } + return &eventbridge.PutEventsOutput{}, nil +} + +func TestPublishEvent_PartialFailure(t *testing.T) { + d := &Deps{ + EventBridge: &testEventBridge{failedEntryCount: 1}, + EventBusName: "test-bus", + NowFunc: func() time.Time { return time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) }, + } + + err := publishEvent(context.Background(), d, "test.event", "pipeline-1", "cron", "2025-01-01", "test message") + if err == nil { + t.Fatal("expected error for partial failure, got nil") + } + if !strings.Contains(err.Error(), "partial failure") { + t.Errorf("expected error to contain 'partial failure', got: %s", err.Error()) + } + if !strings.Contains(err.Error(), "InternalError") { + t.Errorf("expected error to contain 'InternalError', got: %s", err.Error()) + } +} diff --git a/internal/lambda/e2e_test.go b/internal/lambda/e2e_test.go index 7f66514..b32008d 100644 --- a/internal/lambda/e2e_test.go +++ 
b/internal/lambda/e2e_test.go @@ -276,7 +276,7 @@ func runSFN(t *testing.T, ctx context.Context, d *lambda.Deps, mock *mockDDB, eb // Simulate stream event for each sensor update. sensorRecord := makeSensorRecord(pid, key, toStreamAttributes(data)) streamEvt := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{sensorRecord}} - _ = lambda.HandleStreamEvent(ctx, d, streamEvt) + _, _ = lambda.HandleStreamEvent(ctx, d, streamEvt) } } } @@ -1021,7 +1021,7 @@ func TestE2E_AutoRetries(t *testing.T) { require.NotEmpty(t, jobSK, "should have a joblog entry") sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d1", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d1", jobSK, "fail")) require.NoError(t, err) // Verify: new SFN execution started (auto-retry under maxRetries limit) @@ -1055,7 +1055,7 @@ func TestE2E_AutoRetries(t *testing.T) { eb.events = nil eb.mu.Unlock() - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d2", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d2", jobSK, "fail")) require.NoError(t, err) // Verify: no new SFN, RETRY_EXHAUSTED published, status=FAILED_FINAL @@ -1104,7 +1104,7 @@ func TestE2E_FailureClassification(t *testing.T) { eb.events = nil eb.mu.Unlock() - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc1", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc1", jobSK, "fail")) require.NoError(t, err) // Verify: no retry (MaxCodeRetries=0), RETRY_EXHAUSTED event @@ -1140,7 +1140,7 @@ func TestE2E_FailureClassification(t *testing.T) { require.NotEmpty(t, jobSK) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc2", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc2", jobSK, "fail")) require.NoError(t, err) sfnM.mu.Lock() @@ -1184,7 +1184,7 @@ 
func TestE2E_RerunReplay(t *testing.T) { // Process RERUN_REQUEST stream event sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e1")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e1")) require.NoError(t, err) // Verify: new SFN started, rerun-accepted joblog written @@ -1221,7 +1221,7 @@ func TestE2E_RerunReplay(t *testing.T) { })) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e2")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e2")) require.NoError(t, err) // Verify: no SFN, RERUN_REJECTED published @@ -1257,7 +1257,7 @@ func TestE2E_RerunReplay(t *testing.T) { "status": events.NewStringAttribute("ready"), "date": events.NewStringAttribute("2026-03-07"), }) - err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + _, err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) require.NoError(t, err) // Lock already held → late data path @@ -1306,7 +1306,7 @@ func TestE2E_DriftRetrigger(t *testing.T) { // Phase 2: Stream-router processes RERUN_REQUEST sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f1")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f1")) require.NoError(t, err) // Verify: new SFN started for re-trigger @@ -1343,7 +1343,7 @@ func TestE2E_DriftRetrigger(t *testing.T) { assert.Contains(t, r.events, "POST_RUN_DRIFT") sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f2")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f2")) require.NoError(t, err) sfnM.mu.Lock() @@ -1380,7 +1380,7 @@ func TestE2E_DriftRetrigger(t *testing.T) { // Phase 2: verify the 
RERUN_REQUEST was written, allowing re-trigger sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f3")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f3")) require.NoError(t, err) sfnM.mu.Lock() @@ -1432,7 +1432,7 @@ func TestE2E_RerunLimits(t *testing.T) { // Send a data-drift RERUN_REQUEST — should be rejected sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl1", "data-drift")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl1", "data-drift")) require.NoError(t, err) // Verify: no SFN started, RERUN_REJECTED event + joblog entry @@ -1480,7 +1480,7 @@ func TestE2E_RerunLimits(t *testing.T) { // Send a late-data RERUN_REQUEST — should be rejected because // late-data shares the drift budget (count 1 >= budget 1) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl2", "late-data")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl2", "late-data")) require.NoError(t, err) // Verify: no SFN started, RERUN_REJECTED event + joblog entry @@ -1791,7 +1791,7 @@ func TestE2E_StreamRouterEntryPoints(t *testing.T) { }) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + _, err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) require.NoError(t, err) // Verify: SFN started, trigger lock acquired, JOB_TRIGGERED event published. 
@@ -1824,7 +1824,7 @@ func TestE2E_StreamRouterEntryPoints(t *testing.T) { }) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-i2", jobSK, types.JobEventTimeout)) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-i2", jobSK, types.JobEventTimeout)) require.NoError(t, err) // Verify: auto-retry started (timeout is retryable just like fail). @@ -1861,7 +1861,7 @@ func TestE2E_StreamRouterEntryPoints(t *testing.T) { })) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-i3")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-i3")) require.NoError(t, err) // Verify: rerun accepted despite old sensor data (failure skips freshness check). @@ -2067,32 +2067,37 @@ func TestE2E_RerunBudgetSeparation(t *testing.T) { // Phase 1: First drift rerun — accepted (0 < budget 1). sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift"))) + _, handleErr := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "first drift rerun should start SFN") // Phase 2: Second drift rerun — rejected (1 >= budget 1). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "second drift rerun should NOT start SFN") assert.Contains(t, collectEventTypes(eb), "RERUN_REJECTED") // Phase 3: First manual rerun — accepted despite drift budget exhausted. 
resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "first manual rerun should succeed") // Phase 4: Second manual rerun — accepted (1 < budget 2). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "second manual rerun should succeed") // Phase 5: Third manual rerun — rejected (2 >= budget 2). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "third manual rerun should NOT start SFN") assert.Contains(t, collectEventTypes(eb), "RERUN_REJECTED") assertAlertFormats(t, eb) @@ -2119,16 +2124,17 @@ func TestE2E_PostRunInflight(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "pipe-inf1", "2026-03-07", types.TriggerStatusRunning) - // Baseline from a previous run. + // Baseline from a previous run (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-inf1", "postrun-baseline#2026-03-07", - map[string]interface{}{"sensor_count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"sensor_count": float64(100)}})) // Sensor arrives with different count while job is running. 
record := makeSensorRecord("pipe-inf1", "audit-result", toStreamAttributes(map[string]interface{}{ "sensor_count": float64(200), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Contains(t, collectEventTypes(eb), "POST_RUN_DRIFT_INFLIGHT") assert.False(t, hasRerunRequest(mock, "pipe-inf1"), "should NOT write rerun request while running") @@ -2150,16 +2156,17 @@ func TestE2E_PostRunInflight(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "pipe-inf-cf", "2026-03-07", types.TriggerStatusRunning) - // Baseline uses custom field "count". + // Baseline uses custom field "count" (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-inf-cf", "postrun-baseline#2026-03-07", - map[string]interface{}{"count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"count": float64(100)}})) // Sensor arrives with different count while job is running. record := makeSensorRecord("pipe-inf-cf", "audit-result", toStreamAttributes(map[string]interface{}{ "count": float64(200), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Contains(t, collectEventTypes(eb), "POST_RUN_DRIFT_INFLIGHT") assert.False(t, hasRerunRequest(mock, "pipe-inf-cf"), "should NOT write rerun request while running") @@ -2180,15 +2187,16 @@ func TestE2E_PostRunInflight(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "pipe-inf2", "2026-03-07", types.TriggerStatusRunning) - // Baseline matches incoming sensor — no drift. 
+ // Baseline matches incoming sensor — no drift (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-inf2", "postrun-baseline#2026-03-07", - map[string]interface{}{"sensor_count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"sensor_count": float64(100)}})) record := makeSensorRecord("pipe-inf2", "audit-result", toStreamAttributes(map[string]interface{}{ "sensor_count": float64(100), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Empty(t, collectEventTypes(eb)) assert.Equal(t, 0, countSFNExecutions(sfnM)) @@ -2217,7 +2225,8 @@ func TestE2E_CalendarExclusionFullSkip(t *testing.T) { record := makeSensorRecord("pipe-cal1", "upstream-complete", map[string]events.DynamoDBAttributeValue{"status": events.NewStringAttribute("ready")}) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Equal(t, 0, countSFNExecutions(sfnM)) assertNoTriggerLock(t, mock, "pipe-cal1", "stream", today) @@ -2266,7 +2275,8 @@ func TestE2E_HourBoundaryRollover(t *testing.T) { "date": "20260307", "hour": "23", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record23}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record23}}) + require.NoError(t, handleErr) // Hour 00 (next day) sensor arrives. 
record00 := makeSensorRecord("pipe-hr1", "hourly-status#20260308T00", toStreamAttributes(map[string]interface{}{ @@ -2274,7 +2284,8 @@ func TestE2E_HourBoundaryRollover(t *testing.T) { "date": "20260308", "hour": "00", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record00}})) + _, handleErr = lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record00}}) + require.NoError(t, handleErr) // Two independent SFN executions. sfnM.mu.Lock() @@ -2317,28 +2328,31 @@ func TestE2E_ConcurrentDriftDedup(t *testing.T) { seedConfig(mock, cfg) seedCompletedPipelineE2E(t, ctx, d, mock, "pipe-cd1", "2026-03-07") - // Baseline captured at completion. + // Baseline captured at completion (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-cd1", "postrun-baseline#2026-03-07", - map[string]interface{}{"sensor_count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"sensor_count": float64(100)}})) // First drift sensor arrives. record1 := makeSensorRecord("pipe-cd1", "audit-result", toStreamAttributes(map[string]interface{}{ "sensor_count": float64(200), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record1}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record1}}) + require.NoError(t, handleErr) assert.Contains(t, collectEventTypes(eb), "POST_RUN_DRIFT") // Process first rerun request — accepted. 
resetEventBridge(eb) sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "first drift rerun accepted") // Process second rerun request — rejected (budget exhausted). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "second drift rerun rejected") assert.Contains(t, collectEventTypes(eb), "RERUN_REJECTED") assertAlertFormats(t, eb) @@ -2370,7 +2384,8 @@ func TestE2E_PostRunBeforeBaseline(t *testing.T) { "sensor_count": float64(500), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Empty(t, collectEventTypes(eb), "should not publish any event when baseline is missing") assert.False(t, hasRerunRequest(mock, "pipe-nb1")) @@ -2412,7 +2427,8 @@ func TestE2E_RerunAfterTriggerTTLExpiry(t *testing.T) { })) sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl1", "manual"))) + _, handleErr := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl1", "manual")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "no SFN when trigger lock row was deleted by TTL") // Should have published an INFRA_FAILURE 
event. @@ -2457,7 +2473,8 @@ func TestE2E_RerunAfterTriggerTTLExpiry(t *testing.T) { }) sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl2", "manual"))) + _, handleErr := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl2", "manual")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "rerun should start SFN when trigger lock exists") assertAlertFormats(t, eb) }) diff --git a/internal/lambda/export_test.go b/internal/lambda/export_test.go index 738c420..3c5f829 100644 --- a/internal/lambda/export_test.go +++ b/internal/lambda/export_test.go @@ -3,8 +3,16 @@ // even to files in the non-_test package when placed here). package lambda -import "github.com/dwsmith1983/interlock/pkg/types" +import ( + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) // IsExcludedDate re-exports isExcludedDate for white-box unit testing from // the external test package (package lambda_test). var IsExcludedDate func(cfg *types.PipelineConfig, dateStr string) bool = isExcludedDate + +// ResolveTriggerDeadlineTime re-exports resolveTriggerDeadlineTime for +// white-box unit testing from the external test package (package lambda_test). 
+var ResolveTriggerDeadlineTime func(deadline, date, timezone string) time.Time = resolveTriggerDeadlineTime diff --git a/internal/lambda/mock_test.go b/internal/lambda/mock_test.go index 29f4c5a..12e2daa 100644 --- a/internal/lambda/mock_test.go +++ b/internal/lambda/mock_test.go @@ -9,6 +9,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/dynamodb" ddbtypes "github.com/aws/aws-sdk-go-v2/service/dynamodb/types" "github.com/aws/aws-sdk-go-v2/service/eventbridge" + ebTypes "github.com/aws/aws-sdk-go-v2/service/eventbridge/types" "github.com/aws/aws-sdk-go-v2/service/scheduler" "github.com/aws/aws-sdk-go-v2/service/sfn" @@ -40,9 +41,10 @@ func (m *mockSFN) StartExecution(_ context.Context, input *sfn.StartExecutionInp // --------------------------------------------------------------------------- type mockEventBridge struct { - mu sync.Mutex - events []*eventbridge.PutEventsInput - err error + mu sync.Mutex + events []*eventbridge.PutEventsInput + err error + failedEntryCount int32 } func (m *mockEventBridge) PutEvents(_ context.Context, input *eventbridge.PutEventsInput, _ ...func(*eventbridge.Options)) (*eventbridge.PutEventsOutput, error) { @@ -52,6 +54,16 @@ func (m *mockEventBridge) PutEvents(_ context.Context, input *eventbridge.PutEve return nil, m.err } m.events = append(m.events, input) + if m.failedEntryCount > 0 { + errCode := "InternalError" + errMsg := "simulated partial failure" + return &eventbridge.PutEventsOutput{ + FailedEntryCount: m.failedEntryCount, + Entries: []ebTypes.PutEventsResultEntry{ + {ErrorCode: &errCode, ErrorMessage: &errMsg}, + }, + }, nil + } return &eventbridge.PutEventsOutput{}, nil } diff --git a/internal/lambda/orchestrator.go b/internal/lambda/orchestrator.go index 5ba53ab..3035d90 100644 --- a/internal/lambda/orchestrator.go +++ b/internal/lambda/orchestrator.go @@ -263,15 +263,19 @@ func RemapPerPeriodSensors(sensors map[string]map[string]interface{}, date strin if compact != date { suffixes = append(suffixes, "#"+compact) } 
+ additions := make(map[string]map[string]interface{}) for key, data := range sensors { for _, suffix := range suffixes { if strings.HasSuffix(key, suffix) { base := strings.TrimSuffix(key, suffix) - sensors[base] = data + additions[base] = data break } } } + for k, v := range additions { + sensors[k] = v + } } // handleTriggerExhausted publishes RETRY_EXHAUSTED when trigger retries are @@ -357,13 +361,12 @@ func capturePostRunBaseline(ctx context.Context, d *Deps, pipelineID, scheduleID RemapPerPeriodSensors(sensors, date) - // Build baseline from post-run rule keys. + // Build baseline from post-run rule keys, namespaced by rule key + // to prevent field name collisions between different sensors. baseline := make(map[string]interface{}) for _, rule := range cfg.PostRun.Rules { if data, ok := sensors[rule.Key]; ok { - for k, v := range data { - baseline[k] = v - } + baseline[rule.Key] = data } } @@ -498,7 +501,7 @@ func InjectDateArgs(tc *types.TriggerConfig, date string) { if hourPart != "" { payload["par_hour"] = hourPart } - b, _ := json.Marshal(payload) + b, _ := json.Marshal(payload) // json.Marshal is infallible for map[string]string (no channels, funcs, or complex types) tc.HTTP.Body = string(b) } } diff --git a/internal/lambda/orchestrator_unit_test.go b/internal/lambda/orchestrator_unit_test.go index d6ce194..cd5d343 100644 --- a/internal/lambda/orchestrator_unit_test.go +++ b/internal/lambda/orchestrator_unit_test.go @@ -93,6 +93,49 @@ func TestInjectDateArgs(t *testing.T) { }) } +// --------------------------------------------------------------------------- +// BUG-2 characterization: RemapPerPeriodSensors map mutation during range +// --------------------------------------------------------------------------- + +func TestRemapPerPeriodSensors_MultipleSuffixes_MapMutation(t *testing.T) { + // BUG-2 characterization: adding keys during range iteration. + // With Go's map iteration, newly inserted keys may or may not be visited. 
+ // This test documents the nondeterministic behavior. + sensors := map[string]map[string]interface{}{ + "hourly-status#2026-03-13": {"count": float64(10)}, + "daily-check#2026-03-13": {"count": float64(20)}, + "weekly-scan#20260313": {"count": float64(30)}, + } + lambda.RemapPerPeriodSensors(sensors, "2026-03-13") + // All base keys should be present + assert.NotNil(t, sensors["hourly-status"], "hourly-status base key should exist") + assert.NotNil(t, sensors["daily-check"], "daily-check base key should exist") + assert.NotNil(t, sensors["weekly-scan"], "weekly-scan base key should exist") +} + +func TestRemapPerPeriodSensors_StagedMerge_NoCrossContamination(t *testing.T) { + // Verify staged merge doesn't allow newly-added base keys to match + // as suffixed keys in the same iteration. + sensors := map[string]map[string]interface{}{ + "hourly-status#2026-03-13": {"count": float64(10)}, + } + lambda.RemapPerPeriodSensors(sensors, "2026-03-13") + assert.NotNil(t, sensors["hourly-status"]) + assert.Equal(t, float64(10), sensors["hourly-status"]["count"]) + // Original suffixed key should still exist + assert.NotNil(t, sensors["hourly-status#2026-03-13"]) +} + +// --------------------------------------------------------------------------- +// BUG-10 characterization: baseline flattening collision +// --------------------------------------------------------------------------- + +func TestExtractFloat_ZeroValueIndistinguishableFromMissing(t *testing.T) { + // BUG-1 characterization: ExtractFloat returns 0 for both zero and missing. 
+ assert.Equal(t, float64(0), lambda.ExtractFloat(map[string]interface{}{"count": float64(0)}, "count")) + assert.Equal(t, float64(0), lambda.ExtractFloat(map[string]interface{}{}, "count")) +} + // --------------------------------------------------------------------------- // RemapPerPeriodSensors — table-driven // --------------------------------------------------------------------------- diff --git a/internal/lambda/postrun.go b/internal/lambda/postrun.go index 7b8beae..727badd 100644 --- a/internal/lambda/postrun.go +++ b/internal/lambda/postrun.go @@ -3,7 +3,6 @@ package lambda import ( "context" "fmt" - "math" "strings" "github.com/dwsmith1983/interlock/internal/validation" @@ -61,7 +60,7 @@ func handlePostRunSensorEvent(ctx context.Context, d *Deps, cfg *types.PipelineC case types.TriggerStatusCompleted: // Job completed — full post-run evaluation with baseline comparison. - return handlePostRunCompleted(ctx, d, cfg, pipelineID, scheduleID, date, sensorData) + return handlePostRunCompleted(ctx, d, cfg, pipelineID, scheduleID, date, sensorKey, sensorData) default: // FAILED_FINAL or unknown — skip. @@ -82,20 +81,33 @@ func handlePostRunInflight(ctx context.Context, d *Deps, cfg *types.PipelineConf return nil // No baseline yet — job hasn't completed once. } + // Find matching post-run rule for this sensor key. + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break + } + } + if ruleBaseline == nil { + return nil // No baseline for this rule (stale or first run). 
+ } + driftField := resolveDriftField(cfg.PostRun) - prevCount := ExtractFloat(baseline, driftField) - currCount := ExtractFloat(sensorData, driftField) threshold := 0.0 if cfg.PostRun.DriftThreshold != nil { threshold = *cfg.PostRun.DriftThreshold } - if prevCount > 0 && currCount > 0 && math.Abs(currCount-prevCount) > threshold { + dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { if err := publishEvent(ctx, d, string(types.EventPostRunDriftInflight), pipelineID, scheduleID, date, - fmt.Sprintf("inflight drift detected for %s: %.0f → %.0f (informational)", pipelineID, prevCount, currCount), + fmt.Sprintf("inflight drift detected for %s: %.0f → %.0f (informational)", pipelineID, dr.Previous, dr.Current), map[string]interface{}{ - "previousCount": prevCount, - "currentCount": currCount, - "delta": currCount - prevCount, + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, "driftThreshold": threshold, "driftField": driftField, "sensorKey": sensorKey, @@ -110,7 +122,7 @@ func handlePostRunInflight(ctx context.Context, d *Deps, cfg *types.PipelineConf // handlePostRunCompleted evaluates post-run rules after the job has completed. // Compares sensor values against the date-scoped baseline and triggers a rerun // if drift is detected. -func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date string, sensorData map[string]interface{}) error { +func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date, sensorKey string, sensorData map[string]interface{}) error { // Read baseline captured at trigger completion. baselineKey := "postrun-baseline#" + date baseline, err := d.Store.GetSensorData(ctx, pipelineID, baselineKey) @@ -120,44 +132,56 @@ func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineCon // Check for data drift if baseline exists. 
if baseline != nil { - driftField := resolveDriftField(cfg.PostRun) - prevCount := ExtractFloat(baseline, driftField) - currCount := ExtractFloat(sensorData, driftField) - threshold := 0.0 - if cfg.PostRun.DriftThreshold != nil { - threshold = *cfg.PostRun.DriftThreshold - } - if prevCount > 0 && currCount > 0 && math.Abs(currCount-prevCount) > threshold { - delta := currCount - prevCount - if err := publishEvent(ctx, d, string(types.EventPostRunDrift), pipelineID, scheduleID, date, - fmt.Sprintf("post-run drift detected for %s: %.0f → %.0f records", pipelineID, prevCount, currCount), - map[string]interface{}{ - "previousCount": prevCount, - "currentCount": currCount, - "delta": delta, - "driftThreshold": threshold, - "driftField": driftField, - "source": "post-run-stream", - }); err != nil { - d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunDrift, "error", err) + // Find matching post-run rule for this sensor key. + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break } + } - // Trigger rerun via the existing circuit breaker path only if the - // execution date is not excluded by the pipeline's calendar config. 
- if isExcludedDate(cfg, date) { - if pubErr := publishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleID, date, - fmt.Sprintf("post-run drift rerun skipped for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { - d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) + if ruleBaseline != nil { + driftField := resolveDriftField(cfg.PostRun) + threshold := 0.0 + if cfg.PostRun.DriftThreshold != nil { + threshold = *cfg.PostRun.DriftThreshold + } + dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { + if err := publishEvent(ctx, d, string(types.EventPostRunDrift), pipelineID, scheduleID, date, + fmt.Sprintf("post-run drift detected for %s: %.0f → %.0f records", pipelineID, dr.Previous, dr.Current), + map[string]interface{}{ + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, + "driftThreshold": threshold, + "driftField": driftField, + "sensorKey": sensorKey, + "source": "post-run-stream", + }); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunDrift, "error", err) } - d.Logger.InfoContext(ctx, "post-run drift rerun skipped: execution date excluded by calendar", - "pipelineId", pipelineID, "date", date) - } else { - if writeErr := d.Store.WriteRerunRequest(ctx, pipelineID, scheduleID, date, "data-drift"); writeErr != nil { - d.Logger.WarnContext(ctx, "failed to write rerun request on post-run drift", - "pipelineId", pipelineID, "error", writeErr) + + // Trigger rerun via the existing circuit breaker path only if the + // execution date is not excluded by the pipeline's calendar config. 
+ if isExcludedDate(cfg, date) { + if pubErr := publishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleID, date, + fmt.Sprintf("post-run drift rerun skipped for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) + } + d.Logger.InfoContext(ctx, "post-run drift rerun skipped: execution date excluded by calendar", + "pipelineId", pipelineID, "date", date) + } else { + if writeErr := d.Store.WriteRerunRequest(ctx, pipelineID, scheduleID, date, "data-drift"); writeErr != nil { + d.Logger.WarnContext(ctx, "failed to write rerun request on post-run drift", + "pipelineId", pipelineID, "error", writeErr) + } } + return nil } - return nil } } diff --git a/internal/lambda/rerun.go b/internal/lambda/rerun.go index 704b39f..cb94e76 100644 --- a/internal/lambda/rerun.go +++ b/internal/lambda/rerun.go @@ -34,11 +34,10 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even return nil } - // Dry-run pipelines never start real executions. + // Dry-run pipelines evaluate all checks but publish observation events + // instead of executing side effects. if cfg.DryRun { - d.Logger.Info("dry-run: skipping rerun request", - "pipelineId", pipelineID, "schedule", schedule, "date", date) - return nil + return handleDryRunRerunRequest(ctx, d, cfg, pipelineID, schedule, date, record) } // --- Calendar exclusion check (execution date) --- @@ -132,34 +131,38 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even return nil } - // --- Acceptance: write rerun record FIRST (before lock reset) --- - if _, err := d.Store.WriteRerun(ctx, pipelineID, schedule, date, reason, ""); err != nil { - return fmt.Errorf("write rerun for %q: %w", pipelineID, err) - } - - // Delete date-scoped postrun-baseline so re-run captures fresh baseline. 
- if cfg.PostRun != nil { - if err := d.Store.DeleteSensor(ctx, pipelineID, "postrun-baseline#"+date); err != nil { - d.Logger.Warn("failed to delete postrun-baseline sensor", "error", err, "pipeline", pipelineID, "date", date) - } - } - - // Atomically reset the trigger lock for the new execution. + // --- Acceptance: acquire lock FIRST (before writing rerun) --- acquired, err := d.Store.ResetTriggerLock(ctx, pipelineID, schedule, date, ResolveTriggerLockTTL()) if err != nil { return fmt.Errorf("reset trigger lock for %q: %w", pipelineID, err) } if !acquired { if pubErr := publishEvent(ctx, d, string(types.EventInfraFailure), pipelineID, schedule, date, - fmt.Sprintf("lock reset failed for rerun of %s, orphaned rerun record", pipelineID)); pubErr != nil { + fmt.Sprintf("lock reset failed for rerun of %s", pipelineID)); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish event", "error", pubErr) } - d.Logger.Warn("failed to reset trigger lock, orphaned rerun record", + d.Logger.Warn("failed to reset trigger lock for rerun", "pipelineId", pipelineID, "schedule", schedule, "date", date) return nil } - // Publish acceptance event only after lock atomicity is confirmed. + // Delete date-scoped postrun-baseline so re-run captures fresh baseline. + if cfg.PostRun != nil { + if err := d.Store.DeleteSensor(ctx, pipelineID, "postrun-baseline#"+date); err != nil { + d.Logger.Warn("failed to delete postrun-baseline sensor", "error", err, "pipeline", pipelineID, "date", date) + } + } + + // Write rerun record AFTER lock is confirmed. + if _, err := d.Store.WriteRerun(ctx, pipelineID, schedule, date, reason, ""); err != nil { + // Lock acquired but write failed — release lock to avoid deadlock. 
+ if relErr := d.Store.ReleaseTriggerLock(ctx, pipelineID, schedule, date); relErr != nil { + d.Logger.Warn("failed to release lock after rerun write failure", "error", relErr) + } + return fmt.Errorf("write rerun for %q: %w", pipelineID, err) + } + + // Publish acceptance event only after lock and rerun record confirmed. if err := d.Store.WriteJobEvent(ctx, pipelineID, schedule, date, types.JobEventRerunAccepted, "", 0, ""); err != nil { d.Logger.Warn("failed to write rerun-accepted joblog", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) @@ -206,11 +209,10 @@ func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, return nil } - // Dry-run pipelines never start real executions. + // Dry-run pipelines evaluate retry logic but publish observation events + // instead of executing side effects. if cfg.DryRun { - d.Logger.Info("dry-run: skipping job failure rerun", - "pipelineId", pipelineID, "schedule", schedule, "date", date) - return nil + return handleDryRunJobFailure(ctx, d, cfg, pipelineID, schedule, date) } maxRetries := cfg.Job.MaxRetries @@ -301,6 +303,171 @@ func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, return nil } +// handleDryRunRerunRequest evaluates all rerun checks (calendar, limit, +// circuit breaker) and publishes observation events instead of executing +// side effects. Mirrors the production handleRerunRequest logic. +func handleDryRunRerunRequest(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, schedule, date string, record events.DynamoDBEventRecord) error { + // Calendar exclusion check. 
+ if isExcludedDate(cfg, date) { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: execution date %s excluded by calendar", pipelineID, date), + map[string]interface{}{ + "reason": "excluded by calendar", + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + + // Extract reason from stream record NewImage. Default to "manual". + reason := "manual" + if img := record.Change.NewImage; img != nil { + if r, ok := img["reason"]; ok && r.DataType() == events.DataTypeString { + if v := r.String(); v != "" { + reason = v + } + } + } + + // Rerun limit check. + var budget int + var sources []string + switch reason { + case "data-drift", "late-data": + budget = types.IntOrDefault(cfg.Job.MaxDriftReruns, 1) + sources = []string{"data-drift", "late-data"} + default: + budget = types.IntOrDefault(cfg.Job.MaxManualReruns, 1) + sources = []string{reason} + } + + count, err := d.Store.CountRerunsBySource(ctx, pipelineID, schedule, date, sources) + if err != nil { + return fmt.Errorf("dry-run: count reruns by source for %q: %w", pipelineID, err) + } + + if count >= budget { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: limit exceeded (%d/%d)", pipelineID, count, budget), + map[string]interface{}{ + "reason": "limit exceeded", + "rerunCount": count, + "budget": budget, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + + // Circuit breaker (sensor freshness). 
+ cbStatus := "passed" + job, err := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if err != nil { + return fmt.Errorf("dry-run: get latest job event for %q/%s/%s: %w", pipelineID, schedule, date, err) + } + + if job == nil { + cbStatus = "skipped (no job history)" + } else if job.Event == types.JobEventSuccess { + fresh, freshErr := checkSensorFreshness(ctx, d, pipelineID, job.SK) + if freshErr != nil { + return fmt.Errorf("dry-run: check sensor freshness for %q: %w", pipelineID, freshErr) + } + if !fresh { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: previous run succeeded and no sensor data has changed", pipelineID), + map[string]interface{}{ + "reason": "circuit breaker", + "circuitBreaker": "rejected", + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + } + + // All checks pass — publish would-rerun event. + if pubErr := publishEvent(ctx, d, string(types.EventDryRunWouldRerun), pipelineID, schedule, date, + fmt.Sprintf("dry-run: would rerun %s (reason: %s)", pipelineID, reason), + map[string]interface{}{ + "reason": reason, + "circuitBreaker": cbStatus, + "rerunCount": count, + "budget": budget, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunWouldRerun, "error", pubErr) + } + + d.Logger.Info("dry-run: would rerun", + "pipelineId", pipelineID, "schedule", schedule, "date", date, "reason", reason) + return nil +} + +// handleDryRunJobFailure evaluates retry logic for a dry-run pipeline and +// publishes observation events instead of executing side effects. +func handleDryRunJobFailure(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, schedule, date string) error { + maxRetries := cfg.Job.MaxRetries + + // Read latest job event for failure category (read-only). 
+ latestJob, jobErr := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if jobErr != nil { + d.Logger.WarnContext(ctx, "dry-run: could not read latest job event for failure category", + "pipelineId", pipelineID, "error", jobErr) + } + if latestJob != nil { + if types.FailureCategory(latestJob.Category) == types.FailurePermanent { + maxRetries = types.IntOrDefault(cfg.Job.MaxCodeRetries, 1) + } + // TRANSIENT, TIMEOUT, or empty → use cfg.Job.MaxRetries (already set). + } + + rerunCount, err := d.Store.CountRerunsBySource(ctx, pipelineID, schedule, date, []string{"job-fail-retry"}) + if err != nil { + return fmt.Errorf("dry-run: count reruns for %q/%s/%s: %w", pipelineID, schedule, date, err) + } + + if rerunCount >= maxRetries { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, + fmt.Sprintf("dry-run: retry limit reached (%d/%d) for %s", rerunCount, maxRetries, pipelineID), + map[string]interface{}{ + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRetryExhausted, "error", pubErr) + } + return nil + } + + // Calendar exclusion check. + if isExcludedDate(cfg, date) { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, + fmt.Sprintf("dry-run: retry skipped for %s: execution date %s excluded by calendar", pipelineID, date), + map[string]interface{}{ + "reason": "excluded by calendar", + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRetryExhausted, "error", pubErr) + } + return nil + } + + // Under budget — publish would-retry event. 
+ if pubErr := publishEvent(ctx, d, string(types.EventDryRunWouldRetry), pipelineID, schedule, date, + fmt.Sprintf("dry-run: would retry %s (%d/%d)", pipelineID, rerunCount, maxRetries), + map[string]interface{}{ + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunWouldRetry, "error", pubErr) + } + + d.Logger.Info("dry-run: would retry", + "pipelineId", pipelineID, "schedule", schedule, "date", date, + "retries", rerunCount, "maxRetries", maxRetries) + return nil +} + // checkSensorFreshness determines whether any sensor data has been updated // after the given job completed. The job timestamp is extracted from the job // SK (format: JOB#schedule#date#). Returns true if data has @@ -350,6 +517,13 @@ func checkSensorFreshness(ctx context.Context, d *Deps, pipelineID, jobSK string continue } + // Normalize epoch: if ts looks like seconds (< 1e12), convert to millis. + // Epoch millis won't be < 1e12 until ~2001, and epoch seconds won't + // exceed 1e12 until ~33658 CE. + if ts > 0 && ts < 1e12 { + ts *= 1000 + } + if ts > jobTimestamp { return true, nil // Data changed after job — allow rerun. 
} diff --git a/internal/lambda/sla_monitor.go b/internal/lambda/sla_monitor.go index e4164c8..5093fa8 100644 --- a/internal/lambda/sla_monitor.go +++ b/internal/lambda/sla_monitor.go @@ -256,28 +256,8 @@ func handleSLASchedule(ctx context.Context, d *Deps, input SLAMonitorInput) (SLA return calc, nil } - for _, alert := range []struct { - suffix string - alertType string - timestamp string - }{ - {"warning", "SLA_WARNING", calc.WarningAt}, - {"breach", "SLA_BREACH", calc.BreachAt}, - } { - name := slaScheduleName(input.PipelineID, input.ScheduleID, input.Date, alert.suffix) - payload := SLAMonitorInput{ - Mode: "fire-alert", - PipelineID: input.PipelineID, - ScheduleID: input.ScheduleID, - Date: input.Date, - AlertType: alert.alertType, - } - if alert.alertType == "SLA_WARNING" { - payload.BreachAt = calc.BreachAt - } - if err := createOneTimeSchedule(ctx, d, name, alert.timestamp, payload); err != nil { - return SLAMonitorOutput{}, fmt.Errorf("create %s schedule: %w", alert.suffix, err) - } + if err := createSLASchedules(ctx, d, input.PipelineID, input.ScheduleID, input.Date, calc, false); err != nil { + return SLAMonitorOutput{}, err } d.Logger.InfoContext(ctx, "scheduled SLA alerts", @@ -342,17 +322,31 @@ func handleSLACancel(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAMo } } - // Always publish the verdict. For first runs, Scheduler entries would have - // fired WARNING/BREACH but are now deleted — MET is the only new signal. - // For reruns, the Scheduler entries were already deleted by the first run's - // cancel, so this publish is the only path to a notification. + // Only publish a verdict if the pipeline was actually triggered. + // If no trigger record exists, the pipeline never ran — publishing SLA_MET + // would be misleading since the SLA wasn't "met" (nothing executed). 
+ publish := true + if d.Store != nil { + tr, err := d.Store.GetTrigger(ctx, input.PipelineID, input.ScheduleID, input.Date) + if err != nil { + d.Logger.WarnContext(ctx, "trigger lookup failed in cancel, proceeding with verdict", + "pipeline", input.PipelineID, "error", err) + } else if tr == nil { + d.Logger.InfoContext(ctx, "skipping SLA verdict — pipeline was never triggered", + "pipeline", input.PipelineID, "date", input.Date, "alertType", alertType) + publish = false + } + } + d.Logger.InfoContext(ctx, "cancelled SLA schedules", "pipeline", input.PipelineID, "alertType", alertType, ) - if err := publishEvent(ctx, d, alertType, input.PipelineID, input.ScheduleID, input.Date, - fmt.Sprintf("pipeline %s: %s", input.PipelineID, alertType)); err != nil { - return SLAMonitorOutput{}, fmt.Errorf("publish SLA cancel verdict: %w", err) + if publish { + if err := publishEvent(ctx, d, alertType, input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: %s", input.PipelineID, alertType)); err != nil { + return SLAMonitorOutput{}, fmt.Errorf("publish SLA cancel verdict: %w", err) + } } return SLAMonitorOutput{ @@ -400,6 +394,42 @@ func createOneTimeSchedule(ctx context.Context, d *Deps, name, timestamp string, return nil } +// createSLASchedules creates warning and breach one-time schedules. +// Returns an error on the first schedule creation failure. If onConflictSkip +// is true, ConflictException errors are silently skipped (idempotent retries). 
+func createSLASchedules(ctx context.Context, d *Deps, pipelineID, scheduleID, date string, calc SLAMonitorOutput, onConflictSkip bool) error { + for _, alert := range []struct { + suffix string + alertType string + timestamp string + }{ + {"warning", "SLA_WARNING", calc.WarningAt}, + {"breach", "SLA_BREACH", calc.BreachAt}, + } { + name := slaScheduleName(pipelineID, scheduleID, date, alert.suffix) + payload := SLAMonitorInput{ + Mode: "fire-alert", + PipelineID: pipelineID, + ScheduleID: scheduleID, + Date: date, + AlertType: alert.alertType, + } + if alert.alertType == "SLA_WARNING" { + payload.BreachAt = calc.BreachAt + } + if err := createOneTimeSchedule(ctx, d, name, alert.timestamp, payload); err != nil { + if onConflictSkip { + var conflict *schedulerTypes.ConflictException + if errors.As(err, &conflict) { + continue + } + } + return fmt.Errorf("create %s schedule: %w", alert.suffix, err) + } + } + return nil +} + // handleSLAReconcile calculates deadlines and fires any alerts for deadlines // that have already passed. Fallback for environments without EventBridge // Scheduler configured. 
@@ -423,13 +453,17 @@ func handleSLAReconcile(ctx context.Context, d *Deps, input SLAMonitorInput) (SL var alertType string switch { case now.After(breachAt) || now.Equal(breachAt): - _ = publishEvent(ctx, d, "SLA_BREACH", input.PipelineID, input.ScheduleID, input.Date, - fmt.Sprintf("pipeline %s: SLA_BREACH", input.PipelineID), reconcileDetail) + if err := publishEvent(ctx, d, "SLA_BREACH", input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: SLA_BREACH", input.PipelineID), reconcileDetail); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", "SLA_BREACH", "error", err) + } alertType = "SLA_BREACH" case now.After(warningAt) || now.Equal(warningAt): // Past warning but before breach — fire warning only - _ = publishEvent(ctx, d, "SLA_WARNING", input.PipelineID, input.ScheduleID, input.Date, - fmt.Sprintf("pipeline %s: SLA_WARNING", input.PipelineID), reconcileDetail) + if err := publishEvent(ctx, d, "SLA_WARNING", input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: SLA_WARNING", input.PipelineID), reconcileDetail); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", "SLA_WARNING", "error", err) + } alertType = "SLA_WARNING" default: alertType = "SLA_MET" diff --git a/internal/lambda/sla_monitor_test.go b/internal/lambda/sla_monitor_test.go index c8228c1..10c969c 100644 --- a/internal/lambda/sla_monitor_test.go +++ b/internal/lambda/sla_monitor_test.go @@ -520,6 +520,46 @@ func TestSLAMonitor_Cancel_RecalculatesWhenTimesNotProvided(t *testing.T) { } } +// --------------------------------------------------------------------------- +// BUG-5 characterization: SLA_MET published when pipeline never ran +// --------------------------------------------------------------------------- + +func TestSLAMonitor_Cancel_NeverTriggered_PublishesMet(t *testing.T) { + // BUG-5 characterization: SLA_MET fires even with no trigger/job records. 
+ // Pipeline was never started — there should be no SLA verdict at all. + sched := &mockScheduler{} + eb := &mockEventBridge{} + mock := newMockDDB() + s := &store.Store{ + Client: mock, + ControlTable: testControlTable, + JobLogTable: "joblog", + RerunTable: "rerun", + } + d := &lambda.Deps{ + Store: s, + Scheduler: sched, + SchedulerGroupName: "interlock-sla", + EventBridge: eb, + EventBusName: "test-bus", + Logger: slog.Default(), + } + + // No trigger, no joblog — pipeline was never started + out, err := lambda.HandleSLAMonitor(context.Background(), d, lambda.SLAMonitorInput{ + Mode: "cancel", + PipelineID: "never-ran", + ScheduleID: "daily", + Date: "2026-03-13", + WarningAt: "2099-12-31T23:45:00Z", + BreachAt: "2099-12-31T23:59:00Z", + }) + require.NoError(t, err) + // BUG-5 fixed: AlertType still set for SFN flow, but no EventBridge event published + assert.Equal(t, "SLA_MET", out.AlertType, "AlertType should still be set for SFN state machine") + assert.Empty(t, eb.events, "no EventBridge events should be published when pipeline was never triggered") +} + // --------------------------------------------------------------------------- // Fire-alert tests // --------------------------------------------------------------------------- diff --git a/internal/lambda/stream_router.go b/internal/lambda/stream_router.go index 66bedb5..0344928 100644 --- a/internal/lambda/stream_router.go +++ b/internal/lambda/stream_router.go @@ -50,18 +50,23 @@ func getValidatedConfig(ctx context.Context, d *Deps, pipelineID string) (*types } // HandleStreamEvent processes a DynamoDB stream event, routing each record -// to the appropriate handler based on the SK prefix. Errors are logged but -// do not fail the batch (returns nil) to prevent infinite retries. -func HandleStreamEvent(ctx context.Context, d *Deps, event StreamEvent) error { +// to the appropriate handler based on the SK prefix. 
Per-record errors are +// collected as BatchItemFailures so the Lambda runtime can use DynamoDB's +// ReportBatchItemFailures to retry only the failed records. +func HandleStreamEvent(ctx context.Context, d *Deps, event StreamEvent) (events.DynamoDBEventResponse, error) { + var resp events.DynamoDBEventResponse for i := range event.Records { if err := handleRecord(ctx, d, event.Records[i]); err != nil { d.Logger.Error("stream record error", "error", err, "eventID", event.Records[i].EventID, ) + resp.BatchItemFailures = append(resp.BatchItemFailures, events.DynamoDBBatchItemFailure{ + ItemIdentifier: event.Records[i].EventID, + }) } } - return nil + return resp, nil } // handleRecord extracts PK/SK and routes to the appropriate handler. diff --git a/internal/lambda/stream_router_test.go b/internal/lambda/stream_router_test.go index 09ab196..d1b8508 100644 --- a/internal/lambda/stream_router_test.go +++ b/internal/lambda/stream_router_test.go @@ -141,7 +141,7 @@ func TestStreamRouter_SensorMatch_StartsSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -182,7 +182,7 @@ func TestStreamRouter_SensorPrefixMatch_PerPeriodKey(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -207,7 +207,7 @@ func TestStreamRouter_SensorNoMatch_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -231,7 +231,7 @@ func 
TestStreamRouter_SensorMatch_LockHeld_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -256,7 +256,7 @@ func TestStreamRouter_CalendarExcluded_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -274,7 +274,7 @@ func TestStreamRouter_NoPipelineConfig_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -301,7 +301,7 @@ func TestStreamRouter_ConfigChange_InvalidatesCache(t *testing.T) { sensorRecord := makeSensorRecord("gold-revenue", "upstream-complete", map[string]events.DynamoDBAttributeValue{ "status": events.NewStringAttribute("ready"), }) - err := lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ + _, err := lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ Records: []events.DynamoDBEventRecord{sensorRecord}, }) require.NoError(t, err) @@ -316,13 +316,13 @@ func TestStreamRouter_ConfigChange_InvalidatesCache(t *testing.T) { // Send a CONFIG change event to invalidate the cache. configRecord := makeConfigRecord("gold-revenue") - err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ + _, err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ Records: []events.DynamoDBEventRecord{configRecord}, }) require.NoError(t, err) // Now send the sensor event again — should trigger SFN with the updated config. 
- err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ + _, err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ Records: []events.DynamoDBEventRecord{sensorRecord}, }) require.NoError(t, err) @@ -401,7 +401,7 @@ func TestStreamRouter_JobFail_UnderRetryLimit_Reruns(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have started a new SFN execution for the rerun. @@ -432,7 +432,7 @@ func TestStreamRouter_JobFail_OverRetryLimit_Alerts(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution should be started. @@ -457,7 +457,7 @@ func TestStreamRouter_JobSuccess_PublishesEvent(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution for success. 
@@ -484,7 +484,7 @@ func TestStreamRouter_JobTimeout_TreatedAsFailure(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventTimeout) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -518,7 +518,7 @@ func TestStreamRouter_JobFail_DriftRerunsIgnored(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -534,7 +534,7 @@ func TestStreamRouter_JobFail_NoConfig_Skips(t *testing.T) { record := makeJobRecord("unknown-pipeline", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -559,7 +559,7 @@ func TestStreamRouter_TriggerValueMismatch_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -586,7 +586,7 @@ func TestStreamRouter_SensorMatch_RecordsFirstSensorArrival(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Verify SFN was started (lock acquired). 
@@ -628,7 +628,7 @@ func TestStreamRouter_SensorMatch_FirstArrivalIdempotent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Verify original arrival time is preserved (not overwritten). @@ -673,7 +673,7 @@ func TestStreamRouter_LateDataArrival_CompletedSuccess(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution (lock held). @@ -717,7 +717,7 @@ func TestStreamRouter_LateDataArrival_WritesRerunRequest(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have published LATE_DATA_ARRIVAL event (existing behavior). @@ -756,7 +756,7 @@ func TestStreamRouter_LateDataArrival_StillRunning_Silent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -797,7 +797,7 @@ func TestStreamRouter_LateDataArrival_CompletedFailed_Silent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No late data event — pipeline didn't succeed. 
@@ -902,6 +902,27 @@ func seedJobEvent(mock *mockDDB, timestamp, event string) { } // seedSensor inserts a sensor record with a data map into the mock control table. +// toAttributeValue converts a Go value to a DynamoDB attribute value, supporting +// nested maps for namespaced baseline format. +func toAttributeValue(v interface{}) ddbtypes.AttributeValue { + switch val := v.(type) { + case string: + return &ddbtypes.AttributeValueMemberS{Value: val} + case float64: + return &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%g", val)} + case int64: + return &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%d", val)} + case map[string]interface{}: + nested := make(map[string]ddbtypes.AttributeValue, len(val)) + for nk, nv := range val { + nested[nk] = toAttributeValue(nv) + } + return &ddbtypes.AttributeValueMemberM{Value: nested} + default: + return &ddbtypes.AttributeValueMemberS{Value: fmt.Sprintf("%v", val)} + } +} + func seedSensor(mock *mockDDB, pipelineID, sensorKey string, data map[string]interface{}) { item := map[string]ddbtypes.AttributeValue{ "PK": &ddbtypes.AttributeValueMemberS{Value: types.PipelinePK(pipelineID)}, @@ -910,14 +931,7 @@ func seedSensor(mock *mockDDB, pipelineID, sensorKey string, data map[string]int if data != nil { dataAV := make(map[string]ddbtypes.AttributeValue, len(data)) for k, v := range data { - switch val := v.(type) { - case string: - dataAV[k] = &ddbtypes.AttributeValueMemberS{Value: val} - case float64: - dataAV[k] = &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%g", val)} - case int64: - dataAV[k] = &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%d", val)} - } + dataAV[k] = toAttributeValue(v) } item["data"] = &ddbtypes.AttributeValueMemberM{Value: dataAV} } @@ -938,7 +952,7 @@ func TestStreamRouter_RerunRequest_FailedJob_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := 
lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have started a new SFN execution. @@ -968,7 +982,7 @@ func TestStreamRouter_RerunRequest_SuccessDataChanged_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Data changed — SFN should start. @@ -984,20 +998,22 @@ func TestStreamRouter_RerunRequest_SuccessDataUnchanged_Rejected(t *testing.T) { cfg := testJobConfig() seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") - // Seed a successful job event with timestamp 2000000. - seedJobEvent(mock, "2000000", types.JobEventSuccess) + // Use millis-range timestamps so epoch normalization (ts < 1e12 → ts*1000) + // does not distort the comparison. + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) - // Seed a sensor with updatedAt BEFORE the job timestamp. + // Seed a sensor with updatedAt BEFORE the job timestamp (both in millis). seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ - "updatedAt": float64(1000000), // older than job timestamp + "updatedAt": float64(1000000000000), // older than job timestamp "status": "ready", }) record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution — data unchanged. 
@@ -1026,7 +1042,7 @@ func TestStreamRouter_RerunRequest_InfraExhausted_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have started a new SFN execution. @@ -1036,6 +1052,37 @@ func TestStreamRouter_RerunRequest_InfraExhausted_Allowed(t *testing.T) { assert.Contains(t, *sfnMock.executions[0].Name, "manual-rerun") } +func TestStreamRouter_RerunRequest_SensorEpochSeconds_Normalized(t *testing.T) { + mock := newMockDDB() + d, sfnMock, _ := testDeps(mock) + + cfg := testJobConfig() + seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") + + // Seed a successful job with timestamp in millis: 2000000000000 (year ~2033). + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) + + // Seed sensor with updatedAt in SECONDS: 2000000001 (1 second after job). + // Without normalization, 2000000001 < 2000000000000 → rejected. + // With normalization, 2000000001000 > 2000000000000 → allowed. 
+ seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ + "updatedAt": float64(2000000001), // seconds epoch + "status": "ready", + }) + + record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + _ = resp + + sfnMock.mu.Lock() + defer sfnMock.mu.Unlock() + require.Len(t, sfnMock.executions, 1, "sensor with epoch-seconds updatedAt should be normalized and allow rerun") +} + // --------------------------------------------------------------------------- // handleRecord routing: unknown SK prefix // --------------------------------------------------------------------------- @@ -1056,7 +1103,7 @@ func TestStreamRouter_UnknownSKPrefix_Silent(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1072,15 +1119,18 @@ func TestStreamRouter_MissingPKOrSK_LogsError(t *testing.T) { mock := newMockDDB() d, _, _ := testDeps(mock) - // Record with no keys at all — should log error but HandleStreamEvent returns nil. + // Record with no keys at all — handleRecord returns error, collected as batch failure. 
record := events.DynamoDBEventRecord{ + EventID: "missing-keys-1", EventName: "INSERT", Change: events.DynamoDBStreamRecord{}, } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent always returns nil; errors are logged") + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err, "HandleStreamEvent never returns a top-level error") + require.Len(t, resp.BatchItemFailures, 1) + assert.Equal(t, "missing-keys-1", resp.BatchItemFailures[0].ItemIdentifier) } // --------------------------------------------------------------------------- @@ -1105,7 +1155,7 @@ func TestLateData_TriggerNil(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No late data event — GetTrigger returned nil (trigger row doesn't match COMPLETED). @@ -1139,7 +1189,7 @@ func TestSensor_NoTriggerCondition(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1160,7 +1210,7 @@ func TestSensor_SensorKeyMismatch(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1183,9 +1233,10 @@ func TestSensor_StartSFNError(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // HandleStreamEvent logs errors but always returns nil. 
- err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent swallows errors") + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1, "SFN error should produce a batch item failure") // SFN was called but failed. sfnMock.mu.Lock() @@ -1209,8 +1260,9 @@ func TestSensor_StartSFNError_ReleasesLock(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent swallows errors") + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1, "SFN error should produce a batch item failure") // The trigger lock must have been released after SFN failure. // Schedule ID for stream-triggered pipelines is "stream". 
@@ -1237,7 +1289,7 @@ func TestSensor_PerHour_DateOnly(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1262,7 +1314,7 @@ func TestSensor_PerHour_NoDate(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1290,7 +1342,7 @@ func TestRerun_NoJobRecord_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1307,7 +1359,7 @@ func TestRerun_NoConfig_Skips(t *testing.T) { record := makeDefaultRerunRequestRecord() // uses "gold-revenue" event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1337,9 +1389,10 @@ func TestRerun_ParseSKError(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // Error is logged, HandleStreamEvent returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. 
+ resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } func TestRerun_TimeoutJob_Allowed(t *testing.T) { @@ -1356,7 +1409,7 @@ func TestRerun_TimeoutJob_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1379,7 +1432,7 @@ func TestRerun_UnknownJobEvent_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1404,7 +1457,7 @@ func TestRerun_StartSFNError(t *testing.T) { event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} // Error is logged, HandleStreamEvent still returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1430,7 +1483,7 @@ func TestSensorFreshness_NoSensors(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No sensors → can't prove unchanged → allow rerun. 
@@ -1458,7 +1511,7 @@ func TestSensorFreshness_NoUpdatedAtField(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No updatedAt → can't prove unchanged → allow rerun. @@ -1486,7 +1539,7 @@ func TestSensorFreshness_FreshSensor_Float(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1500,19 +1553,21 @@ func TestSensorFreshness_StaleSensor_Float(t *testing.T) { cfg := testJobConfig() seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") - // Seed a successful job event with timestamp 2000000. - seedJobEvent(mock, "2000000", types.JobEventSuccess) + // Use millis-range timestamps so epoch normalization (ts < 1e12 → ts*1000) + // does not distort the comparison. + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) - // Seed a sensor with updatedAt as float64 < jobTimestamp. + // Seed a sensor with updatedAt as float64 < jobTimestamp (both in millis). 
seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ - "updatedAt": float64(1000000), + "updatedAt": float64(1000000000000), }) record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1544,7 +1599,7 @@ func TestSensorFreshness_FreshSensor_String(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1576,7 +1631,7 @@ func TestSensorFreshness_InvalidJobSK(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Invalid job SK → can't parse timestamp → allow to be safe. @@ -1608,7 +1663,7 @@ func TestSensorFreshness_InvalidTimestamp(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Non-numeric timestamp → allow to be safe. 
@@ -1633,7 +1688,7 @@ func TestJobLog_InfraExhaustedEvent(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventInfraTriggerExhausted) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1656,7 +1711,7 @@ func TestJobLog_OtherEvent(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventRerunAccepted) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1691,9 +1746,10 @@ func TestJobLog_ParseSKError(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // Error is logged, HandleStreamEvent returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } func TestJobLog_MissingEventAttribute(t *testing.T) { @@ -1722,7 +1778,7 @@ func TestJobLog_MissingEventAttribute(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Missing event attribute → logged as warning, no action. 
@@ -1749,7 +1805,7 @@ func TestJobSuccess_PublishesJobCompleted(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) ebMock.mu.Lock() @@ -1777,7 +1833,7 @@ func TestJobFailure_NoConfig(t *testing.T) { record := makeJobRecord("unknown-pipeline", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1811,7 +1867,7 @@ func TestBuildSFNConfig_NoPostRunFields(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1847,7 +1903,7 @@ func TestBuildSFNConfig_CustomTimings(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1879,7 +1935,7 @@ func TestBuildSFNConfig_WithSLA(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1910,7 +1966,7 @@ func TestBuildSFNConfig_JobPollWindowDefault(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, 
err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1937,7 +1993,7 @@ func TestBuildSFNConfig_JobPollWindowOverride(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1964,7 +2020,7 @@ func TestBuildSFNConfig_JobPollWindowZeroUsesDefault(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2008,7 +2064,7 @@ func TestExtractSensorData_DataMapUnwrap(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Trigger should fire because "data" map was unwrapped, exposing "status" = "ready". @@ -2030,7 +2086,7 @@ func TestExtractSensorData_NoDataMap(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2064,7 +2120,7 @@ func TestExtractSensorData_SkipsPKSKTTL(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // PK, SK, ttl should be stripped; "status" remains for trigger evaluation. 
@@ -2089,7 +2145,7 @@ func TestConvertAV_String(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2125,7 +2181,7 @@ func TestConvertAV_Number(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2161,7 +2217,7 @@ func TestConvertAV_Bool(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2198,7 +2254,7 @@ func TestConvertAV_Map(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // The data map gets unwrapped; "status" should be accessible at top level. 
@@ -2236,7 +2292,7 @@ func TestConvertAV_List(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2270,7 +2326,7 @@ func TestConvertAV_Null(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2316,7 +2372,7 @@ func TestResolveScheduleID_StreamTriggered(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2341,7 +2397,7 @@ func TestResolveScheduleID_CronTriggered(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2368,12 +2424,13 @@ func TestPublishEvent_EventBridgeError(t *testing.T) { ebMock.err = fmt.Errorf("EventBridge throttled") // JobSuccess publishes an event — if EventBridge fails, handleJobSuccess returns error, - // but HandleStreamEvent logs it and returns nil. + // collected as a batch item failure. 
record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent swallows errors") + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } func TestPublishEvent_NilEventBridge(t *testing.T) { @@ -2389,7 +2446,7 @@ func TestPublishEvent_NilEventBridge(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2406,7 +2463,7 @@ func TestPublishEvent_EmptyEventBusName(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2429,7 +2486,7 @@ func TestIsExcluded_WeekendExclusion(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2469,7 +2526,7 @@ func TestStreamRouter_MultipleRecords(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record1, record2}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2507,9 +2564,10 @@ func TestJobLog_UnexpectedPKFormat(t *testing.T) { } event := lambda.StreamEvent{Records: 
[]events.DynamoDBEventRecord{record}} - // Error is logged, HandleStreamEvent returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } // --------------------------------------------------------------------------- @@ -2537,7 +2595,7 @@ func TestRerun_UnexpectedPKFormat(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2560,7 +2618,7 @@ func TestSensor_UnexpectedPKFormat(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2623,7 +2681,7 @@ func TestRerun_DriftLimitExceeded(t *testing.T) { record := makeRerunRequestWithReason("data-drift") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN — drift limit exceeded. @@ -2651,7 +2709,7 @@ func TestRerun_ManualLimitExceeded(t *testing.T) { record := makeRerunRequestWithReason("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN — manual limit exceeded. 
@@ -2681,7 +2739,7 @@ func TestRerun_DriftUnderLimit(t *testing.T) { record := makeRerunRequestWithReason("data-drift") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN should have started — under drift limit. @@ -2706,7 +2764,7 @@ func TestRerun_LateDataCountsAsDrift(t *testing.T) { record := makeRerunRequestWithReason("data-drift") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2733,7 +2791,7 @@ func TestRerun_WritesRerunBeforeLockRelease(t *testing.T) { record := makeRerunRequestWithReason("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN should have started. @@ -2785,7 +2843,7 @@ func TestRerun_DeletesPostrunBaseline(t *testing.T) { record := makeRerunRequestWithReason("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN should have started. 
@@ -2820,7 +2878,7 @@ func TestStreamRouter_JobFail_PermanentUsesCodeRetries(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // MaxCodeRetries=0 → immediate FAILED_FINAL, no SFN started @@ -2854,7 +2912,7 @@ func TestStreamRouter_JobFail_TransientUsesMaxRetries(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // TRANSIENT uses MaxRetries=3, no reruns yet → should retry @@ -2883,7 +2941,7 @@ func TestStreamRouter_JobFail_EmptyCategoryUsesMaxRetries(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No category → uses MaxRetries=3, no reruns → should retry @@ -2979,9 +3037,9 @@ func TestPostRunSensor_Completed_DriftDetected(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) - // Seed baseline captured at completion time. + // Seed baseline captured at completion time (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) // Sensor arrives with different count → drift. 
@@ -2992,7 +3050,7 @@ func TestPostRunSensor_Completed_DriftDetected(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should publish POST_RUN_DRIFT event. @@ -3023,9 +3081,9 @@ func TestPostRunSensor_Completed_NoDrift_RulesPass(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) - // Baseline with same count as incoming sensor. + // Baseline with same count as incoming sensor (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(150), + "audit-result": map[string]interface{}{"sensor_count": float64(150)}, }) // Seed the actual sensor so EvaluateRules can find it. seedSensor(mock, "gold-revenue", "audit-result", map[string]interface{}{ @@ -3039,7 +3097,7 @@ func TestPostRunSensor_Completed_NoDrift_RulesPass(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should publish POST_RUN_PASSED event. @@ -3063,9 +3121,9 @@ func TestPostRunSensor_Running_InflightDrift(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusRunning) - // Baseline from a previous run. + // Baseline from a previous run (namespaced by rule key). 
seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) record := makeSensorRecord("gold-revenue", "audit-result", map[string]events.DynamoDBAttributeValue{ @@ -3075,7 +3133,7 @@ func TestPostRunSensor_Running_InflightDrift(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should publish informational POST_RUN_DRIFT_INFLIGHT event (no rerun). @@ -3113,7 +3171,7 @@ func TestPostRunSensor_FailedFinal_Skipped(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No post-run events should be published for FAILED_FINAL trigger. @@ -3142,7 +3200,7 @@ func TestPostRunSensor_NoTrigger_Skipped(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No events published when no trigger exists. @@ -3166,7 +3224,7 @@ func TestPostRunSensor_NoPostRunConfig_GoesToTrigger(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No error, just silently ignored. 
} @@ -3269,7 +3327,7 @@ func TestJobFailure_AtomicLockReset_Success(t *testing.T) { record := makeJobRecordWithScheduleDate(pipeline, types.JobEventFail, schedule, date) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN must have started for the rerun. @@ -3303,7 +3361,7 @@ func TestJobFailure_LockResetFails_NoSFN(t *testing.T) { record := makeJobRecordWithScheduleDate(pipeline, types.JobEventFail, schedule, date) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3326,7 +3384,7 @@ func TestRerunRequest_AtomicLockReset(t *testing.T) { record := makeRerunRequestRecordFull("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN must have started. @@ -3353,7 +3411,7 @@ func TestRerunRequest_LockResetFails_PublishesInfraFailure(t *testing.T) { record := makeRerunRequestRecordFull("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution. @@ -3395,9 +3453,10 @@ func TestJobFailure_SFNStartFails_ReleasesLock(t *testing.T) { record := makeJobRecordWithScheduleDate(pipeline, types.JobEventFail, schedule, date) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // HandleStreamEvent swallows per-record errors — the handler returns nil. 
- err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) // Trigger lock must be released after SFN failure (so next attempt can acquire it). assert.False(t, triggerLockExists(mock), @@ -3420,7 +3479,7 @@ func TestRerunRequest_SFNStartFails_ReleasesLock(t *testing.T) { record := makeRerunRequestRecordFull("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) assert.False(t, triggerLockExists(mock), @@ -3557,7 +3616,7 @@ func TestRerunRequest_CalendarExclusion(t *testing.T) { record := makeDefaultRerunRequestRecord() // schedule=stream, date=2026-03-01 event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3581,7 +3640,7 @@ func TestRerunRequest_CalendarExclusion_WritesJobEvent(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) mock.mu.Lock() @@ -3622,7 +3681,7 @@ func TestRerunRequest_WeekendExclusion(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3646,7 +3705,7 @@ func TestJobFailure_CalendarExclusion(t *testing.T) { record := 
makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3673,7 +3732,7 @@ func TestJobFailure_CalendarExclusion_RetryLimitBeatsExclusion(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3697,7 +3756,7 @@ func TestPostRunDrift_CalendarExclusion(t *testing.T) { seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) record := makeSensorRecord("gold-revenue", "audit-result", map[string]events.DynamoDBAttributeValue{ @@ -3707,7 +3766,7 @@ func TestPostRunDrift_CalendarExclusion(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) ebMock.mu.Lock() @@ -3747,7 +3806,7 @@ func TestPostRunDrift_NotExcluded_WritesRerun(t *testing.T) { seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) record := makeSensorRecord("gold-revenue", "audit-result", map[string]events.DynamoDBAttributeValue{ @@ -3757,7 +3816,7 
@@ func TestPostRunDrift_NotExcluded_WritesRerun(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) rerunKey := ddbItemKey(testControlTable, types.PipelinePK("gold-revenue"), types.RerunRequestSK("stream", "2026-03-01")) @@ -3781,7 +3840,7 @@ func TestSensorEvent_CalendarExclusion_PublishesEvent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3855,7 +3914,7 @@ func TestHandleSensorEvent_DryRun_WouldTrigger(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // NO SFN execution must be started. @@ -3895,7 +3954,7 @@ func TestHandleSensorEvent_DryRun_LateData(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // NO SFN execution. 
@@ -3933,7 +3992,7 @@ func TestHandleSensorEvent_DryRun_SLAProjection_Met(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3988,7 +4047,7 @@ func TestHandleSensorEvent_DryRun_SLAProjection_Breach(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4042,7 +4101,7 @@ func TestHandleSensorEvent_DryRun_ValidationNotReady(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN. 
@@ -4093,7 +4152,7 @@ func TestHandleSensorEvent_DryRun_CapturesBaseline(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4135,7 +4194,7 @@ func TestHandleSensorEvent_DryRun_Completed_NoSLA(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4188,7 +4247,7 @@ func TestHandleSensorEvent_DryRun_Completed_WithSLA(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4243,9 +4302,9 @@ func TestDryRunPostRunSensor_DriftDetected(t *testing.T) { // Pre-seed DRY_RUN# marker (would-trigger already happened). seedDryRunMarker(mock, "gold-revenue", "stream", fixedTestDate, "2026-03-11T01:15:00Z") - // Pre-seed baseline with sensor_count=500. + // Pre-seed baseline with sensor_count=500 (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#"+fixedTestDate, map[string]interface{}{ - "sensor_count": float64(500), + "audit-result": map[string]interface{}{"sensor_count": float64(500)}, }) // Sensor arrives for post-run key with sensor_count=520 (drift detected). 
@@ -4257,7 +4316,7 @@ func TestDryRunPostRunSensor_DriftDetected(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // DRY_RUN_DRIFT event published. @@ -4297,9 +4356,9 @@ func TestDryRunPostRunSensor_NoDrift(t *testing.T) { // Pre-seed DRY_RUN# marker. seedDryRunMarker(mock, "gold-revenue", "stream", fixedTestDate, "2026-03-11T01:15:00Z") - // Baseline with sensor_count=500. + // Baseline with sensor_count=500 (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#"+fixedTestDate, map[string]interface{}{ - "sensor_count": float64(500), + "audit-result": map[string]interface{}{"sensor_count": float64(500)}, }) // Sensor arrives with same sensor_count=500 — no drift. @@ -4311,7 +4370,7 @@ func TestDryRunPostRunSensor_NoDrift(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No events published (no drift). @@ -4345,7 +4404,7 @@ func TestDryRunPostRunSensor_NoMarker(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No events published (no marker means no trigger happened). 
@@ -4359,7 +4418,7 @@ func TestDryRunPostRunSensor_NoMarker(t *testing.T) { func TestRerun_DryRun_SkipsExecution(t *testing.T) { mock := newMockDDB() - d, sfnMock, _ := testDeps(mock) + d, sfnMock, ebMock := testDeps(mock) cfg := testDryRunConfig() seedConfig(mock, cfg) @@ -4371,23 +4430,185 @@ func TestRerun_DryRun_SkipsExecution(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Dry-run pipeline must NOT start an SFN execution. sfnMock.mu.Lock() - defer sfnMock.mu.Unlock() assert.Empty(t, sfnMock.executions, "dry-run pipeline must not start SFN on rerun request") + sfnMock.mu.Unlock() - // No rerun records written (guard fires before any store side effects). + // No rerun records written. count, countErr := d.Store.CountRerunsBySource(context.Background(), "gold-revenue", "stream", "2026-03-01", []string{"manual"}) require.NoError(t, countErr) assert.Zero(t, count, "dry-run must not write rerun records") + + // Must publish DRY_RUN_WOULD_RERUN with circuitBreaker and budget fields. 
+ evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunWouldRerun), "expected DRY_RUN_WOULD_RERUN event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunWouldRerun) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Contains(t, detail.Detail, "circuitBreaker") + assert.Contains(t, detail.Detail, "budget") + } + } + } +} + +func TestRerun_DryRun_CalendarExcluded(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Schedule.Exclude = &types.ExclusionConfig{Dates: []string{"2026-03-01"}} + seedConfig(mock, cfg) + + record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRerunRejected), "expected DRY_RUN_RERUN_REJECTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRerunRejected) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "excluded by calendar", detail.Detail["reason"]) + } + } + } +} + +func TestRerun_DryRun_LimitExceeded(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Job.MaxManualReruns = intPtr(0) + seedConfig(mock, cfg) + + record := makeDefaultRerunRequestRecord() + event := 
lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRerunRejected), "expected DRY_RUN_RERUN_REJECTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRerunRejected) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "limit exceeded", detail.Detail["reason"]) + } + } + } +} + +func TestRerun_DryRun_CircuitBreakerReject(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + seedConfig(mock, cfg) + + // Seed a successful job with a millis-epoch timestamp. + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) + + // Seed sensors with timestamps OLDER than the job — data unchanged. 
+ seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ + "status": "ready", + "updatedAt": float64(1000000000000), + }) + + record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRerunRejected), "expected DRY_RUN_RERUN_REJECTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRerunRejected) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "circuit breaker", detail.Detail["reason"]) + assert.Equal(t, "rejected", detail.Detail["circuitBreaker"]) + } + } + } +} + +func TestRerun_DryRun_NoJobHistory(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + seedConfig(mock, cfg) + + // No JOB# events seeded — circuit breaker should report "skipped". 
+ record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunWouldRerun), "expected DRY_RUN_WOULD_RERUN event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunWouldRerun) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "skipped (no job history)", detail.Detail["circuitBreaker"]) + } + } + } } func TestJobFailure_DryRun_SkipsRerun(t *testing.T) { mock := newMockDDB() - d, sfnMock, _ := testDeps(mock) + d, sfnMock, ebMock := testDeps(mock) cfg := testDryRunConfig() cfg.Job.MaxRetries = 2 @@ -4398,16 +4619,120 @@ func TestJobFailure_DryRun_SkipsRerun(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Dry-run pipeline must NOT start an SFN execution. sfnMock.mu.Lock() - defer sfnMock.mu.Unlock() assert.Empty(t, sfnMock.executions, "dry-run pipeline must not start SFN on job failure") + sfnMock.mu.Unlock() - // No rerun records written (guard fires before any store side effects). + // No rerun records written. 
count, countErr := d.Store.CountRerunsBySource(context.Background(), "gold-revenue", "stream", "2026-03-01", []string{"job-fail-retry"}) require.NoError(t, countErr) assert.Zero(t, count, "dry-run must not write rerun records on job failure") + + // Must publish DRY_RUN_WOULD_RETRY with retries and maxRetries fields. + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunWouldRetry), "expected DRY_RUN_WOULD_RETRY event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunWouldRetry) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Contains(t, detail.Detail, "retries") + assert.Contains(t, detail.Detail, "maxRetries") + } + } + } +} + +func TestJobFailure_DryRun_RetryExhausted(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Job.MaxRetries = 0 + seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") + + record := makeJobRecord("gold-revenue", types.JobEventFail) + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRetryExhausted), "expected DRY_RUN_RETRY_EXHAUSTED event") +} + +func TestJobFailure_DryRun_CalendarExcluded(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Job.MaxRetries = 2 + cfg.Schedule.Exclude = &types.ExclusionConfig{Dates: []string{"2026-03-01"}} + seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") + + 
record := makeJobRecord("gold-revenue", types.JobEventFail) + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRetryExhausted), "expected DRY_RUN_RETRY_EXHAUSTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRetryExhausted) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "excluded by calendar", detail.Detail["reason"]) + } + } + } +} + +// --------------------------------------------------------------------------- +// BatchItemFailures: partial error reporting +// --------------------------------------------------------------------------- + +func TestStreamRouter_BatchItemFailures_PartialError(t *testing.T) { + mock := newMockDDB() + d, _, _ := testDeps(mock) + + // Build an event with one valid record and one with empty PK (will error). 
+ validRecord := makeSensorRecord("gold-revenue", "upstream-complete", map[string]events.DynamoDBAttributeValue{ + "status": events.NewStringAttribute("ready"), + }) + + invalidRecord := events.DynamoDBEventRecord{ + EventID: "bad-record-123", + EventName: "INSERT", + Change: events.DynamoDBStreamRecord{ + Keys: map[string]events.DynamoDBAttributeValue{}, + }, + } + + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{invalidRecord, validRecord}} + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) + assert.Equal(t, "bad-record-123", resp.BatchItemFailures[0].ItemIdentifier) } diff --git a/internal/lambda/watchdog.go b/internal/lambda/watchdog.go index f6b65cf..4d0658c 100644 --- a/internal/lambda/watchdog.go +++ b/internal/lambda/watchdog.go @@ -1,18 +1,6 @@ package lambda -import ( - "context" - "errors" - "fmt" - "strconv" - "strings" - "time" - - schedulerTypes "github.com/aws/aws-sdk-go-v2/service/scheduler/types" - - "github.com/dwsmith1983/interlock/internal/validation" - "github.com/dwsmith1983/interlock/pkg/types" -) +import "context" // HandleWatchdog runs periodic health checks. It detects stale trigger // executions (Step Function timeouts) and missed cron schedules. Errors from @@ -44,1063 +32,3 @@ func HandleWatchdog(ctx context.Context, d *Deps) error { } return nil } - -// detectStaleTriggers scans for TRIGGER# rows with status=RUNNING and -// publishes an SFN_TIMEOUT event for any that have exceeded their TTL or the -// staleTriggerThreshold. Stale triggers are moved to FAILED_FINAL status. 
-func detectStaleTriggers(ctx context.Context, d *Deps) error { - triggers, err := d.Store.ScanRunningTriggers(ctx) - if err != nil { - return fmt.Errorf("scan running triggers: %w", err) - } - - now := d.now() - for _, tr := range triggers { - if !isStaleTrigger(tr, now) { - continue - } - - pipelineID, schedule, date, err := parseTriggerRecord(tr) - if err != nil { - d.Logger.Warn("skipping unparseable trigger", "pk", tr.PK, "sk", tr.SK, "error", err) - continue - } - - // Dry-run pipelines should never have TRIGGER# rows, but guard - // against stale rows from pre-dry-run migrations or bugs. - if cfg, cfgErr := d.ConfigCache.Get(ctx, pipelineID); cfgErr == nil && cfg != nil && cfg.DryRun { - continue - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "actionHint": "step function exceeded TTL — check SFN execution history", - } - if tr.TTL > 0 { - alertDetail["ttlExpired"] = time.Unix(tr.TTL, 0).UTC().Format(time.RFC3339) - } - if err := publishEvent(ctx, d, string(types.EventSFNTimeout), pipelineID, schedule, date, - fmt.Sprintf("step function timed out for %s/%s/%s", pipelineID, schedule, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish SFN timeout event", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) - } - - if err := d.Store.SetTriggerStatus(ctx, pipelineID, schedule, date, types.TriggerStatusFailedFinal); err != nil { - d.Logger.Error("failed to set trigger status to FAILED_FINAL", - "pipelineId", pipelineID, "schedule", schedule, "date", date, "error", err) - continue - } - - d.Logger.Info("detected stale trigger", - "pipelineId", pipelineID, - "schedule", schedule, - "date", date, - ) - } - return nil -} - -// isStaleTrigger returns true if the trigger's TTL has expired or if the TTL -// is zero and the trigger has been running longer than staleTriggerThreshold. 
-func isStaleTrigger(tr types.ControlRecord, now time.Time) bool { - if tr.TTL > 0 { - return now.Unix() > tr.TTL - } - // No TTL set — treat as stale if it has existed for longer than the threshold. - // Without a creation timestamp we can't be precise, so we conservatively - // consider it stale only when TTL is explicitly expired. - return false -} - -// parseTriggerRecord extracts pipeline ID, schedule, and date from a trigger -// ControlRecord's PK and SK. -// PK format: PIPELINE# -// SK format: TRIGGER## -func parseTriggerRecord(tr types.ControlRecord) (pipelineID, schedule, date string, err error) { - const pkPrefix = "PIPELINE#" - if !strings.HasPrefix(tr.PK, pkPrefix) { - return "", "", "", fmt.Errorf("unexpected PK format: %q", tr.PK) - } - pipelineID = tr.PK[len(pkPrefix):] - - const skPrefix = "TRIGGER#" - trimmed := strings.TrimPrefix(tr.SK, skPrefix) - if trimmed == tr.SK { - return "", "", "", fmt.Errorf("unexpected SK format: %q", tr.SK) - } - parts := strings.SplitN(trimmed, "#", 2) - if len(parts) != 2 { - return "", "", "", fmt.Errorf("invalid TRIGGER SK format: %q", tr.SK) - } - return pipelineID, parts[0], parts[1], nil -} - -// reconcileSensorTriggers re-evaluates trigger conditions for sensor-triggered -// pipelines. If a sensor meets the trigger condition but no trigger lock exists, -// the watchdog acquires the lock, starts the SFN, and publishes TRIGGER_RECOVERED. -// This self-heals missed triggers caused by silent completion-write failures. -func reconcileSensorTriggers(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - trigger := cfg.Schedule.Trigger - if trigger == nil || cfg.Schedule.Cron != "" { - continue - } - - // Dry-run pipelines are observation-only — skip reconciliation. 
- if cfg.DryRun { - continue - } - - if isExcluded(cfg, now) { - continue - } - - sensors, err := d.Store.GetAllSensors(ctx, id) - if err != nil { - d.Logger.Error("failed to get sensors for reconciliation", - "pipelineId", id, "error", err) - continue - } - - scheduleID := resolveScheduleID(cfg) - - for sensorKey, sensorData := range sensors { - if !strings.HasPrefix(sensorKey, trigger.Key) { - continue - } - - rule := types.ValidationRule{ - Key: trigger.Key, - Check: trigger.Check, - Field: trigger.Field, - Value: trigger.Value, - } - result := validation.EvaluateRule(rule, sensorData, now) - if !result.Passed { - continue - } - - date := ResolveExecutionDate(sensorData, now) - - found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) - if err != nil { - d.Logger.Error("trigger check failed during reconciliation", - "pipelineId", id, "date", date, "error", err) - continue - } - if found { - continue - } - - // Guard against re-triggering completed pipelines whose trigger - // record was deleted by DynamoDB TTL. Check the joblog for a - // terminal event before acquiring a new lock. 
- if isJobTerminal(ctx, d, id, scheduleID, date) { - continue - } - - acquired, err := d.Store.AcquireTriggerLock(ctx, id, scheduleID, date, ResolveTriggerLockTTL()) - if err != nil { - d.Logger.Error("lock acquisition failed during reconciliation", - "pipelineId", id, "date", date, "error", err) - continue - } - if !acquired { - continue - } - - if err := startSFN(ctx, d, cfg, id, scheduleID, date); err != nil { - if relErr := d.Store.ReleaseTriggerLock(ctx, id, scheduleID, date); relErr != nil { - d.Logger.Warn("failed to release lock after SFN start failure during reconciliation", "error", relErr) - } - d.Logger.Error("SFN start failed during reconciliation", - "pipelineId", id, "date", date, "error", err) - continue - } - - alertDetail := map[string]interface{}{ - "source": "reconciliation", - "actionHint": "watchdog recovered missed sensor trigger", - } - if err := publishEvent(ctx, d, string(types.EventTriggerRecovered), id, scheduleID, date, - fmt.Sprintf("trigger recovered for %s/%s/%s", id, scheduleID, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish trigger recovered event", "error", err, "pipeline", id, "schedule", scheduleID, "date", date) - } - - d.Logger.Info("recovered missed trigger", - "pipelineId", id, - "schedule", scheduleID, - "date", date, - ) - } - } - return nil -} - -// lastCronFire returns the most recent expected fire time for a cron expression. -// Supports the minute-hour patterns used by this system: "MM * * * *" (hourly) -// and "MM HH * * *" (daily). Returns zero time for unsupported patterns. -func lastCronFire(cron string, now time.Time, loc *time.Location) time.Time { - fields := strings.Fields(cron) - if len(fields) < 5 { - return time.Time{} - } - minute, err := strconv.Atoi(fields[0]) - if err != nil { - return time.Time{} - } - localNow := now.In(loc) - - if fields[1] == "*" { - // Hourly: fires at :MM every hour. 
- candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), - localNow.Hour(), minute, 0, 0, loc) - if candidate.After(localNow) { - candidate = candidate.Add(-time.Hour) - } - return candidate - } - - hour, err := strconv.Atoi(fields[1]) - if err != nil { - return time.Time{} - } - // Daily: fires at HH:MM every day. - candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), - hour, minute, 0, 0, loc) - if candidate.After(localNow) { - candidate = candidate.Add(-24 * time.Hour) - } - return candidate -} - -// detectMissedSchedules checks all cron-scheduled pipelines to see if today's -// trigger is missing. If a pipeline should have started by now but has no -// TRIGGER# row, a SCHEDULE_MISSED event is published. -func detectMissedSchedules(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - today := now.Format("2006-01-02") - - for id, cfg := range configs { - // Only check cron-scheduled pipelines. - if cfg.Schedule.Cron == "" { - continue - } - - // Dry-run pipelines are observation-only — skip missed schedule detection. - if cfg.DryRun { - continue - } - - // Skip calendar-excluded days. - if isExcluded(cfg, now) { - continue - } - - // Only alert for schedules that should have fired after this Lambda - // started. Prevents retroactive alerts after fresh deploys. - if !d.StartedAt.IsZero() { - loc := resolveTimezone(cfg.Schedule.Timezone) - if lastFire := lastCronFire(cfg.Schedule.Cron, now, loc); !lastFire.IsZero() && lastFire.Before(d.StartedAt) { - continue - } - } - - // Resolve schedule ID for cron pipelines. - scheduleID := resolveScheduleID(cfg) - - // Check if any TRIGGER# row exists for today (covers both daily - // and per-hour trigger rows, e.g. "2026-03-04" and "2026-03-04T00"). 
- found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, today) - if err != nil { - d.Logger.Error("failed to check trigger for missed schedule", - "pipelineId", id, "error", err) - continue - } - if found { - continue - } - - // Check if we are past the expected start time. If the pipeline - // has a schedule time configured, only alert after that time. - if cfg.Schedule.Time != "" { - loc := resolveTimezone(cfg.Schedule.Timezone) - localNow := now.In(loc) - expectedStart, err := time.ParseInLocation("2006-01-02 15:04", today+" "+cfg.Schedule.Time, loc) - if err == nil && localNow.Before(expectedStart) { - continue // not yet past expected start time - } - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "cron": cfg.Schedule.Cron, - "actionHint": fmt.Sprintf("cron %s expected to fire — no trigger found", cfg.Schedule.Cron), - } - if cfg.Schedule.Time != "" { - alertDetail["expectedTime"] = cfg.Schedule.Time - } - if err := publishEvent(ctx, d, string(types.EventScheduleMissed), id, scheduleID, today, - fmt.Sprintf("missed schedule for %s on %s", id, today), alertDetail); err != nil { - d.Logger.Warn("failed to publish missed schedule event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) - } - - d.Logger.Info("detected missed schedule", - "pipelineId", id, - "schedule", scheduleID, - "date", today, - ) - } - return nil -} - -// detectMissedInclusionSchedules checks pipelines with inclusion calendar config -// for missed schedules on irregular dates. For each pipeline with an Include -// config, it finds all past inclusion dates (capped at maxInclusionLookback) -// and verifies that a trigger exists for each. If no trigger is found and no -// dedup marker exists, an IRREGULAR_SCHEDULE_MISSED event is published. 
-func detectMissedInclusionSchedules(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - if cfg.Schedule.Include == nil || len(cfg.Schedule.Include.Dates) == 0 { - continue - } - - // Dry-run pipelines are observation-only — skip inclusion schedule detection. - if cfg.DryRun { - continue - } - - // Skip calendar-excluded days. - if isExcluded(cfg, now) { - continue - } - - pastDates := PastInclusionDates(cfg.Schedule.Include.Dates, now) - if len(pastDates) == 0 { - continue - } - - scheduleID := resolveScheduleID(cfg) - - // Resolve today in the pipeline's timezone so the grace-period - // guard fires correctly when UTC date != pipeline-local date. - tzLoc := resolveTimezone(cfg.Schedule.Timezone) - today := now.In(tzLoc).Format("2006-01-02") - - for _, date := range pastDates { - // If the inclusion date is today and the pipeline has a - // Schedule.Time, only alert after that time has passed. - // This mirrors the same check in detectMissedSchedules for - // cron pipelines to avoid false-positive alerts before the - // expected start time. Past dates are not gated because - // their Schedule.Time has necessarily already elapsed. - if cfg.Schedule.Time != "" && date == today { - localNow := now.In(tzLoc) - expectedStart, err := time.ParseInLocation("2006-01-02 15:04", date+" "+cfg.Schedule.Time, tzLoc) - if err == nil && localNow.Before(expectedStart) { - continue // not yet past expected start time - } - } - - // Check if a trigger exists for this inclusion date. - found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) - if err != nil { - d.Logger.Error("failed to check trigger for inclusion schedule", - "pipelineId", id, "date", date, "error", err) - continue - } - if found { - continue - } - - // Check dedup marker to avoid re-alerting on subsequent watchdog runs. 
- dedupKey := "irregular-missed-check#" + date - dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) - if err != nil { - d.Logger.Error("dedup marker lookup failed for inclusion schedule", - "pipelineId", id, "date", date, "error", err) - continue - } - if dedupData != nil { - continue - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "actionHint": fmt.Sprintf("inclusion date %s expected to have a trigger — none found", date), - } - if err := publishEvent(ctx, d, string(types.EventIrregularScheduleMissed), id, scheduleID, date, - fmt.Sprintf("missed inclusion schedule for %s on %s", id, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish irregular schedule missed event", "error", err, "pipeline", id, "date", date) - } - - // Write dedup marker. - if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ - "alerted": "true", - }); err != nil { - d.Logger.Warn("failed to write inclusion dedup marker", "error", err, "pipeline", id, "date", date) - } - - d.Logger.Info("detected missed inclusion schedule", - "pipelineId", id, - "schedule", scheduleID, - "date", date, - ) - } - } - return nil -} - -// scheduleSLAAlerts proactively creates EventBridge Scheduler entries for all -// pipelines with SLA configs. This ensures warnings/breaches fire even when -// pipelines never trigger (data never arrives, sensor fails, etc.). -// Idempotency: deterministic scheduler names; ConflictException = already exists. -func scheduleSLAAlerts(ctx context.Context, d *Deps) error { - if d.Scheduler == nil { - return nil - } - - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - if cfg.SLA == nil { - continue - } - - // Dry-run pipelines are observation-only — skip SLA scheduling. 
- if cfg.DryRun { - continue - } - - if isExcluded(cfg, now) { - continue - } - - scheduleID := resolveScheduleID(cfg) - date := resolveWatchdogSLADate(cfg, now) - - // Sensor-triggered daily pipelines run T+1: data for today completes - // tomorrow, so the SLA deadline is relative to tomorrow's date. - // Only slaDate is shifted; the original date is kept for schedule - // naming, trigger lookup, and fire-alert payload so cancellation - // stays consistent with the SFN's view of the pipeline. - slaDate := date - if cfg.Schedule.Cron == "" && !strings.HasPrefix(cfg.SLA.Deadline, ":") { - t, err := time.Parse("2006-01-02", date) - if err == nil { - slaDate = t.AddDate(0, 0, 1).Format("2006-01-02") - } - } - - // Skip if pipeline already completed or permanently failed for this date. - tr, err := d.Store.GetTrigger(ctx, id, scheduleID, date) - switch { - case err != nil: - d.Logger.Warn("trigger lookup failed in SLA scheduling", "pipelineId", id, "error", err) - continue - case tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal): - continue - case isJobTerminal(ctx, d, id, scheduleID, date): - continue - } - - calc, err := handleSLACalculate(SLAMonitorInput{ - Mode: "calculate", - PipelineID: id, - ScheduleID: scheduleID, - Date: slaDate, - Deadline: cfg.SLA.Deadline, - ExpectedDuration: cfg.SLA.ExpectedDuration, - Timezone: cfg.SLA.Timezone, - }, now) - if err != nil { - d.Logger.Error("SLA calculate failed", "pipelineId", id, "error", err) - continue - } - - breachAt, _ := time.Parse(time.RFC3339, calc.BreachAt) - if breachAt.IsZero() || breachAt.After(now) { - // SLA breach is in the future — create schedules. 
- var scheduleErr bool - for _, alert := range []struct { - suffix string - alertType string - timestamp string - }{ - {"warning", "SLA_WARNING", calc.WarningAt}, - {"breach", "SLA_BREACH", calc.BreachAt}, - } { - name := slaScheduleName(id, scheduleID, date, alert.suffix) - payload := SLAMonitorInput{ - Mode: "fire-alert", - PipelineID: id, - ScheduleID: scheduleID, - Date: date, - AlertType: alert.alertType, - } - if alert.alertType == "SLA_WARNING" { - payload.BreachAt = calc.BreachAt - } - if err := createOneTimeSchedule(ctx, d, name, alert.timestamp, payload); err != nil { - var conflict *schedulerTypes.ConflictException - if errors.As(err, &conflict) { - continue - } - d.Logger.Error("create SLA schedule failed", - "pipelineId", id, "suffix", alert.suffix, "error", err) - scheduleErr = true - } - } - - if !scheduleErr { - d.Logger.Info("proactive SLA schedules ensured", - "pipelineId", id, - "date", date, - "warningAt", calc.WarningAt, - "breachAt", calc.BreachAt, - ) - } - } - } - return nil -} - -// checkTriggerDeadlines evaluates trigger deadlines independently of SLA -// configuration. Pipelines with a Trigger.Deadline but no SLA config are -// checked here. For each pipeline, if the trigger deadline has passed and -// no trigger exists, the sensor trigger window is closed. -func checkTriggerDeadlines(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - if cfg.Schedule.Trigger == nil || cfg.Schedule.Trigger.Deadline == "" { - continue - } - - // Dry-run pipelines are observation-only — skip trigger deadline checks. 
- if cfg.DryRun { - continue - } - - if isExcluded(cfg, now) { - continue - } - - scheduleID := resolveScheduleID(cfg) - triggerDate := resolveTriggerDeadlineDate(cfg, now) - - triggerRec, err := d.Store.GetTrigger(ctx, id, scheduleID, triggerDate) - if err != nil { - d.Logger.Warn("trigger lookup failed in deadline check", "pipelineId", id, "error", err) - continue - } - if triggerRec != nil { - continue - } - - if isJobTerminal(ctx, d, id, scheduleID, triggerDate) { - continue - } - - closeSensorTriggerWindow(ctx, d, id, scheduleID, triggerDate, cfg, now) - } - return nil -} - -// resolveWatchdogSLADate determines the execution date for SLA scheduling. -// - Hourly pipelines (relative deadline like ":30"): previous hour composite -// date, e.g. "2026-03-05T13" when the clock is 14:xx. -// - Daily pipelines (absolute deadline like "02:00"): today's date, -// so handleSLACalculate rolls the deadline forward to the next occurrence. -func resolveWatchdogSLADate(cfg *types.PipelineConfig, now time.Time) string { - if strings.HasPrefix(cfg.SLA.Deadline, ":") { - prev := now.Add(-time.Hour) - return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) - } - return now.Format("2006-01-02") -} - -// resolveTriggerDeadlineDate determines the execution date for trigger -// deadline evaluation. Uses the trigger deadline format (not SLA deadline) -// to decide between hourly composite date and daily date. -func resolveTriggerDeadlineDate(cfg *types.PipelineConfig, now time.Time) string { - if strings.HasPrefix(cfg.Schedule.Trigger.Deadline, ":") { - prev := now.Add(-time.Hour) - return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) - } - return now.Format("2006-01-02") -} - -// resolveTriggerDeadlineTime computes the absolute time when the trigger -// window closes for the given deadline string and execution date. 
-// -// For relative (hourly) deadlines like ":45" with composite date "2026-03-09T13": -// - Data for hour 13 is processed in hour 14 -// - The deadline resolves to 2026-03-09T14:45:00 in the configured timezone -// -// For absolute (daily) deadlines like "09:00" with date "2026-03-09": -// - The deadline resolves to 2026-03-09T09:00:00 in the configured timezone -// -// Unlike handleSLACalculate, this does NOT roll forward when the time is past. -// Returns zero time on parse errors. -func resolveTriggerDeadlineTime(deadline, date, timezone string) time.Time { - loc := resolveTimezone(timezone) - - if strings.HasPrefix(deadline, ":") { - // Relative (hourly): ":MM" — deadline is in the NEXT hour after the - // composite date's hour, since data for hour H is processed in hour H+1. - minute, err := strconv.Atoi(strings.TrimPrefix(deadline, ":")) - if err != nil { - return time.Time{} - } - // Parse composite date "YYYY-MM-DDThh". - if len(date) < 13 || date[10] != 'T' { - return time.Time{} - } - t, err := time.ParseInLocation("2006-01-02T15", date, loc) - if err != nil { - return time.Time{} - } - // Add 1 hour for the processing window, then set the minute. - return time.Date(t.Year(), t.Month(), t.Day(), t.Hour()+1, minute, 0, 0, loc) - } - - // Absolute (daily): "HH:MM". - parts := strings.SplitN(deadline, ":", 2) - if len(parts) != 2 { - return time.Time{} - } - hour, err := strconv.Atoi(parts[0]) - if err != nil { - return time.Time{} - } - minute, err := strconv.Atoi(parts[1]) - if err != nil { - return time.Time{} - } - t, err := time.ParseInLocation("2006-01-02", date, loc) - if err != nil { - return time.Time{} - } - return time.Date(t.Year(), t.Month(), t.Day(), hour, minute, 0, 0, loc) -} - -// closeSensorTriggerWindow checks whether the trigger deadline has passed for -// a sensor-triggered pipeline that never started. 
If expired, it writes a -// FAILED_FINAL trigger record (blocking future auto-triggers) and publishes -// a SENSOR_DEADLINE_EXPIRED event. A human can still restart via RERUN_REQUEST. -func closeSensorTriggerWindow(ctx context.Context, d *Deps, pipelineID, scheduleID, date string, cfg *types.PipelineConfig, now time.Time) { - // Compute the absolute trigger deadline time directly — we do NOT use - // handleSLACalculate here because it rolls daily deadlines forward 24h - // when past, which defeats the purpose of checking for expiry. - tz := "" - if cfg.SLA != nil { - tz = cfg.SLA.Timezone - } - triggerDeadline := resolveTriggerDeadlineTime(cfg.Schedule.Trigger.Deadline, date, tz) - if triggerDeadline.IsZero() || triggerDeadline.After(now) { - return - } - - // Use conditional put to avoid overwriting a trigger that was acquired - // between the GetTrigger read and this write (TOCTOU protection). - created, err := d.Store.CreateTriggerIfAbsent(ctx, pipelineID, scheduleID, date, types.TriggerStatusFailedFinal) - if err != nil { - d.Logger.Error("failed to write FAILED_FINAL for expired trigger deadline", - "pipelineId", pipelineID, "schedule", scheduleID, "date", date, "error", err) - return - } - if !created { - // Trigger row appeared since the read — pipeline started, don't interfere. 
- d.Logger.Info("trigger appeared during deadline check, skipping window close", - "pipelineId", pipelineID, "schedule", scheduleID, "date", date) - return - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "triggerDeadline": cfg.Schedule.Trigger.Deadline, - "actionHint": "auto-trigger window closed — use RERUN_REQUEST to restart", - } - if err := publishEvent(ctx, d, string(types.EventSensorDeadlineExpired), pipelineID, scheduleID, date, - fmt.Sprintf("trigger deadline expired for %s/%s/%s", pipelineID, scheduleID, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish sensor deadline expired event", "error", err, "pipeline", pipelineID) - } - - d.Logger.Info("sensor trigger window closed", - "pipelineId", pipelineID, - "schedule", scheduleID, - "date", date, - "triggerDeadline", cfg.Schedule.Trigger.Deadline, - ) -} - -// defaultSensorTimeout is the default grace period for post-run sensors to -// arrive after a pipeline completes. If no SensorTimeout is configured in -// PostRunConfig, this value is used. -const defaultSensorTimeout = 2 * time.Hour - -// detectMissingPostRunSensors checks pipelines with PostRun config for missing -// post-run sensor data. If a pipeline completed (COMPLETED trigger + baseline -// exists) but no post-run sensor matching a rule key has been updated since -// completion, and the SensorTimeout grace period has elapsed, a -// POST_RUN_SENSOR_MISSING event is published. -func detectMissingPostRunSensors(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - today := now.Format("2006-01-02") - - for id, cfg := range configs { - if cfg.PostRun == nil || len(cfg.PostRun.Rules) == 0 { - continue - } - - // Dry-run pipelines are observation-only — skip post-run sensor checks. 
- if cfg.DryRun { - continue - } - - scheduleID := resolveScheduleID(cfg) - - // Only check pipelines with a COMPLETED trigger for today. - tr, err := d.Store.GetTrigger(ctx, id, scheduleID, today) - if err != nil { - d.Logger.Error("trigger lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - if tr == nil || tr.Status != types.TriggerStatusCompleted { - continue - } - - // Baseline must exist — it signals that capturePostRunBaseline ran - // at completion time. - baselineKey := "postrun-baseline#" + today - baseline, err := d.Store.GetSensorData(ctx, id, baselineKey) - if err != nil { - d.Logger.Error("baseline lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - if baseline == nil { - continue - } - - // Dedup: skip if we already published an alert for this date. - dedupKey := "postrun-check#" + today - dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) - if err != nil { - d.Logger.Error("dedup marker lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - if dedupData != nil { - continue - } - - // Determine the completion timestamp from the latest success job event. - completionTime, err := resolveCompletionTime(ctx, d, id, scheduleID, today) - if err != nil { - d.Logger.Error("completion time resolution failed", - "pipelineId", id, "error", err) - continue - } - if completionTime.IsZero() { - continue - } - - // Parse SensorTimeout from config (default 2h). - timeout := parseSensorTimeout(cfg.PostRun.SensorTimeout) - - // Check if the timeout has elapsed since completion. - if now.Before(completionTime.Add(timeout)) { - continue - } - - // Check if any post-run rule sensor has been updated since completion. 
- sensors, err := d.Store.GetAllSensors(ctx, id) - if err != nil { - d.Logger.Error("sensor lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - - if hasPostRunSensorUpdate(cfg.PostRun.Rules, sensors, completionTime) { - continue - } - - // No post-run sensor has arrived within the grace period — publish event. - ruleKeys := make([]string, 0, len(cfg.PostRun.Rules)) - for _, r := range cfg.PostRun.Rules { - ruleKeys = append(ruleKeys, r.Key) - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "sensorTimeout": cfg.PostRun.SensorTimeout, - "ruleKeys": strings.Join(ruleKeys, ", "), - "actionHint": "post-run sensor data has not arrived within the expected timeout", - } - if err := publishEvent(ctx, d, string(types.EventPostRunSensorMissing), id, scheduleID, today, - fmt.Sprintf("post-run sensor missing for %s on %s", id, today), alertDetail); err != nil { - d.Logger.Warn("failed to publish post-run sensor missing event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) - } - - // Write dedup marker to avoid re-alerting on subsequent watchdog runs. - if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ - "alerted": "true", - }); err != nil { - d.Logger.Warn("failed to write post-run dedup marker", "error", err, "pipeline", id, "date", today) - } - - d.Logger.Info("detected missing post-run sensor", - "pipelineId", id, - "schedule", scheduleID, - "date", today, - ) - } - return nil -} - -// resolveCompletionTime extracts the completion timestamp from the latest -// success job event for the given pipeline/schedule/date. The job event SK -// has the format JOB### where timestamp is -// milliseconds since epoch. 
-func resolveCompletionTime(ctx context.Context, d *Deps, pipelineID, scheduleID, date string) (time.Time, error) { - rec, err := d.Store.GetLatestJobEvent(ctx, pipelineID, scheduleID, date) - if err != nil { - return time.Time{}, fmt.Errorf("get latest job event: %w", err) - } - if rec == nil { - return time.Time{}, nil - } - if rec.Event != types.JobEventSuccess { - return time.Time{}, nil - } - - // Extract timestamp from SK: JOB### - parts := strings.Split(rec.SK, "#") - if len(parts) < 4 { - return time.Time{}, fmt.Errorf("unexpected job SK format: %q", rec.SK) - } - tsMillis, err := strconv.ParseInt(parts[len(parts)-1], 10, 64) - if err != nil { - return time.Time{}, fmt.Errorf("parse job timestamp %q: %w", parts[len(parts)-1], err) - } - return time.UnixMilli(tsMillis), nil -} - -// parseSensorTimeout parses a duration string from PostRunConfig.SensorTimeout. -// Returns defaultSensorTimeout (2h) if the string is empty or unparseable. -func parseSensorTimeout(s string) time.Duration { - if s == "" { - return defaultSensorTimeout - } - d, err := time.ParseDuration(s) - if err != nil { - return defaultSensorTimeout - } - return d -} - -// hasPostRunSensorUpdate checks whether any sensor matching a PostRun rule key -// has an updatedAt timestamp newer than the given completion time. -func hasPostRunSensorUpdate(rules []types.ValidationRule, sensors map[string]map[string]interface{}, completionTime time.Time) bool { - completionMillis := completionTime.UnixMilli() - - for _, rule := range rules { - data, ok := sensors[rule.Key] - if !ok { - continue - } - - updatedAt, ok := data["updatedAt"] - if !ok { - continue - } - - var ts int64 - switch v := updatedAt.(type) { - case float64: - ts = int64(v) - case int64: - ts = v - case string: - ts, _ = strconv.ParseInt(v, 10, 64) - default: - continue - } - - if ts > completionMillis { - return true - } - } - return false -} - -// detectRelativeSLABreaches checks pipelines with MaxDuration SLA config for -// breaches. 
This is a defense-in-depth fallback: if the EventBridge Scheduler -// fails to fire the relative SLA breach alert, the watchdog catches it. -// -// Both today and yesterday are checked because stream_router writes the -// first-sensor-arrival key using ResolveExecutionDate(), which for T+1 -// sensor-triggered pipelines produces yesterday's date. Checking both dates -// covers the cross-day boundary. -func detectRelativeSLABreaches(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - datesToCheck := []string{ - now.Format("2006-01-02"), - now.AddDate(0, 0, -1).Format("2006-01-02"), - } - - for id, cfg := range configs { - if cfg.SLA == nil || cfg.SLA.MaxDuration == "" { - continue - } - - // Dry-run pipelines are observation-only — skip relative SLA checks. - if cfg.DryRun { - continue - } - - maxDur, err := time.ParseDuration(cfg.SLA.MaxDuration) - if err != nil { - d.Logger.Warn("invalid maxDuration in SLA config", - "pipelineId", id, "maxDuration", cfg.SLA.MaxDuration, "error", err) - continue - } - - scheduleID := resolveScheduleID(cfg) - - for _, checkDate := range datesToCheck { - checkRelativeSLAForDate(ctx, d, id, cfg, scheduleID, checkDate, maxDur, now) - } - } - return nil -} - -// checkRelativeSLAForDate checks a single date for a relative SLA breach on -// the given pipeline. It looks up the first-sensor-arrival marker, verifies -// the breach window has elapsed, and publishes an alert if needed. 
-func checkRelativeSLAForDate(ctx context.Context, d *Deps, id string, cfg *types.PipelineConfig, scheduleID, checkDate string, maxDur time.Duration, now time.Time) { - arrivalKey := "first-sensor-arrival#" + checkDate - arrivalData, err := d.Store.GetSensorData(ctx, id, arrivalKey) - if err != nil { - d.Logger.Error("first-sensor-arrival lookup failed", - "pipelineId", id, "date", checkDate, "error", err) - return - } - if arrivalData == nil { - return - } - - arrivedAtStr, ok := arrivalData["arrivedAt"].(string) - if !ok || arrivedAtStr == "" { - return - } - arrivedAt, err := time.Parse(time.RFC3339, arrivedAtStr) - if err != nil { - d.Logger.Warn("invalid arrivedAt in first-sensor-arrival", - "pipelineId", id, "arrivedAt", arrivedAtStr, "error", err) - return - } - - // Check if the relative SLA has been breached. - breachAt := arrivedAt.Add(maxDur) - if now.Before(breachAt) { - return - } - - // Skip if pipeline already completed or permanently failed. - tr, err := d.Store.GetTrigger(ctx, id, scheduleID, checkDate) - if err != nil { - d.Logger.Warn("trigger lookup failed in relative SLA check", - "pipelineId", id, "date", checkDate, "error", err) - return - } - if tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal) { - return - } - if isJobTerminal(ctx, d, id, scheduleID, checkDate) { - return - } - - // Check dedup marker to avoid re-alerting on subsequent watchdog runs. - // The dedup key includes checkDate to avoid cross-date collisions. 
- dedupKey := "relative-sla-breach-check#" + checkDate - dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) - if err != nil { - d.Logger.Error("dedup marker lookup failed for relative SLA breach", - "pipelineId", id, "date", checkDate, "error", err) - return - } - if dedupData != nil { - return - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "maxDuration": cfg.SLA.MaxDuration, - "sensorArrivalAt": arrivedAtStr, - "breachAt": breachAt.UTC().Format(time.RFC3339), - "actionHint": "relative SLA breached — pipeline has exceeded maxDuration since first sensor arrival", - } - if err := publishEvent(ctx, d, string(types.EventRelativeSLABreach), id, scheduleID, checkDate, - fmt.Sprintf("relative SLA breach for %s on %s", id, checkDate), alertDetail); err != nil { - d.Logger.Warn("failed to publish relative SLA breach event", - "error", err, "pipeline", id, "date", checkDate) - } - - // Write dedup marker. - if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ - "alerted": "true", - }); err != nil { - d.Logger.Warn("failed to write relative SLA breach dedup marker", - "error", err, "pipeline", id, "date", checkDate) - } - - d.Logger.Info("detected relative SLA breach", - "pipelineId", id, - "schedule", scheduleID, - "date", checkDate, - "sensorArrivalAt", arrivedAtStr, - "breachAt", breachAt.UTC().Format(time.RFC3339), - ) -} diff --git a/internal/lambda/watchdog_missed.go b/internal/lambda/watchdog_missed.go new file mode 100644 index 0000000..cd94130 --- /dev/null +++ b/internal/lambda/watchdog_missed.go @@ -0,0 +1,237 @@ +package lambda + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// lastCronFire returns the most recent expected fire time for a cron expression. +// Supports the minute-hour patterns used by this system: "MM * * * *" (hourly) +// and "MM HH * * *" (daily). Returns zero time for unsupported patterns. 
+func lastCronFire(cron string, now time.Time, loc *time.Location) time.Time { + fields := strings.Fields(cron) + if len(fields) < 5 { + return time.Time{} + } + minute, err := strconv.Atoi(fields[0]) + if err != nil { + return time.Time{} + } + localNow := now.In(loc) + + if fields[1] == "*" { + // Hourly: fires at :MM every hour. + candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), + localNow.Hour(), minute, 0, 0, loc) + if candidate.After(localNow) { + candidate = candidate.Add(-time.Hour) + } + return candidate + } + + hour, err := strconv.Atoi(fields[1]) + if err != nil { + return time.Time{} + } + // Daily: fires at HH:MM every day. + candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), + hour, minute, 0, 0, loc) + if candidate.After(localNow) { + candidate = candidate.Add(-24 * time.Hour) + } + return candidate +} + +// detectMissedSchedules checks all cron-scheduled pipelines to see if today's +// trigger is missing. If a pipeline should have started by now but has no +// TRIGGER# row, a SCHEDULE_MISSED event is published. +func detectMissedSchedules(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + today := now.Format("2006-01-02") + + for id, cfg := range configs { + // Only check cron-scheduled pipelines. + if cfg.Schedule.Cron == "" { + continue + } + + // Dry-run pipelines are observation-only — skip missed schedule detection. + if cfg.DryRun { + continue + } + + // Skip calendar-excluded days. + if isExcluded(cfg, now) { + continue + } + + // Only alert for schedules that should have fired after this Lambda + // started. Prevents retroactive alerts after fresh deploys. 
+ if !d.StartedAt.IsZero() { + loc := resolveTimezone(cfg.Schedule.Timezone) + if lastFire := lastCronFire(cfg.Schedule.Cron, now, loc); !lastFire.IsZero() && lastFire.Before(d.StartedAt) { + continue + } + } + + // Resolve schedule ID for cron pipelines. + scheduleID := resolveScheduleID(cfg) + + // Check if any TRIGGER# row exists for today (covers both daily + // and per-hour trigger rows, e.g. "2026-03-04" and "2026-03-04T00"). + found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, today) + if err != nil { + d.Logger.Error("failed to check trigger for missed schedule", + "pipelineId", id, "error", err) + continue + } + if found { + continue + } + + // Check if we are past the expected start time. If the pipeline + // has a schedule time configured, only alert after that time. + if cfg.Schedule.Time != "" { + loc := resolveTimezone(cfg.Schedule.Timezone) + localNow := now.In(loc) + expectedStart, err := time.ParseInLocation("2006-01-02 15:04", today+" "+cfg.Schedule.Time, loc) + if err == nil && localNow.Before(expectedStart) { + continue // not yet past expected start time + } + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "cron": cfg.Schedule.Cron, + "actionHint": fmt.Sprintf("cron %s expected to fire — no trigger found", cfg.Schedule.Cron), + } + if cfg.Schedule.Time != "" { + alertDetail["expectedTime"] = cfg.Schedule.Time + } + if err := publishEvent(ctx, d, string(types.EventScheduleMissed), id, scheduleID, today, + fmt.Sprintf("missed schedule for %s on %s", id, today), alertDetail); err != nil { + d.Logger.Warn("failed to publish missed schedule event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) + } + + d.Logger.Info("detected missed schedule", + "pipelineId", id, + "schedule", scheduleID, + "date", today, + ) + } + return nil +} + +// detectMissedInclusionSchedules checks pipelines with inclusion calendar config +// for missed schedules on irregular dates. 
For each pipeline with an Include +// config, it finds all past inclusion dates (capped at maxInclusionLookback) +// and verifies that a trigger exists for each. If no trigger is found and no +// dedup marker exists, an IRREGULAR_SCHEDULE_MISSED event is published. +func detectMissedInclusionSchedules(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + + for id, cfg := range configs { + if cfg.Schedule.Include == nil || len(cfg.Schedule.Include.Dates) == 0 { + continue + } + + // Dry-run pipelines are observation-only — skip inclusion schedule detection. + if cfg.DryRun { + continue + } + + // Skip calendar-excluded days. + if isExcluded(cfg, now) { + continue + } + + pastDates := PastInclusionDates(cfg.Schedule.Include.Dates, now) + if len(pastDates) == 0 { + continue + } + + scheduleID := resolveScheduleID(cfg) + + // Resolve today in the pipeline's timezone so the grace-period + // guard fires correctly when UTC date != pipeline-local date. + tzLoc := resolveTimezone(cfg.Schedule.Timezone) + today := now.In(tzLoc).Format("2006-01-02") + + for _, date := range pastDates { + // If the inclusion date is today and the pipeline has a + // Schedule.Time, only alert after that time has passed. + // This mirrors the same check in detectMissedSchedules for + // cron pipelines to avoid false-positive alerts before the + // expected start time. Past dates are not gated because + // their Schedule.Time has necessarily already elapsed. + if cfg.Schedule.Time != "" && date == today { + localNow := now.In(tzLoc) + expectedStart, err := time.ParseInLocation("2006-01-02 15:04", date+" "+cfg.Schedule.Time, tzLoc) + if err == nil && localNow.Before(expectedStart) { + continue // not yet past expected start time + } + } + + // Check if a trigger exists for this inclusion date. 
+ found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) + if err != nil { + d.Logger.Error("failed to check trigger for inclusion schedule", + "pipelineId", id, "date", date, "error", err) + continue + } + if found { + continue + } + + // Check dedup marker to avoid re-alerting on subsequent watchdog runs. + dedupKey := "irregular-missed-check#" + date + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed for inclusion schedule", + "pipelineId", id, "date", date, "error", err) + continue + } + if dedupData != nil { + continue + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "actionHint": fmt.Sprintf("inclusion date %s expected to have a trigger — none found", date), + } + if err := publishEvent(ctx, d, string(types.EventIrregularScheduleMissed), id, scheduleID, date, + fmt.Sprintf("missed inclusion schedule for %s on %s", id, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish irregular schedule missed event", "error", err, "pipeline", id, "date", date) + } + + // Write dedup marker. + if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ + "alerted": "true", + }); err != nil { + d.Logger.Warn("failed to write inclusion dedup marker", "error", err, "pipeline", id, "date", date) + } + + d.Logger.Info("detected missed inclusion schedule", + "pipelineId", id, + "schedule", scheduleID, + "date", date, + ) + } + } + return nil +} diff --git a/internal/lambda/watchdog_postrun.go b/internal/lambda/watchdog_postrun.go new file mode 100644 index 0000000..677ac10 --- /dev/null +++ b/internal/lambda/watchdog_postrun.go @@ -0,0 +1,353 @@ +package lambda + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// defaultSensorTimeout is the default grace period for post-run sensors to +// arrive after a pipeline completes. 
If no SensorTimeout is configured in +// PostRunConfig, this value is used. +const defaultSensorTimeout = 2 * time.Hour + +// detectMissingPostRunSensors checks pipelines with PostRun config for missing +// post-run sensor data. If a pipeline completed (COMPLETED trigger + baseline +// exists) but no post-run sensor matching a rule key has been updated since +// completion, and the SensorTimeout grace period has elapsed, a +// POST_RUN_SENSOR_MISSING event is published. +func detectMissingPostRunSensors(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + today := now.Format("2006-01-02") + + for id, cfg := range configs { + if cfg.PostRun == nil || len(cfg.PostRun.Rules) == 0 { + continue + } + + // Dry-run pipelines are observation-only — skip post-run sensor checks. + if cfg.DryRun { + continue + } + + scheduleID := resolveScheduleID(cfg) + + // Only check pipelines with a COMPLETED trigger for today. + tr, err := d.Store.GetTrigger(ctx, id, scheduleID, today) + if err != nil { + d.Logger.Error("trigger lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if tr == nil || tr.Status != types.TriggerStatusCompleted { + continue + } + + // Baseline must exist — it signals that capturePostRunBaseline ran + // at completion time. + baselineKey := "postrun-baseline#" + today + baseline, err := d.Store.GetSensorData(ctx, id, baselineKey) + if err != nil { + d.Logger.Error("baseline lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if baseline == nil { + continue + } + + // Dedup: skip if we already published an alert for this date. 
+ dedupKey := "postrun-check#" + today + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if dedupData != nil { + continue + } + + // Determine the completion timestamp from the latest success job event. + completionTime, err := resolveCompletionTime(ctx, d, id, scheduleID, today) + if err != nil { + d.Logger.Error("completion time resolution failed", + "pipelineId", id, "error", err) + continue + } + if completionTime.IsZero() { + continue + } + + // Parse SensorTimeout from config (default 2h). + timeout := parseSensorTimeout(cfg.PostRun.SensorTimeout) + + // Check if the timeout has elapsed since completion. + if now.Before(completionTime.Add(timeout)) { + continue + } + + // Check if any post-run rule sensor has been updated since completion. + sensors, err := d.Store.GetAllSensors(ctx, id) + if err != nil { + d.Logger.Error("sensor lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + + if hasPostRunSensorUpdate(cfg.PostRun.Rules, sensors, completionTime) { + continue + } + + // No post-run sensor has arrived within the grace period — publish event. 
+ ruleKeys := make([]string, 0, len(cfg.PostRun.Rules)) + for _, r := range cfg.PostRun.Rules { + ruleKeys = append(ruleKeys, r.Key) + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "sensorTimeout": cfg.PostRun.SensorTimeout, + "ruleKeys": strings.Join(ruleKeys, ", "), + "actionHint": "post-run sensor data has not arrived within the expected timeout", + } + if err := publishEvent(ctx, d, string(types.EventPostRunSensorMissing), id, scheduleID, today, + fmt.Sprintf("post-run sensor missing for %s on %s", id, today), alertDetail); err != nil { + d.Logger.Warn("failed to publish post-run sensor missing event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) + } + + // Write dedup marker to avoid re-alerting on subsequent watchdog runs. + if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ + "alerted": "true", + }); err != nil { + d.Logger.Warn("failed to write post-run dedup marker", "error", err, "pipeline", id, "date", today) + } + + d.Logger.Info("detected missing post-run sensor", + "pipelineId", id, + "schedule", scheduleID, + "date", today, + ) + } + return nil +} + +// resolveCompletionTime extracts the completion timestamp from the latest +// success job event for the given pipeline/schedule/date. The job event SK +// has the format JOB### where timestamp is +// milliseconds since epoch. 
+func resolveCompletionTime(ctx context.Context, d *Deps, pipelineID, scheduleID, date string) (time.Time, error) { + rec, err := d.Store.GetLatestJobEvent(ctx, pipelineID, scheduleID, date) + if err != nil { + return time.Time{}, fmt.Errorf("get latest job event: %w", err) + } + if rec == nil { + return time.Time{}, nil + } + if rec.Event != types.JobEventSuccess { + return time.Time{}, nil + } + + // Extract timestamp from SK: JOB### + parts := strings.Split(rec.SK, "#") + if len(parts) < 4 { + return time.Time{}, fmt.Errorf("unexpected job SK format: %q", rec.SK) + } + tsMillis, err := strconv.ParseInt(parts[len(parts)-1], 10, 64) + if err != nil { + return time.Time{}, fmt.Errorf("parse job timestamp %q: %w", parts[len(parts)-1], err) + } + return time.UnixMilli(tsMillis), nil +} + +// parseSensorTimeout parses a duration string from PostRunConfig.SensorTimeout. +// Returns defaultSensorTimeout (2h) if the string is empty or unparseable. +func parseSensorTimeout(s string) time.Duration { + if s == "" { + return defaultSensorTimeout + } + d, err := time.ParseDuration(s) + if err != nil { + return defaultSensorTimeout + } + return d +} + +// hasPostRunSensorUpdate checks whether any sensor matching a PostRun rule key +// has an updatedAt timestamp newer than the given completion time. +func hasPostRunSensorUpdate(rules []types.ValidationRule, sensors map[string]map[string]interface{}, completionTime time.Time) bool { + completionMillis := completionTime.UnixMilli() + + for _, rule := range rules { + data, ok := sensors[rule.Key] + if !ok { + continue + } + + updatedAt, ok := data["updatedAt"] + if !ok { + continue + } + + var ts int64 + switch v := updatedAt.(type) { + case float64: + ts = int64(v) + case int64: + ts = v + case string: + ts, _ = strconv.ParseInt(v, 10, 64) + default: + continue + } + + if ts > completionMillis { + return true + } + } + return false +} + +// detectRelativeSLABreaches checks pipelines with MaxDuration SLA config for +// breaches. 
This is a defense-in-depth fallback: if the EventBridge Scheduler +// fails to fire the relative SLA breach alert, the watchdog catches it. +// +// Both today and yesterday are checked because stream_router writes the +// first-sensor-arrival key using ResolveExecutionDate(), which for T+1 +// sensor-triggered pipelines produces yesterday's date. Checking both dates +// covers the cross-day boundary. +func detectRelativeSLABreaches(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + datesToCheck := []string{ + now.Format("2006-01-02"), + now.AddDate(0, 0, -1).Format("2006-01-02"), + } + + for id, cfg := range configs { + if cfg.SLA == nil || cfg.SLA.MaxDuration == "" { + continue + } + + // Dry-run pipelines are observation-only — skip relative SLA checks. + if cfg.DryRun { + continue + } + + maxDur, err := time.ParseDuration(cfg.SLA.MaxDuration) + if err != nil { + d.Logger.Warn("invalid maxDuration in SLA config", + "pipelineId", id, "maxDuration", cfg.SLA.MaxDuration, "error", err) + continue + } + + scheduleID := resolveScheduleID(cfg) + + for _, checkDate := range datesToCheck { + checkRelativeSLAForDate(ctx, d, id, cfg, scheduleID, checkDate, maxDur, now) + } + } + return nil +} + +// checkRelativeSLAForDate checks a single date for a relative SLA breach on +// the given pipeline. It looks up the first-sensor-arrival marker, verifies +// the breach window has elapsed, and publishes an alert if needed. 
+func checkRelativeSLAForDate(ctx context.Context, d *Deps, id string, cfg *types.PipelineConfig, scheduleID, checkDate string, maxDur time.Duration, now time.Time) { + arrivalKey := "first-sensor-arrival#" + checkDate + arrivalData, err := d.Store.GetSensorData(ctx, id, arrivalKey) + if err != nil { + d.Logger.Error("first-sensor-arrival lookup failed", + "pipelineId", id, "date", checkDate, "error", err) + return + } + if arrivalData == nil { + return + } + + arrivedAtStr, ok := arrivalData["arrivedAt"].(string) + if !ok || arrivedAtStr == "" { + return + } + arrivedAt, err := time.Parse(time.RFC3339, arrivedAtStr) + if err != nil { + d.Logger.Warn("invalid arrivedAt in first-sensor-arrival", + "pipelineId", id, "arrivedAt", arrivedAtStr, "error", err) + return + } + + // Check if the relative SLA has been breached. + breachAt := arrivedAt.Add(maxDur) + if now.Before(breachAt) { + return + } + + // Skip if pipeline already completed or permanently failed. + tr, err := d.Store.GetTrigger(ctx, id, scheduleID, checkDate) + if err != nil { + d.Logger.Warn("trigger lookup failed in relative SLA check", + "pipelineId", id, "date", checkDate, "error", err) + return + } + if tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal) { + return + } + if isJobTerminal(ctx, d, id, scheduleID, checkDate) { + return + } + + // Check dedup marker to avoid re-alerting on subsequent watchdog runs. + // The dedup key includes checkDate to avoid cross-date collisions. 
+ dedupKey := "relative-sla-breach-check#" + checkDate + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed for relative SLA breach", + "pipelineId", id, "date", checkDate, "error", err) + return + } + if dedupData != nil { + return + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "maxDuration": cfg.SLA.MaxDuration, + "sensorArrivalAt": arrivedAtStr, + "breachAt": breachAt.UTC().Format(time.RFC3339), + "actionHint": "relative SLA breached — pipeline has exceeded maxDuration since first sensor arrival", + } + if err := publishEvent(ctx, d, string(types.EventRelativeSLABreach), id, scheduleID, checkDate, + fmt.Sprintf("relative SLA breach for %s on %s", id, checkDate), alertDetail); err != nil { + d.Logger.Warn("failed to publish relative SLA breach event", + "error", err, "pipeline", id, "date", checkDate) + } + + // Write dedup marker. + if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ + "alerted": "true", + }); err != nil { + d.Logger.Warn("failed to write relative SLA breach dedup marker", + "error", err, "pipeline", id, "date", checkDate) + } + + d.Logger.Info("detected relative SLA breach", + "pipelineId", id, + "schedule", scheduleID, + "date", checkDate, + "sensorArrivalAt", arrivedAtStr, + "breachAt", breachAt.UTC().Format(time.RFC3339), + ) +} diff --git a/internal/lambda/watchdog_sla.go b/internal/lambda/watchdog_sla.go new file mode 100644 index 0000000..7eab64e --- /dev/null +++ b/internal/lambda/watchdog_sla.go @@ -0,0 +1,281 @@ +package lambda + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// scheduleSLAAlerts proactively creates EventBridge Scheduler entries for all +// pipelines with SLA configs. This ensures warnings/breaches fire even when +// pipelines never trigger (data never arrives, sensor fails, etc.). 
+// Idempotency: deterministic scheduler names; ConflictException = already exists. +func scheduleSLAAlerts(ctx context.Context, d *Deps) error { + if d.Scheduler == nil { + return nil + } + + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + + for id, cfg := range configs { + if cfg.SLA == nil { + continue + } + + // Dry-run pipelines are observation-only — skip SLA scheduling. + if cfg.DryRun { + continue + } + + if isExcluded(cfg, now) { + continue + } + + scheduleID := resolveScheduleID(cfg) + date := resolveWatchdogSLADate(cfg, now) + + // Sensor-triggered daily pipelines run T+1: data for today completes + // tomorrow, so the SLA deadline is relative to tomorrow's date. + // Only slaDate is shifted; the original date is kept for schedule + // naming, trigger lookup, and fire-alert payload so cancellation + // stays consistent with the SFN's view of the pipeline. + slaDate := date + if cfg.Schedule.Cron == "" && !strings.HasPrefix(cfg.SLA.Deadline, ":") { + t, err := time.Parse("2006-01-02", date) + if err == nil { + slaDate = t.AddDate(0, 0, 1).Format("2006-01-02") + } + } + + // Skip if pipeline already completed or permanently failed for this date. 
+ tr, err := d.Store.GetTrigger(ctx, id, scheduleID, date) + switch { + case err != nil: + d.Logger.Warn("trigger lookup failed in SLA scheduling", "pipelineId", id, "error", err) + continue + case tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal): + continue + case isJobTerminal(ctx, d, id, scheduleID, date): + continue + } + + calc, err := handleSLACalculate(SLAMonitorInput{ + Mode: "calculate", + PipelineID: id, + ScheduleID: scheduleID, + Date: slaDate, + Deadline: cfg.SLA.Deadline, + ExpectedDuration: cfg.SLA.ExpectedDuration, + Timezone: cfg.SLA.Timezone, + }, now) + if err != nil { + d.Logger.Error("SLA calculate failed", "pipelineId", id, "error", err) + continue + } + + breachAt, _ := time.Parse(time.RFC3339, calc.BreachAt) + if breachAt.IsZero() || breachAt.After(now) { + // SLA breach is in the future — create schedules. + scheduleErr := false + if err := createSLASchedules(ctx, d, id, scheduleID, date, calc, true); err != nil { + d.Logger.Error("create SLA schedule failed", + "pipelineId", id, "error", err) + scheduleErr = true + } + + if !scheduleErr { + d.Logger.Info("proactive SLA schedules ensured", + "pipelineId", id, + "date", date, + "warningAt", calc.WarningAt, + "breachAt", calc.BreachAt, + ) + } + } + } + return nil +} + +// checkTriggerDeadlines evaluates trigger deadlines independently of SLA +// configuration. Pipelines with a Trigger.Deadline but no SLA config are +// checked here. For each pipeline, if the trigger deadline has passed and +// no trigger exists, the sensor trigger window is closed. 
+func checkTriggerDeadlines(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + + for id, cfg := range configs { + if cfg.Schedule.Trigger == nil || cfg.Schedule.Trigger.Deadline == "" { + continue + } + + // Dry-run pipelines are observation-only — skip trigger deadline checks. + if cfg.DryRun { + continue + } + + if isExcluded(cfg, now) { + continue + } + + scheduleID := resolveScheduleID(cfg) + triggerDate := resolveTriggerDeadlineDate(cfg, now) + + triggerRec, err := d.Store.GetTrigger(ctx, id, scheduleID, triggerDate) + if err != nil { + d.Logger.Warn("trigger lookup failed in deadline check", "pipelineId", id, "error", err) + continue + } + if triggerRec != nil { + continue + } + + if isJobTerminal(ctx, d, id, scheduleID, triggerDate) { + continue + } + + closeSensorTriggerWindow(ctx, d, id, scheduleID, triggerDate, cfg, now) + } + return nil +} + +// resolveWatchdogSLADate determines the execution date for SLA scheduling. +// - Hourly pipelines (relative deadline like ":30"): previous hour composite +// date, e.g. "2026-03-05T13" when the clock is 14:xx. +// - Daily pipelines (absolute deadline like "02:00"): today's date, +// so handleSLACalculate rolls the deadline forward to the next occurrence. +func resolveWatchdogSLADate(cfg *types.PipelineConfig, now time.Time) string { + if strings.HasPrefix(cfg.SLA.Deadline, ":") { + prev := now.Add(-time.Hour) + return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) + } + return now.Format("2006-01-02") +} + +// resolveTriggerDeadlineDate determines the execution date for trigger +// deadline evaluation. Uses the trigger deadline format (not SLA deadline) +// to decide between hourly composite date and daily date. 
+func resolveTriggerDeadlineDate(cfg *types.PipelineConfig, now time.Time) string { + if strings.HasPrefix(cfg.Schedule.Trigger.Deadline, ":") { + prev := now.Add(-time.Hour) + return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) + } + return now.Format("2006-01-02") +} + +// resolveTriggerDeadlineTime computes the absolute time when the trigger +// window closes for the given deadline string and execution date. +// +// For relative (hourly) deadlines like ":45" with composite date "2026-03-09T13": +// - Data for hour 13 is processed in hour 14 +// - The deadline resolves to 2026-03-09T14:45:00 in the configured timezone +// +// For absolute (daily) deadlines like "09:00" with date "2026-03-09": +// - The deadline resolves to 2026-03-09T09:00:00 in the configured timezone +// +// Unlike handleSLACalculate, this does NOT roll forward when the time is past. +// Returns zero time on parse errors. +func resolveTriggerDeadlineTime(deadline, date, timezone string) time.Time { + loc := resolveTimezone(timezone) + + if strings.HasPrefix(deadline, ":") { + // Relative (hourly): ":MM" — deadline is in the NEXT hour after the + // composite date's hour, since data for hour H is processed in hour H+1. + minute, err := strconv.Atoi(strings.TrimPrefix(deadline, ":")) + if err != nil { + return time.Time{} + } + // Parse composite date "YYYY-MM-DDThh". + if len(date) < 13 || date[10] != 'T' { + return time.Time{} + } + t, err := time.ParseInLocation("2006-01-02T15", date, loc) + if err != nil { + return time.Time{} + } + // Add 1 hour for the processing window, then set the minute. + return time.Date(t.Year(), t.Month(), t.Day(), t.Hour()+1, minute, 0, 0, loc) + } + + // Absolute (daily): "HH:MM". 
+ parts := strings.SplitN(deadline, ":", 2) + if len(parts) != 2 { + return time.Time{} + } + hour, err := strconv.Atoi(parts[0]) + if err != nil { + return time.Time{} + } + minute, err := strconv.Atoi(parts[1]) + if err != nil { + return time.Time{} + } + t, err := time.ParseInLocation("2006-01-02", date, loc) + if err != nil { + return time.Time{} + } + return time.Date(t.Year(), t.Month(), t.Day(), hour, minute, 0, 0, loc) +} + +// closeSensorTriggerWindow checks whether the trigger deadline has passed for +// a sensor-triggered pipeline that never started. If expired, it writes a +// FAILED_FINAL trigger record (blocking future auto-triggers) and publishes +// a SENSOR_DEADLINE_EXPIRED event. A human can still restart via RERUN_REQUEST. +func closeSensorTriggerWindow(ctx context.Context, d *Deps, pipelineID, scheduleID, date string, cfg *types.PipelineConfig, now time.Time) { + // Compute the absolute trigger deadline time directly — we do NOT use + // handleSLACalculate here because it rolls daily deadlines forward 24h + // when past, which defeats the purpose of checking for expiry. + tz := cfg.Schedule.Timezone + if tz == "" && cfg.SLA != nil { + tz = cfg.SLA.Timezone + } + triggerDeadline := resolveTriggerDeadlineTime(cfg.Schedule.Trigger.Deadline, date, tz) + if triggerDeadline.IsZero() || triggerDeadline.After(now) { + return + } + + // Use conditional put to avoid overwriting a trigger that was acquired + // between the GetTrigger read and this write (TOCTOU protection). + created, err := d.Store.CreateTriggerIfAbsent(ctx, pipelineID, scheduleID, date, types.TriggerStatusFailedFinal) + if err != nil { + d.Logger.Error("failed to write FAILED_FINAL for expired trigger deadline", + "pipelineId", pipelineID, "schedule", scheduleID, "date", date, "error", err) + return + } + if !created { + // Trigger row appeared since the read — pipeline started, don't interfere. 
+ d.Logger.Info("trigger appeared during deadline check, skipping window close", + "pipelineId", pipelineID, "schedule", scheduleID, "date", date) + return + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "triggerDeadline": cfg.Schedule.Trigger.Deadline, + "actionHint": "auto-trigger window closed — use RERUN_REQUEST to restart", + } + if err := publishEvent(ctx, d, string(types.EventSensorDeadlineExpired), pipelineID, scheduleID, date, + fmt.Sprintf("trigger deadline expired for %s/%s/%s", pipelineID, scheduleID, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish sensor deadline expired event", "error", err, "pipeline", pipelineID) + } + + d.Logger.Info("sensor trigger window closed", + "pipelineId", pipelineID, + "schedule", scheduleID, + "date", date, + "triggerDeadline", cfg.Schedule.Trigger.Deadline, + ) +} diff --git a/internal/lambda/watchdog_stale.go b/internal/lambda/watchdog_stale.go new file mode 100644 index 0000000..cebfb57 --- /dev/null +++ b/internal/lambda/watchdog_stale.go @@ -0,0 +1,209 @@ +package lambda + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/dwsmith1983/interlock/internal/validation" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// detectStaleTriggers scans for TRIGGER# rows with status=RUNNING and +// publishes an SFN_TIMEOUT event for any that have exceeded their TTL or the +// staleTriggerThreshold. Stale triggers are moved to FAILED_FINAL status. 
+func detectStaleTriggers(ctx context.Context, d *Deps) error {
+	triggers, err := d.Store.ScanRunningTriggers(ctx)
+	if err != nil {
+		return fmt.Errorf("scan running triggers: %w", err)
+	}
+
+	now := d.now()
+	for _, tr := range triggers {
+		if !isStaleTrigger(tr, now) {
+			continue
+		}
+
+		pipelineID, schedule, date, err := parseTriggerRecord(tr)
+		if err != nil {
+			d.Logger.Warn("skipping unparseable trigger", "pk", tr.PK, "sk", tr.SK, "error", err)
+			continue
+		}
+
+		// Dry-run pipelines should never have TRIGGER# rows, but guard
+		// against stale rows from pre-dry-run migrations or bugs.
+		if cfg, cfgErr := d.ConfigCache.Get(ctx, pipelineID); cfgErr == nil && cfg != nil && cfg.DryRun {
+			continue
+		}
+
+		alertDetail := map[string]interface{}{
+			"source":     "watchdog",
+			"actionHint": "step function exceeded TTL — check SFN execution history",
+		}
+		if tr.TTL > 0 {
+			alertDetail["ttlExpired"] = time.Unix(tr.TTL, 0).UTC().Format(time.RFC3339)
+		}
+		if err := publishEvent(ctx, d, string(types.EventSFNTimeout), pipelineID, schedule, date,
+			fmt.Sprintf("step function timed out for %s/%s/%s", pipelineID, schedule, date), alertDetail); err != nil {
+			d.Logger.Warn("failed to publish SFN timeout event", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date)
+		}
+
+		if err := d.Store.SetTriggerStatus(ctx, pipelineID, schedule, date, types.TriggerStatusFailedFinal); err != nil {
+			d.Logger.Error("failed to set trigger status to FAILED_FINAL",
+				"pipelineId", pipelineID, "schedule", schedule, "date", date, "error", err)
+			continue
+		}
+
+		d.Logger.Info("detected stale trigger",
+			"pipelineId", pipelineID,
+			"schedule", schedule,
+			"date", date,
+		)
+	}
+	return nil
+}
+
+// isStaleTrigger reports whether the trigger's TTL has expired. Triggers with
+// no TTL set are never considered stale: without a creation timestamp there is
+// nothing to measure runtime against.
+func isStaleTrigger(tr types.ControlRecord, now time.Time) bool {
+	if tr.TTL > 0 {
+		return now.Unix() > tr.TTL
+	}
+	// No TTL set — without a creation timestamp we cannot tell how long the
+	// trigger has been running, so we conservatively never treat it as stale.
+	// Only triggers whose TTL has explicitly expired are flagged.
+	return false
+}
+
+// parseTriggerRecord extracts pipeline ID, schedule, and date from a trigger
+// ControlRecord's PK and SK.
+// PK format: PIPELINE#<pipelineID>
+// SK format: TRIGGER#<schedule>#<date>
+func parseTriggerRecord(tr types.ControlRecord) (pipelineID, schedule, date string, err error) {
+	const pkPrefix = "PIPELINE#"
+	if !strings.HasPrefix(tr.PK, pkPrefix) {
+		return "", "", "", fmt.Errorf("unexpected PK format: %q", tr.PK)
+	}
+	pipelineID = tr.PK[len(pkPrefix):]
+
+	const skPrefix = "TRIGGER#"
+	trimmed := strings.TrimPrefix(tr.SK, skPrefix)
+	if trimmed == tr.SK {
+		return "", "", "", fmt.Errorf("unexpected SK format: %q", tr.SK)
+	}
+	parts := strings.SplitN(trimmed, "#", 2)
+	if len(parts) != 2 {
+		return "", "", "", fmt.Errorf("invalid TRIGGER SK format: %q", tr.SK)
+	}
+	return pipelineID, parts[0], parts[1], nil
+}
+
+// reconcileSensorTriggers re-evaluates trigger conditions for sensor-triggered
+// pipelines. If a sensor meets the trigger condition but no trigger lock exists,
+// the watchdog acquires the lock, starts the SFN, and publishes TRIGGER_RECOVERED.
+// This self-heals missed triggers caused by silent completion-write failures.
+func reconcileSensorTriggers(ctx context.Context, d *Deps) error {
+	configs, err := d.ConfigCache.GetAll(ctx)
+	if err != nil {
+		return fmt.Errorf("load configs: %w", err)
+	}
+
+	now := d.now()
+
+	for id, cfg := range configs {
+		trigger := cfg.Schedule.Trigger
+		if trigger == nil || cfg.Schedule.Cron != "" {
+			continue
+		}
+
+		// Dry-run pipelines are observation-only — skip reconciliation.
+ if cfg.DryRun { + continue + } + + if isExcluded(cfg, now) { + continue + } + + sensors, err := d.Store.GetAllSensors(ctx, id) + if err != nil { + d.Logger.Error("failed to get sensors for reconciliation", + "pipelineId", id, "error", err) + continue + } + + scheduleID := resolveScheduleID(cfg) + + for sensorKey, sensorData := range sensors { + if !strings.HasPrefix(sensorKey, trigger.Key) { + continue + } + + rule := types.ValidationRule{ + Key: trigger.Key, + Check: trigger.Check, + Field: trigger.Field, + Value: trigger.Value, + } + result := validation.EvaluateRule(rule, sensorData, now) + if !result.Passed { + continue + } + + date := ResolveExecutionDate(sensorData, now) + + found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) + if err != nil { + d.Logger.Error("trigger check failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + if found { + continue + } + + // Guard against re-triggering completed pipelines whose trigger + // record was deleted by DynamoDB TTL. Check the joblog for a + // terminal event before acquiring a new lock. 
+ if isJobTerminal(ctx, d, id, scheduleID, date) { + continue + } + + acquired, err := d.Store.AcquireTriggerLock(ctx, id, scheduleID, date, ResolveTriggerLockTTL()) + if err != nil { + d.Logger.Error("lock acquisition failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + if !acquired { + continue + } + + if err := startSFN(ctx, d, cfg, id, scheduleID, date); err != nil { + if relErr := d.Store.ReleaseTriggerLock(ctx, id, scheduleID, date); relErr != nil { + d.Logger.Warn("failed to release lock after SFN start failure during reconciliation", "error", relErr) + } + d.Logger.Error("SFN start failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + + alertDetail := map[string]interface{}{ + "source": "reconciliation", + "actionHint": "watchdog recovered missed sensor trigger", + } + if err := publishEvent(ctx, d, string(types.EventTriggerRecovered), id, scheduleID, date, + fmt.Sprintf("trigger recovered for %s/%s/%s", id, scheduleID, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish trigger recovered event", "error", err, "pipeline", id, "schedule", scheduleID, "date", date) + } + + d.Logger.Info("recovered missed trigger", + "pipelineId", id, + "schedule", scheduleID, + "date", date, + ) + } + } + return nil +} diff --git a/internal/lambda/watchdog_test.go b/internal/lambda/watchdog_test.go index baad19c..bc77b82 100644 --- a/internal/lambda/watchdog_test.go +++ b/internal/lambda/watchdog_test.go @@ -1875,9 +1875,9 @@ func TestWatchdog_PostRunSensorMissing(t *testing.T) { // Seed COMPLETED trigger for today. seedTriggerWithStatus(mock, "gold-revenue", today, types.TriggerStatusCompleted) - // Seed baseline (written at completion time). + // Seed baseline (written at completion time, namespaced by rule key). 
seedSensor(mock, "gold-revenue", "postrun-baseline#"+today, map[string]interface{}{ - "sensor_count": float64(100), + "quality-check": map[string]interface{}{"sensor_count": float64(100)}, }) // Seed a success job event with timestamp 3h before now (well past the 1h timeout). @@ -1925,9 +1925,9 @@ func TestWatchdog_PostRunSensorPresent(t *testing.T) { // Seed COMPLETED trigger for today. seedTriggerWithStatus(mock, "gold-revenue", today, types.TriggerStatusCompleted) - // Seed baseline. + // Seed baseline (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#"+today, map[string]interface{}{ - "sensor_count": float64(100), + "quality-check": map[string]interface{}{"sensor_count": float64(100)}, }) // Seed a success job event 3h before now. @@ -3189,3 +3189,165 @@ func TestWatchdog_DryRun_SkipsAllSchedulingAndAlerts(t *testing.T) { "dry-run pipeline must not produce %s events", prohibited) } } + +// TestResolveTriggerDeadlineTime_UsesScheduleTimezone verifies that the trigger +// deadline is resolved in the schedule's timezone, not the SLA timezone. +// BUG-6: closeSensorTriggerWindow previously used cfg.SLA.Timezone exclusively, +// ignoring cfg.Schedule.Timezone. The fix prefers Schedule.Timezone with SLA as +// fallback. 
+func TestResolveTriggerDeadlineTime_UsesScheduleTimezone(t *testing.T) { + tests := []struct { + name string + deadline string + date string + timezone string + wantHour int + wantMin int + wantTZ string + }{ + { + name: "daily deadline in US/Eastern", + deadline: "09:00", + date: "2026-03-09", + timezone: "US/Eastern", + wantHour: 9, + wantMin: 0, + wantTZ: "EDT", + }, + { + name: "daily deadline in Europe/Berlin", + deadline: "09:00", + date: "2026-03-09", + timezone: "Europe/Berlin", + wantHour: 9, + wantMin: 0, + wantTZ: "CET", + }, + { + name: "hourly deadline in Asia/Tokyo", + deadline: ":45", + date: "2026-03-09T13", + timezone: "Asia/Tokyo", + wantHour: 14, // hour+1 for processing window + wantMin: 45, + wantTZ: "JST", + }, + { + name: "empty timezone falls back to UTC", + deadline: "09:00", + date: "2026-03-09", + timezone: "", + wantHour: 9, + wantMin: 0, + wantTZ: "UTC", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := lambda.ResolveTriggerDeadlineTime(tt.deadline, tt.date, tt.timezone) + require.False(t, got.IsZero(), "expected non-zero time") + assert.Equal(t, tt.wantHour, got.Hour(), "hour mismatch") + assert.Equal(t, tt.wantMin, got.Minute(), "minute mismatch") + zoneName, _ := got.Zone() + assert.Equal(t, tt.wantTZ, zoneName, "timezone mismatch") + }) + } +} + +// TestCloseSensorTriggerWindow_PrefersScheduleTimezone is an integration-level +// test verifying that closeSensorTriggerWindow resolves the trigger deadline in +// the schedule timezone rather than the SLA timezone when both are set. +// +// BUG-6: With Schedule.Timezone="US/Eastern" (UTC-4 in March) and +// SLA.Timezone="Asia/Tokyo" (UTC+9), a 09:00 trigger deadline should resolve +// to 09:00 US/Eastern (13:00 UTC), NOT 09:00 Asia/Tokyo (00:00 UTC). 
+func TestCloseSensorTriggerWindow_PrefersScheduleTimezone(t *testing.T) { + mock := newMockDDB() + d, _, ebMock := testDeps(mock) + schedMock := &mockScheduler{} + d.Scheduler = schedMock + d.SLAMonitorARN = "arn:aws:lambda:us-east-1:123:function:sla-monitor" + d.SchedulerRoleARN = "arn:aws:iam::123:role/scheduler-role" + d.SchedulerGroupName = "interlock-sla" + + // Fix time at 13:30 UTC on 2026-03-09. In US/Eastern (EDT, UTC-4), + // this is 09:30 — past the 09:00 trigger deadline. + // In Asia/Tokyo (JST, UTC+9), 09:00 JST = 00:00 UTC on 2026-03-09, + // so 13:30 UTC is also past 09:00 JST. + // + // The critical test: at 12:30 UTC (08:30 Eastern), the deadline should + // NOT have expired in the schedule timezone (US/Eastern), even though it + // would have expired if resolved in Asia/Tokyo. + beforeDeadlineUTC := time.Date(2026, 3, 9, 12, 30, 0, 0, time.UTC) + d.NowFunc = func() time.Time { return beforeDeadlineUTC } + d.StartedAt = beforeDeadlineUTC.Add(-5 * time.Minute) + + cfg := types.PipelineConfig{ + Pipeline: types.PipelineIdentity{ID: "tz-bug6-pipeline"}, + Schedule: types.ScheduleConfig{ + Timezone: "US/Eastern", // EDT = UTC-4 in March + Trigger: &types.TriggerCondition{ + Key: "sensor-data", + Check: "equals", + Field: "ready", + Value: true, + Deadline: "09:00", // 09:00 Eastern = 13:00 UTC + }, + Evaluation: types.EvaluationWindow{Window: "1h", Interval: "5m"}, + }, + SLA: &types.SLAConfig{ + Deadline: "10:00", + Timezone: "Asia/Tokyo", // JST = UTC+9; 09:00 JST = 00:00 UTC + }, + Validation: types.ValidationConfig{Trigger: "ALL"}, + Job: types.JobConfig{Type: "command", Config: map[string]interface{}{"command": "echo hello"}}, + } + seedConfig(mock, cfg) + + err := lambda.HandleWatchdog(context.Background(), d) + require.NoError(t, err) + + // At 12:30 UTC = 08:30 Eastern, the 09:00 Eastern deadline has NOT + // expired. No SENSOR_DEADLINE_EXPIRED event should be published. 
+ // (Under the old buggy code that used SLA.Timezone=Asia/Tokyo, + // 09:00 JST = 00:00 UTC, so it would have considered the deadline + // expired and published the event.) + ebMock.mu.Lock() + for _, ev := range ebMock.events { + assert.NotEqual(t, string(types.EventSensorDeadlineExpired), *ev.Entries[0].DetailType, + "deadline should NOT be expired at 08:30 Eastern (12:30 UTC)") + } + ebMock.mu.Unlock() + + // Now advance to 13:30 UTC = 09:30 Eastern — past the 09:00 Eastern deadline. + afterDeadlineUTC := time.Date(2026, 3, 9, 13, 30, 0, 0, time.UTC) + d.NowFunc = func() time.Time { return afterDeadlineUTC } + d.StartedAt = afterDeadlineUTC.Add(-5 * time.Minute) + + // Reset mock state for fresh run. + mock2 := newMockDDB() + d2, _, ebMock2 := testDeps(mock2) + d2.Scheduler = &mockScheduler{} + d2.SLAMonitorARN = d.SLAMonitorARN + d2.SchedulerRoleARN = d.SchedulerRoleARN + d2.SchedulerGroupName = d.SchedulerGroupName + d2.NowFunc = func() time.Time { return afterDeadlineUTC } + d2.StartedAt = afterDeadlineUTC.Add(-5 * time.Minute) + seedConfig(mock2, cfg) + + err = lambda.HandleWatchdog(context.Background(), d2) + require.NoError(t, err) + + // At 13:30 UTC = 09:30 Eastern, the 09:00 Eastern deadline IS expired. + // SENSOR_DEADLINE_EXPIRED should be published. 
+ ebMock2.mu.Lock() + defer ebMock2.mu.Unlock() + var found bool + for _, ev := range ebMock2.events { + if *ev.Entries[0].DetailType == string(types.EventSensorDeadlineExpired) { + found = true + break + } + } + assert.True(t, found, "expected SENSOR_DEADLINE_EXPIRED at 09:30 Eastern (13:30 UTC)") +} diff --git a/internal/trigger/airflow.go b/internal/trigger/airflow.go index f78708a..589233e 100644 --- a/internal/trigger/airflow.go +++ b/internal/trigger/airflow.go @@ -9,7 +9,6 @@ import ( "net/http" "os" "strings" - "time" "github.com/dwsmith1983/interlock/pkg/types" ) @@ -49,15 +48,7 @@ func ExecuteAirflow(ctx context.Context, cfg *types.AirflowTriggerConfig) (map[s req.Header.Set(k, os.Expand(v, safeEnvLookup)) } - client := defaultHTTPClient - if cfg.Timeout > 0 { - timeout := time.Duration(cfg.Timeout) * time.Second - if timeout != defaultTriggerTimeout { - client = &http.Client{Timeout: timeout} - } - } - - resp, err := client.Do(req) + resp, err := resolveHTTPClient(cfg.Timeout).Do(req) if err != nil { return nil, fmt.Errorf("airflow trigger: request failed: %w", err) } diff --git a/internal/trigger/airflow_test.go b/internal/trigger/airflow_test.go index 35b9cce..69d7651 100644 --- a/internal/trigger/airflow_test.go +++ b/internal/trigger/airflow_test.go @@ -26,6 +26,10 @@ func TestExecuteAirflow_Success(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -49,6 +53,10 @@ func TestExecuteAirflow_AuthHeader(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "test_dag", @@ -67,6 +75,10 @@ func TestExecuteAirflow_ServerError(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = 
srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -91,6 +103,10 @@ func TestCheckAirflowStatus_Success(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-123", nil) require.NoError(t, err) assert.Equal(t, "success", state) @@ -105,6 +121,10 @@ func TestCheckAirflowStatus_Running(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-123", nil) require.NoError(t, err) assert.Equal(t, "running", state) @@ -119,6 +139,10 @@ func TestCheckAirflowStatus_Failed(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-123", nil) require.NoError(t, err) assert.Equal(t, "failed", state) @@ -151,6 +175,10 @@ func TestExecuteAirflow_WithBody(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -183,6 +211,10 @@ func TestExecuteAirflow_MissingDagRunIDInResponse(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -201,6 +233,10 @@ func TestExecuteAirflow_CustomTimeout(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { 
defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -218,6 +254,10 @@ func TestCheckAirflowStatus_ServerError(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + _, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", nil) assert.Error(t, err) assert.Contains(t, err.Error(), "status 500") @@ -232,6 +272,10 @@ func TestCheckAirflowStatus_MissingStateField(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + _, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", nil) assert.Error(t, err) assert.Contains(t, err.Error(), "response missing state field") @@ -255,6 +299,10 @@ func TestExecuteAirflow_EnvExpansionRestricted(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "test_dag", @@ -287,6 +335,10 @@ func TestCheckAirflowStatus_EnvExpansionRestricted(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + headers := map[string]string{"Authorization": "Bearer ${INTERLOCK_TEST_VAR}/${SECRET_VAR}"} state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", headers) require.NoError(t, err) @@ -306,6 +358,10 @@ func TestCheckAirflowStatus_WithHeaders(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", map[string]string{ "Authorization": "Bearer test-token", }) diff --git 
a/internal/trigger/runner_test.go b/internal/trigger/runner_test.go index 34b12a4..97c817d 100644 --- a/internal/trigger/runner_test.go +++ b/internal/trigger/runner_test.go @@ -144,6 +144,10 @@ func TestRunner_Execute_HTTPType(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + r := NewRunner() _, err := r.Execute(context.Background(), &types.TriggerConfig{ Type: types.TriggerHTTP, @@ -170,6 +174,10 @@ func TestRunner_Execute_AirflowType(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + r := NewRunner() meta, err := r.Execute(context.Background(), &types.TriggerConfig{ Type: types.TriggerAirflow, diff --git a/internal/trigger/ssrf.go b/internal/trigger/ssrf.go new file mode 100644 index 0000000..164ccb8 --- /dev/null +++ b/internal/trigger/ssrf.go @@ -0,0 +1,49 @@ +package trigger + +import ( + "fmt" + "net" + "net/http" + "syscall" + "time" +) + +// newSSRFSafeTransport clones http.DefaultTransport (preserving HTTP/2, +// keep-alive, idle-conn settings) and replaces the dialer with one whose +// Control hook rejects connections to private/loopback/link-local IPs. 
+func newSSRFSafeTransport() *http.Transport { + base := http.DefaultTransport.(*http.Transport).Clone() + base.DialContext = (&net.Dialer{ + Timeout: base.TLSHandshakeTimeout, // match the original dialer timeout + KeepAlive: 30 * time.Second, + Control: ssrfDialControl, + }).DialContext + return base +} + +func ssrfDialControl(network, address string, _ syscall.RawConn) error { + host, _, err := net.SplitHostPort(address) + if err != nil { + return fmt.Errorf("ssrf: invalid address %q: %w", address, err) + } + ip := net.ParseIP(host) + if ip == nil { + return fmt.Errorf("ssrf: could not parse IP %q", host) + } + if isBlockedIP(ip) { + return fmt.Errorf("ssrf: connection to %s blocked (private/loopback/link-local)", ip) + } + return nil +} + +func isBlockedIP(ip net.IP) bool { + return ip.IsLoopback() || + ip.IsPrivate() || + ip.IsLinkLocalUnicast() || + ip.IsMulticast() || + ip.IsUnspecified() || + // Explicit IMDS/ECS checks — already covered by IsLinkLocalUnicast + // but kept for visibility since these are the primary SSRF targets. 
+ ip.Equal(net.ParseIP("169.254.169.254")) || + ip.Equal(net.ParseIP("169.254.170.2")) +} diff --git a/internal/trigger/ssrf_test.go b/internal/trigger/ssrf_test.go new file mode 100644 index 0000000..3d420b2 --- /dev/null +++ b/internal/trigger/ssrf_test.go @@ -0,0 +1,77 @@ +package trigger + +import ( + "net" + "testing" +) + +func TestIsBlockedIP(t *testing.T) { + blocked := []struct { + name string + ip string + }{ + {"loopback_v4", "127.0.0.1"}, + {"private_10", "10.0.0.1"}, + {"private_172", "172.16.0.1"}, + {"private_192", "192.168.1.1"}, + {"aws_imds", "169.254.169.254"}, + {"ecs_metadata", "169.254.170.2"}, + {"loopback_v6", "::1"}, + {"link_local_v6", "fe80::1"}, + {"unspecified", "0.0.0.0"}, + } + for _, tc := range blocked { + t.Run(tc.name, func(t *testing.T) { + ip := net.ParseIP(tc.ip) + if ip == nil { + t.Fatalf("failed to parse IP %s", tc.ip) + } + if !isBlockedIP(ip) { + t.Errorf("expected %s to be blocked", tc.ip) + } + }) + } + + allowed := []struct { + name string + ip string + }{ + {"google_dns", "8.8.8.8"}, + {"aws_public", "52.94.76.1"}, + {"google_v6", "2607:f8b0:4004:800::200e"}, + } + for _, tc := range allowed { + t.Run(tc.name, func(t *testing.T) { + ip := net.ParseIP(tc.ip) + if ip == nil { + t.Fatalf("failed to parse IP %s", tc.ip) + } + if isBlockedIP(ip) { + t.Errorf("expected %s to be allowed", tc.ip) + } + }) + } +} + +func TestSSRFDialControl(t *testing.T) { + t.Run("blocks_loopback", func(t *testing.T) { + err := ssrfDialControl("tcp", "127.0.0.1:80", nil) + if err == nil { + t.Error("expected error for loopback address") + } + }) + + t.Run("allows_public", func(t *testing.T) { + err := ssrfDialControl("tcp", "8.8.8.8:443", nil) + if err != nil { + t.Errorf("expected no error for public address, got: %v", err) + } + }) + + t.Run("blocks_imds", func(t *testing.T) { + err := ssrfDialControl("tcp", "169.254.169.254:80", nil) + if err == nil { + t.Error("expected error for IMDS address") + } + }) +} diff --git 
a/internal/trigger/trigger.go b/internal/trigger/trigger.go index 545a70d..cefa6fe 100644 --- a/internal/trigger/trigger.go +++ b/internal/trigger/trigger.go @@ -45,7 +45,29 @@ const maxErrorBodyBytes = 512 const defaultTriggerTimeout = 30 * time.Second // defaultHTTPClient is shared across HTTP and Airflow triggers to reuse connections. -var defaultHTTPClient = &http.Client{Timeout: defaultTriggerTimeout} +// It uses an SSRF-safe transport that rejects private, loopback, and link-local addresses. +var defaultHTTPClient = &http.Client{ + Timeout: defaultTriggerTimeout, + Transport: newSSRFSafeTransport(), +} + +// resolveHTTPClient returns a client with the given timeout in seconds. If +// timeoutSec is zero or matches the default, the shared defaultHTTPClient is +// returned to reuse connections. When a custom timeout is required, the returned +// client inherits the transport from defaultHTTPClient so that transport-level +// settings (including SSRF protection and test overrides) are preserved. +func resolveHTTPClient(timeoutSec int) *http.Client { + if timeoutSec > 0 { + timeout := time.Duration(timeoutSec) * time.Second + if timeout != defaultTriggerTimeout { + return &http.Client{ + Timeout: timeout, + Transport: defaultHTTPClient.Transport, + } + } + } + return defaultHTTPClient +} // defaultRunner provides backward-compatible package-level functions. var defaultRunner = NewRunner() @@ -60,13 +82,16 @@ func CheckStatus(ctx context.Context, triggerType types.TriggerType, metadata ma return defaultRunner.CheckStatus(ctx, triggerType, metadata, headers) } -// ExecuteCommand runs a shell command trigger. +// ExecuteCommand runs a command trigger by splitting the command string into +// arguments and executing the binary directly (no shell). This prevents shell +// metacharacter injection. 
func ExecuteCommand(ctx context.Context, command string) error { if command == "" { return fmt.Errorf("trigger command is empty") } - cmd := exec.CommandContext(ctx, "sh", "-c", command) + args := strings.Fields(command) + cmd := exec.CommandContext(ctx, args[0], args[1:]...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr return cmd.Run() @@ -94,14 +119,7 @@ func ExecuteHTTP(ctx context.Context, cfg *types.HTTPTriggerConfig) error { req.Header.Set(k, os.Expand(v, safeEnvLookup)) } - client := defaultHTTPClient - if cfg.Timeout > 0 { - timeout := time.Duration(cfg.Timeout) * time.Second - if timeout != defaultTriggerTimeout { - client = &http.Client{Timeout: timeout} - } - } - resp, err := client.Do(req) + resp, err := resolveHTTPClient(cfg.Timeout).Do(req) if err != nil { return fmt.Errorf("trigger request failed: %w", err) } diff --git a/internal/trigger/trigger_test.go b/internal/trigger/trigger_test.go index 9b1066d..84efb65 100644 --- a/internal/trigger/trigger_test.go +++ b/internal/trigger/trigger_test.go @@ -38,6 +38,10 @@ func TestExecuteHTTP_Success(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "POST", URL: srv.URL}, @@ -82,6 +86,10 @@ func TestExecuteHTTP_ErrorBodyTruncated(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -110,6 +118,10 @@ func TestExecuteHTTP_ErrorBodySanitized(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: 
&types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -154,6 +166,10 @@ func TestExecuteHTTP_EnvExpansionRestricted(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{ @@ -229,6 +245,10 @@ func TestExecuteHTTP_Returns_TriggerError_On4xx(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -249,6 +269,10 @@ func TestExecuteHTTP_Returns_TriggerError_On5xx(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -267,3 +291,18 @@ func TestExecuteCommand_EmptyCommand(t *testing.T) { assert.Error(t, err) assert.Contains(t, err.Error(), "command is empty") } + +func TestExecuteCommand_DirectExec(t *testing.T) { + err := ExecuteCommand(context.Background(), "echo hello") + require.NoError(t, err) +} + +func TestExecuteCommand_NoShellMetacharacters(t *testing.T) { + // The semicolon should be passed as a literal argument to echo, not + // interpreted as a shell command separator. With direct exec there is + // no shell to split on ";", so echo receives [";", "ls"] as arguments + // and prints them literally. If a shell were involved, "ls" would + // execute as a separate command. 
+ err := ExecuteCommand(context.Background(), "echo ; ls") + require.NoError(t, err, "echo should succeed even with ; in args") +} diff --git a/internal/validation/engine.go b/internal/validation/engine.go index 0979e1b..b693aca 100644 --- a/internal/validation/engine.go +++ b/internal/validation/engine.go @@ -4,6 +4,7 @@ package validation import ( "fmt" "strconv" + "strings" "time" "github.com/dwsmith1983/interlock/pkg/types" @@ -38,7 +39,7 @@ func EvaluateRules(mode string, rules []types.ValidationRule, sensors map[string } var passed bool - switch mode { + switch strings.ToUpper(mode) { case "ANY": passed = passCount > 0 default: // "ALL" diff --git a/internal/validation/engine_test.go b/internal/validation/engine_test.go index 2d1f7cb..1791321 100644 --- a/internal/validation/engine_test.go +++ b/internal/validation/engine_test.go @@ -448,6 +448,36 @@ func TestToFloat64_Unsupported(t *testing.T) { assert.False(t, ok) } +// --- BUG-8 characterization: lowercase "any" treated as "ALL" --- + +func TestEvaluateRules_LowercaseAny_TreatedAsAll(t *testing.T) { + // BUG-8 FIXED: lowercase "any" now works via strings.ToUpper. + rules := []types.ValidationRule{ + {Key: "s1", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + {Key: "s2", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + } + sensors := map[string]map[string]interface{}{ + "s1": {"count": float64(20)}, // passes + "s2": {"count": float64(5)}, // fails + } + result := EvaluateRules("any", rules, sensors, time.Now()) + assert.True(t, result.Passed, "BUG-8 FIXED: lowercase 'any' now works") +} + +func TestEvaluateRules_MixedCaseAny_TreatedAsAll(t *testing.T) { + // BUG-8 FIXED: mixed-case "Any" now works via strings.ToUpper. 
+ rules := []types.ValidationRule{ + {Key: "s1", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + {Key: "s2", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + } + sensors := map[string]map[string]interface{}{ + "s1": {"count": float64(20)}, + "s2": {"count": float64(5)}, + } + result := EvaluateRules("Any", rules, sensors, time.Now()) + assert.True(t, result.Passed, "BUG-8 FIXED: lowercase 'any' now works") +} + // --- EvaluateRules default mode (not "ALL" or "ANY") --- func TestEvaluateRules_DefaultMode_FallsToALL(t *testing.T) { diff --git a/pkg/types/events.go b/pkg/types/events.go index dd0c2cb..f712ea9 100644 --- a/pkg/types/events.go +++ b/pkg/types/events.go @@ -42,6 +42,10 @@ const ( EventDryRunSLAProjection EventDetailType = "DRY_RUN_SLA_PROJECTION" EventDryRunDrift EventDetailType = "DRY_RUN_DRIFT" EventDryRunCompleted EventDetailType = "DRY_RUN_COMPLETED" + EventDryRunWouldRerun EventDetailType = "DRY_RUN_WOULD_RERUN" + EventDryRunRerunRejected EventDetailType = "DRY_RUN_RERUN_REJECTED" + EventDryRunWouldRetry EventDetailType = "DRY_RUN_WOULD_RETRY" + EventDryRunRetryExhausted EventDetailType = "DRY_RUN_RETRY_EXHAUSTED" ) // EventSource is the EventBridge source for all interlock events.