diff --git a/CHANGELOG.md b/CHANGELOG.md index d12aee0..a70b591 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,28 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.9.4] - 2026-03-29 + +### Refactored + +- **Split `internal/lambda/` into handler-aligned sub-packages** — Monolithic package replaced with focused sub-packages: `orchestrator/`, `stream/`, `watchdog/`, `sla/`, `alert/`, `sink/`. +- **Extracted shared utilities into focused root files** — Common logic moved to dedicated files: publish, date, exclusion, sensor, schedule, config, terminal. +- **Trigger config registry** — Replaced `buildTriggerConfig` switch statement with generic registry map (`trigger_registry.go`). +- **SLA deadline calculations wired through `pkg/sla/`** — Pure functions for SLA deadline resolution, decoupled from Lambda handler context. + +### Added + +- **`pkg/sla/` package** — Pure SLA deadline calculation functions usable across packages. +- **`PipelineConfig.DeepCopy()` method** — Safe config cache isolation without JSON marshal/unmarshal roundtrip. +- **`EventWatchdogDegraded` event type** — Watchdog health observability event for degraded-state detection. +- **Smoke tests for all 6 `cmd/lambda/` packages** — `ValidateEnv` coverage for every Lambda entry point. + +### Fixed + +- **`HandleWatchdog` silent error suppression** — Now returns aggregate errors via `errors.Join` instead of silently returning nil. +- **`HandleWatchdog` degraded-state signaling** — Publishes `WATCHDOG_DEGRADED` event when checks fail. +- **Config cache isolation** — Uses typed `DeepCopy()` instead of JSON marshal/unmarshal roundtrip, eliminating silent data loss on unexported fields. 
+ ## [0.9.3] - 2026-03-14 ### Changed diff --git a/README.md b/README.md index 734f9dc..b75ff58 100644 --- a/README.md +++ b/README.md @@ -237,12 +237,20 @@ interlock/ │ ├── watchdog/ # Missed schedule + stale run detection │ ├── event-sink/ # EventBridge → events table │ └── alert-dispatcher/ # SQS → Slack (Bot API with threading) -├── pkg/types/ # Public domain types (pipeline config, events, DynamoDB keys) +├── pkg/ +│ ├── types/ # Public domain types (pipeline config, events, DynamoDB keys) +│ ├── validation/ # Declarative validation rule engine +│ └── sla/ # Pure SLA deadline calculations ├── internal/ -│ ├── lambda/ # Lambda handler logic + shared types +│ ├── lambda/ # Shared types, interfaces, utilities +│ │ ├── orchestrator/ # Evaluate, trigger, check-job handlers +│ │ ├── stream/ # DynamoDB stream routing, reruns, post-run +│ │ ├── watchdog/ # Stale trigger + missed schedule detection +│ │ ├── sla/ # SLA deadline calculation + alerts +│ │ ├── alert/ # Slack notification formatting +│ │ └── sink/ # EventBridge event persistence │ ├── store/ # DynamoDB storage layer (3-table design) │ ├── config/ # Pipeline YAML config loading -│ ├── validation/ # Declarative validation rule engine │ ├── trigger/ # Trigger execution (8 types) │ └── calendar/ # Calendar exclusion registry ├── deploy/ diff --git a/cmd/lambda/alert-dispatcher/main.go b/cmd/lambda/alert-dispatcher/main.go index b2fde15..85a74eb 100644 --- a/cmd/lambda/alert-dispatcher/main.go +++ b/cmd/lambda/alert-dispatcher/main.go @@ -18,6 +18,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/secretsmanager" ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/internal/lambda/alert" "github.com/dwsmith1983/interlock/internal/store" ) @@ -81,6 +82,6 @@ func main() { } lambda.Start(func(ctx context.Context, sqsEvent events.SQSEvent) (events.SQSEventResponse, error) { - return ilambda.HandleAlertDispatcher(ctx, deps, sqsEvent) + return alert.HandleAlertDispatcher(ctx, 
deps, sqsEvent) }) } diff --git a/cmd/lambda/alert-dispatcher/main_test.go b/cmd/lambda/alert-dispatcher/main_test.go new file mode 100644 index 0000000..cf25ec2 --- /dev/null +++ b/cmd/lambda/alert-dispatcher/main_test.go @@ -0,0 +1,31 @@ +package main + +import ( + "testing" + + ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestValidateEnv_MissingVars(t *testing.T) { + err := ilambda.ValidateEnv("alert-dispatcher") + assert.Error(t, err, "should report missing env vars") + assert.Contains(t, err.Error(), "SLACK_CHANNEL_ID") + assert.Contains(t, err.Error(), "EVENTS_TABLE") + assert.Contains(t, err.Error(), "EVENTS_TTL_DAYS") +} + +func TestValidateEnv_AllSet(t *testing.T) { + envVars := map[string]string{ + "SLACK_CHANNEL_ID": "C12345", + "EVENTS_TABLE": "events", + "EVENTS_TTL_DAYS": "90", + } + for k, v := range envVars { + t.Setenv(k, v) + } + + err := ilambda.ValidateEnv("alert-dispatcher") + require.NoError(t, err, "should pass when all env vars are set") +} diff --git a/cmd/lambda/event-sink/main.go b/cmd/lambda/event-sink/main.go index 7bf9ec0..889ac9b 100644 --- a/cmd/lambda/event-sink/main.go +++ b/cmd/lambda/event-sink/main.go @@ -13,6 +13,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/dynamodb" ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/internal/lambda/sink" "github.com/dwsmith1983/interlock/internal/store" ) @@ -50,6 +51,6 @@ func main() { } lambda.Start(func(ctx context.Context, input ilambda.EventBridgeInput) error { - return ilambda.HandleEventSink(ctx, deps, input) + return sink.HandleEventSink(ctx, deps, input) }) } diff --git a/cmd/lambda/event-sink/main_test.go b/cmd/lambda/event-sink/main_test.go new file mode 100644 index 0000000..70fc36d --- /dev/null +++ b/cmd/lambda/event-sink/main_test.go @@ -0,0 +1,22 @@ +package main + +import ( + "testing" + + ilambda 
"github.com/dwsmith1983/interlock/internal/lambda" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestValidateEnv_MissingVars(t *testing.T) { + err := ilambda.ValidateEnv("event-sink") + assert.Error(t, err, "should report missing env vars") + assert.Contains(t, err.Error(), "EVENTS_TABLE") +} + +func TestValidateEnv_AllSet(t *testing.T) { + t.Setenv("EVENTS_TABLE", "events") + + err := ilambda.ValidateEnv("event-sink") + require.NoError(t, err, "should pass when all env vars are set") +} diff --git a/cmd/lambda/orchestrator/main.go b/cmd/lambda/orchestrator/main.go index 54423ab..0bbe4b8 100644 --- a/cmd/lambda/orchestrator/main.go +++ b/cmd/lambda/orchestrator/main.go @@ -16,6 +16,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/sfn" ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/internal/lambda/orchestrator" "github.com/dwsmith1983/interlock/internal/store" "github.com/dwsmith1983/interlock/internal/trigger" "github.com/dwsmith1983/interlock/pkg/types" @@ -75,6 +76,6 @@ func main() { } lambda.Start(func(ctx context.Context, input ilambda.OrchestratorInput) (ilambda.OrchestratorOutput, error) { - return ilambda.HandleOrchestrator(ctx, deps, input) + return orchestrator.HandleOrchestrator(ctx, deps, input) }) } diff --git a/cmd/lambda/orchestrator/main_test.go b/cmd/lambda/orchestrator/main_test.go new file mode 100644 index 0000000..1264d26 --- /dev/null +++ b/cmd/lambda/orchestrator/main_test.go @@ -0,0 +1,33 @@ +package main + +import ( + "testing" + + ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestValidateEnv_MissingVars(t *testing.T) { + err := ilambda.ValidateEnv("orchestrator") + assert.Error(t, err, "should report missing env vars") + assert.Contains(t, err.Error(), "CONTROL_TABLE") + assert.Contains(t, err.Error(), "JOBLOG_TABLE") + assert.Contains(t, 
err.Error(), "RERUN_TABLE") + assert.Contains(t, err.Error(), "EVENT_BUS_NAME") +} + +func TestValidateEnv_AllSet(t *testing.T) { + envVars := map[string]string{ + "CONTROL_TABLE": "ctl", + "JOBLOG_TABLE": "jl", + "RERUN_TABLE": "rr", + "EVENT_BUS_NAME": "bus", + } + for k, v := range envVars { + t.Setenv(k, v) + } + + err := ilambda.ValidateEnv("orchestrator") + require.NoError(t, err, "should pass when all env vars are set") +} diff --git a/cmd/lambda/sla-monitor/main.go b/cmd/lambda/sla-monitor/main.go index 2a1ee13..482a2a3 100644 --- a/cmd/lambda/sla-monitor/main.go +++ b/cmd/lambda/sla-monitor/main.go @@ -15,6 +15,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/scheduler" ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/internal/lambda/sla" "github.com/dwsmith1983/interlock/internal/store" ) @@ -52,6 +53,6 @@ func main() { } lambda.Start(func(ctx context.Context, input ilambda.SLAMonitorInput) (ilambda.SLAMonitorOutput, error) { - return ilambda.HandleSLAMonitor(ctx, deps, input) + return sla.HandleSLAMonitor(ctx, deps, input) }) } diff --git a/cmd/lambda/sla-monitor/main_test.go b/cmd/lambda/sla-monitor/main_test.go new file mode 100644 index 0000000..ae3608c --- /dev/null +++ b/cmd/lambda/sla-monitor/main_test.go @@ -0,0 +1,39 @@ +package main + +import ( + "testing" + + ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestValidateEnv_MissingVars(t *testing.T) { + err := ilambda.ValidateEnv("sla-monitor") + assert.Error(t, err, "should report missing env vars") + assert.Contains(t, err.Error(), "CONTROL_TABLE") + assert.Contains(t, err.Error(), "JOBLOG_TABLE") + assert.Contains(t, err.Error(), "RERUN_TABLE") + assert.Contains(t, err.Error(), "EVENT_BUS_NAME") + assert.Contains(t, err.Error(), "SLA_MONITOR_ARN") + assert.Contains(t, err.Error(), "SCHEDULER_ROLE_ARN") + assert.Contains(t, err.Error(), 
"SCHEDULER_GROUP_NAME") +} + +func TestValidateEnv_AllSet(t *testing.T) { + envVars := map[string]string{ + "CONTROL_TABLE": "ctl", + "JOBLOG_TABLE": "jl", + "RERUN_TABLE": "rr", + "EVENT_BUS_NAME": "bus", + "SLA_MONITOR_ARN": "arn:aws:lambda:us-east-1:123:function:sla", + "SCHEDULER_ROLE_ARN": "arn:aws:iam::123:role/sched", + "SCHEDULER_GROUP_NAME": "interlock", + } + for k, v := range envVars { + t.Setenv(k, v) + } + + err := ilambda.ValidateEnv("sla-monitor") + require.NoError(t, err, "should pass when all env vars are set") +} diff --git a/cmd/lambda/stream-router/main.go b/cmd/lambda/stream-router/main.go index 60c2430..35332a2 100644 --- a/cmd/lambda/stream-router/main.go +++ b/cmd/lambda/stream-router/main.go @@ -18,6 +18,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/sfn" ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/internal/lambda/stream" "github.com/dwsmith1983/interlock/internal/store" ) @@ -55,6 +56,6 @@ func main() { } lambda.Start(func(ctx context.Context, event ilambda.StreamEvent) (events.DynamoDBEventResponse, error) { - return ilambda.HandleStreamEvent(ctx, deps, event) + return stream.HandleStreamEvent(ctx, deps, event) }) } diff --git a/cmd/lambda/stream-router/main_test.go b/cmd/lambda/stream-router/main_test.go new file mode 100644 index 0000000..45a7977 --- /dev/null +++ b/cmd/lambda/stream-router/main_test.go @@ -0,0 +1,35 @@ +package main + +import ( + "testing" + + ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestValidateEnv_MissingVars(t *testing.T) { + err := ilambda.ValidateEnv("stream-router") + assert.Error(t, err, "should report missing env vars") + assert.Contains(t, err.Error(), "CONTROL_TABLE") + assert.Contains(t, err.Error(), "JOBLOG_TABLE") + assert.Contains(t, err.Error(), "RERUN_TABLE") + assert.Contains(t, err.Error(), "STATE_MACHINE_ARN") + assert.Contains(t, 
err.Error(), "EVENT_BUS_NAME") +} + +func TestValidateEnv_AllSet(t *testing.T) { + envVars := map[string]string{ + "CONTROL_TABLE": "ctl", + "JOBLOG_TABLE": "jl", + "RERUN_TABLE": "rr", + "STATE_MACHINE_ARN": "arn:aws:states:us-east-1:123:stateMachine:test", + "EVENT_BUS_NAME": "bus", + } + for k, v := range envVars { + t.Setenv(k, v) + } + + err := ilambda.ValidateEnv("stream-router") + require.NoError(t, err, "should pass when all env vars are set") +} diff --git a/cmd/lambda/watchdog/main.go b/cmd/lambda/watchdog/main.go index ba607ad..ce85adf 100644 --- a/cmd/lambda/watchdog/main.go +++ b/cmd/lambda/watchdog/main.go @@ -18,6 +18,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/sfn" ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/internal/lambda/watchdog" "github.com/dwsmith1983/interlock/internal/store" ) @@ -60,6 +61,6 @@ func main() { } lambda.Start(func(ctx context.Context) error { - return ilambda.HandleWatchdog(ctx, deps) + return watchdog.HandleWatchdog(ctx, deps) }) } diff --git a/cmd/lambda/watchdog/main_test.go b/cmd/lambda/watchdog/main_test.go new file mode 100644 index 0000000..919a118 --- /dev/null +++ b/cmd/lambda/watchdog/main_test.go @@ -0,0 +1,33 @@ +package main + +import ( + "testing" + + ilambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestValidateEnv_MissingVars(t *testing.T) { + err := ilambda.ValidateEnv("watchdog") + assert.Error(t, err, "should report missing env vars") + assert.Contains(t, err.Error(), "CONTROL_TABLE") + assert.Contains(t, err.Error(), "JOBLOG_TABLE") + assert.Contains(t, err.Error(), "RERUN_TABLE") + assert.Contains(t, err.Error(), "EVENT_BUS_NAME") +} + +func TestValidateEnv_AllSet(t *testing.T) { + envVars := map[string]string{ + "CONTROL_TABLE": "ctl", + "JOBLOG_TABLE": "jl", + "RERUN_TABLE": "rr", + "EVENT_BUS_NAME": "bus", + } + for k, v := range envVars { + 
t.Setenv(k, v) + } + + err := ilambda.ValidateEnv("watchdog") + require.NoError(t, err, "should pass when all env vars are set") +} diff --git a/docs/content/docs/architecture/aws.md b/docs/content/docs/architecture/aws.md index 5d7052c..f0aabc0 100644 --- a/docs/content/docs/architecture/aws.md +++ b/docs/content/docs/architecture/aws.md @@ -127,11 +127,18 @@ When a Scheduler entry fires, it invokes this Lambda to publish the correspondin ### watchdog -Invoked by an EventBridge scheduled rule (default: every 5 minutes). Runs three independent scans: +Invoked by an EventBridge scheduled rule (default: every 5 minutes). Runs eight independent checks in a table-driven loop: 1. **Stale triggers** -- scans for `TRIGGER#` records with `RUNNING` status whose TTL has expired. Publishes `SFN_TIMEOUT` events and sets status to `FAILED_FINAL`. 2. **Missed schedules** -- loads all cron-scheduled pipeline configs, checks for missing `TRIGGER#` records for today's date. Publishes `SCHEDULE_MISSED` events for pipelines past their expected start time. -3. **Missing post-run sensors** -- for pipelines with `postRun` config and a completed trigger, checks whether post-run sensors have arrived within the `sensorTimeout` grace period. Publishes `POST_RUN_SENSOR_MISSING` events. +3. **Missed inclusion schedules** -- checks pipelines with inclusion calendar config for missing triggers on scheduled dates. Publishes `IRREGULAR_SCHEDULE_MISSED` events. +4. **Sensor-trigger reconciliation** -- re-evaluates trigger conditions for sensor-triggered pipelines. Self-heals missed triggers caused by silent completion-write failures. +5. **SLA scheduling** -- proactively creates EventBridge Scheduler entries for pipelines with SLA configs, ensuring warnings and breaches fire even when data never arrives. +6. **Trigger deadlines** -- evaluates trigger deadlines for sensor-triggered pipelines. 
Closes the auto-trigger window and publishes `SENSOR_DEADLINE_EXPIRED` when the deadline passes with no trigger. +7. **Missing post-run sensors** -- for pipelines with `postRun` config and a completed trigger, checks whether post-run sensors have arrived within the `sensorTimeout` grace period. Publishes `POST_RUN_SENSOR_MISSING` events. +8. **Relative SLA breaches** -- checks pipelines with `maxDuration` SLA config for breaches relative to the first sensor arrival time. + +If any check fails, the watchdog publishes a `WATCHDOG_DEGRADED` event listing the failed checks and returns an aggregate error. Individual check failures do not prevent the remaining checks from running. See [Watchdog](../watchdog) for the full algorithm. diff --git a/docs/content/docs/architecture/watchdog.md b/docs/content/docs/architecture/watchdog.md index 7bc5451..2b365b6 100644 --- a/docs/content/docs/architecture/watchdog.md +++ b/docs/content/docs/architecture/watchdog.md @@ -1,14 +1,19 @@ --- title: Watchdog weight: 3 -description: Detects stale trigger executions, missed cron schedules, and missing post-run sensors. +description: Detects stale triggers, missed schedules, SLA gaps, trigger deadline expiry, and missing post-run sensors. --- -The watchdog is one of four Lambda functions in the Interlock framework. It runs independently on an EventBridge schedule (default: every 5 minutes) and detects three classes of silent failures: +The watchdog is one of six Lambda functions in the Interlock framework. It runs independently on an EventBridge schedule (default: every 5 minutes) and runs eight checks in a table-driven loop to detect silent failures: 1. **Stale triggers** -- a Step Function execution started but never completed (timeout, infrastructure failure) 2. **Missed schedules** -- a cron-scheduled pipeline's expected start time passed with no trigger record -3. **Missing post-run sensors** -- a pipeline completed but the expected post-run sensor never arrived +3. 
**Missed inclusion schedules** -- a pipeline with an inclusion calendar has no trigger on a scheduled date +4. **Sensor-trigger reconciliation** -- a sensor-triggered pipeline's conditions are met but no trigger exists (self-heals missed triggers) +5. **SLA scheduling** -- proactively ensures EventBridge Scheduler entries exist for pipelines with SLA configs +6. **Trigger deadlines** -- a sensor-triggered pipeline's auto-trigger window has expired with no trigger +7. **Missing post-run sensors** -- a pipeline completed but the expected post-run sensor never arrived +8. **Relative SLA breaches** -- a pipeline with `maxDuration` SLA has exceeded its time budget since first sensor arrival In STAMP terms, these are safety constraint violations caused by _what didn't happen_ rather than what went wrong. @@ -195,7 +200,20 @@ variable "watchdog_schedule" { ## Error Handling -Both detection scans run independently. An error in stale trigger detection does not prevent missed schedule detection from running. Errors are logged but do not cause the Lambda invocation to fail, which prevents EventBridge from retrying with potentially stale state. +All eight checks run independently in sequence. An error in any check does not prevent the remaining checks from running. Errors are collected into an aggregate error and returned to the Lambda runtime. + +When one or more checks fail, the watchdog publishes a `WATCHDOG_DEGRADED` event to EventBridge listing the failed check names. This allows operators to detect partial watchdog failures through the standard alerting pipeline. 
+ +```json +{ + "source": "interlock", + "detail-type": "WATCHDOG_DEGRADED", + "detail": { + "message": "watchdog checks failed: stale-triggers, sla-scheduling", + "timestamp": "2026-03-01T12:30:00Z" + } +} +``` ## Relationship to Step Functions diff --git a/internal/config/integration_test.go b/internal/config/integration_test.go index 45f2472..93b7d3c 100644 --- a/internal/config/integration_test.go +++ b/internal/config/integration_test.go @@ -5,7 +5,7 @@ import ( "time" "github.com/dwsmith1983/interlock/internal/config" - "github.com/dwsmith1983/interlock/internal/validation" + "github.com/dwsmith1983/interlock/pkg/validation" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) diff --git a/internal/lambda/alert/format.go b/internal/lambda/alert/format.go new file mode 100644 index 0000000..60aec38 --- /dev/null +++ b/internal/lambda/alert/format.go @@ -0,0 +1,62 @@ +package alert + +import ( + "fmt" + "strings" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// FormatAlertText builds the Slack message text from event detail. 
+func FormatAlertText(detailType string, detail types.InterlockEvent) string { + emoji := alertEmoji(detailType) + header := fmt.Sprintf("%s *%s* | %s | %s", emoji, detailType, detail.PipelineID, detail.Date) + + if len(detail.Detail) == 0 { + return header + "\n" + detail.Message + } + + var parts []string + if v, ok := detail.Detail["deadline"]; ok { + if breachAt, ok2 := detail.Detail["breachAt"]; ok2 { + parts = append(parts, fmt.Sprintf("Deadline %v (%v)", v, breachAt)) + } else { + parts = append(parts, fmt.Sprintf("Deadline %v", v)) + } + } + if v, ok := detail.Detail["status"]; ok { + parts = append(parts, fmt.Sprintf("Status: %v", v)) + } + if v, ok := detail.Detail["source"]; ok { + parts = append(parts, fmt.Sprintf("Source: %v", v)) + } + if v, ok := detail.Detail["cron"]; ok { + parts = append(parts, fmt.Sprintf("Cron: %v", v)) + } + + text := header + if len(parts) > 0 { + text += "\n" + strings.Join(parts, " · ") + } + if hint, ok := detail.Detail["actionHint"]; ok { + text += fmt.Sprintf("\n→ %v", hint) + } + return text +} + +func alertEmoji(detailType string) string { + switch detailType { + case string(types.EventSLABreach), string(types.EventJobFailed), + string(types.EventValidationExhausted), string(types.EventRetryExhausted), + string(types.EventInfraFailure), string(types.EventSFNTimeout), + string(types.EventScheduleMissed), string(types.EventDataDrift), + string(types.EventJobPollExhausted): + return "\xf0\x9f\x94\xb4" // red circle + case string(types.EventSLAWarning): + return "\xf0\x9f\x9f\xa1" // yellow circle + case string(types.EventSLAMet): + return "\xe2\x9c\x85" // check mark + default: + return "\xe2\x84\xb9\xef\xb8\x8f" // info + } +} diff --git a/internal/lambda/alert/handler.go b/internal/lambda/alert/handler.go new file mode 100644 index 0000000..a0bf86f --- /dev/null +++ b/internal/lambda/alert/handler.go @@ -0,0 +1,163 @@ +// Package alert implements the alert-dispatcher Lambda handler. 
+// It processes SQS messages containing EventBridge alert events +// and sends Slack notifications. +package alert + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "github.com/aws/aws-lambda-go/events" + "github.com/aws/aws-sdk-go-v2/service/dynamodb" + ddbtypes "github.com/aws/aws-sdk-go-v2/service/dynamodb/types" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// HandleAlertDispatcher processes SQS messages containing EventBridge alert events. +func HandleAlertDispatcher(ctx context.Context, d *lambda.Deps, sqsEvent events.SQSEvent) (events.SQSEventResponse, error) { + var failures []events.SQSBatchItemFailure + + for _, record := range sqsEvent.Records { + if err := processAlertMessage(ctx, d, record); err != nil { + d.Logger.WarnContext(ctx, "failed to process alert message", + "messageId", record.MessageId, "error", err) + failures = append(failures, events.SQSBatchItemFailure{ + ItemIdentifier: record.MessageId, + }) + } + } + + return events.SQSEventResponse{BatchItemFailures: failures}, nil +} + +func processAlertMessage(ctx context.Context, d *lambda.Deps, record events.SQSMessage) error { + var envelope lambda.EventBridgeInput + if err := json.Unmarshal([]byte(record.Body), &envelope); err != nil { + return fmt.Errorf("unmarshal EventBridge envelope: %w", err) + } + + var detail types.InterlockEvent + if err := json.Unmarshal(envelope.Detail, &detail); err != nil { + return fmt.Errorf("unmarshal event detail: %w", err) + } + + if d.SlackBotToken == "" { + d.Logger.InfoContext(ctx, "alert (no bot token configured)", + "eventType", envelope.DetailType, "pipeline", detail.PipelineID, + "date", detail.Date, "message", detail.Message) + return nil + } + + threadTs := getThreadTs(ctx, d, detail.PipelineID, detail.ScheduleID, detail.Date) + + text := FormatAlertText(envelope.DetailType, detail) + + type slackPayload struct { + Channel string 
`json:"channel"` + Blocks []map[string]interface{} `json:"blocks"` + ThreadTs string `json:"thread_ts,omitempty"` + } + + payload := slackPayload{ + Channel: d.SlackChannelID, + Blocks: []map[string]interface{}{ + { + "type": "section", + "text": map[string]string{ + "type": "mrkdwn", + "text": text, + }, + }, + }, + ThreadTs: threadTs, + } + + body, err := json.Marshal(payload) + if err != nil { + return fmt.Errorf("marshal slack payload: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://slack.com/api/chat.postMessage", bytes.NewReader(body)) + if err != nil { + return fmt.Errorf("create slack request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+d.SlackBotToken) + + resp, err := d.HTTPClient.Do(req) + if err != nil { + return fmt.Errorf("post to slack: %w", err) + } + defer func() { + _, _ = io.Copy(io.Discard, resp.Body) + _ = resp.Body.Close() + }() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("slack returned status %d", resp.StatusCode) + } + + var slackResp struct { + OK bool `json:"ok"` + TS string `json:"ts"` + Channel string `json:"channel"` + Error string `json:"error"` + } + if err := json.NewDecoder(resp.Body).Decode(&slackResp); err != nil { + return fmt.Errorf("decode slack response: %w", err) + } + if !slackResp.OK { + return fmt.Errorf("slack API error: %s", slackResp.Error) + } + + if threadTs == "" { + saveThreadTs(ctx, d, detail.PipelineID, detail.ScheduleID, detail.Date, slackResp.TS, d.SlackChannelID) + } + + d.Logger.InfoContext(ctx, "alert sent to Slack", + "eventType", envelope.DetailType, "pipeline", detail.PipelineID, "date", detail.Date) + return nil +} + +func getThreadTs(ctx context.Context, d *lambda.Deps, pipelineID, scheduleID, date string) string { + result, err := d.Store.Client.GetItem(ctx, &dynamodb.GetItemInput{ + TableName: &d.Store.EventsTable, + Key: map[string]ddbtypes.AttributeValue{ + "PK": 
&ddbtypes.AttributeValueMemberS{Value: types.PipelinePK(pipelineID)}, + "SK": &ddbtypes.AttributeValueMemberS{Value: fmt.Sprintf("THREAD#%s#%s", scheduleID, date)}, + }, + }) + if err != nil { + d.Logger.WarnContext(ctx, "thread lookup failed", "error", err) + return "" + } + if ts, ok := result.Item["threadTs"].(*ddbtypes.AttributeValueMemberS); ok { + return ts.Value + } + return "" +} + +func saveThreadTs(ctx context.Context, d *lambda.Deps, pipelineID, scheduleID, date, threadTs, channelID string) { + ttl := d.Now().Add(time.Duration(d.EventsTTLDays) * 24 * time.Hour).Unix() + _, err := d.Store.Client.PutItem(ctx, &dynamodb.PutItemInput{ + TableName: &d.Store.EventsTable, + Item: map[string]ddbtypes.AttributeValue{ + "PK": &ddbtypes.AttributeValueMemberS{Value: types.PipelinePK(pipelineID)}, + "SK": &ddbtypes.AttributeValueMemberS{Value: fmt.Sprintf("THREAD#%s#%s", scheduleID, date)}, + "threadTs": &ddbtypes.AttributeValueMemberS{Value: threadTs}, + "channelId": &ddbtypes.AttributeValueMemberS{Value: channelID}, + "createdAt": &ddbtypes.AttributeValueMemberS{Value: d.Now().UTC().Format(time.RFC3339)}, + "ttl": &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%d", ttl)}, + }, + }) + if err != nil { + d.Logger.WarnContext(ctx, "failed to save thread_ts", "error", err) + } +} diff --git a/internal/lambda/alert_dispatcher.go b/internal/lambda/alert_dispatcher.go index ce257de..67a9364 100644 --- a/internal/lambda/alert_dispatcher.go +++ b/internal/lambda/alert_dispatcher.go @@ -17,8 +17,7 @@ import ( "github.com/dwsmith1983/interlock/pkg/types" ) -// HandleAlertDispatcher processes SQS messages containing EventBridge alert events -// and sends Slack notifications. +// Deprecated: Use alert.HandleAlertDispatcher instead. Retained for test compatibility. 
func HandleAlertDispatcher(ctx context.Context, d *Deps, sqsEvent events.SQSEvent) (events.SQSEventResponse, error) { var failures []events.SQSBatchItemFailure @@ -149,7 +148,7 @@ func getThreadTs(ctx context.Context, d *Deps, pipelineID, scheduleID, date stri // saveThreadTs persists a Slack thread timestamp for future message threading. // Errors are logged but don't fail the message. func saveThreadTs(ctx context.Context, d *Deps, pipelineID, scheduleID, date, threadTs, channelID string) { - ttl := d.now().Add(time.Duration(d.EventsTTLDays) * 24 * time.Hour).Unix() + ttl := d.Now().Add(time.Duration(d.EventsTTLDays) * 24 * time.Hour).Unix() _, err := d.Store.Client.PutItem(ctx, &dynamodb.PutItemInput{ TableName: &d.Store.EventsTable, Item: map[string]ddbtypes.AttributeValue{ @@ -157,7 +156,7 @@ func saveThreadTs(ctx context.Context, d *Deps, pipelineID, scheduleID, date, th "SK": &ddbtypes.AttributeValueMemberS{Value: fmt.Sprintf("THREAD#%s#%s", scheduleID, date)}, "threadTs": &ddbtypes.AttributeValueMemberS{Value: threadTs}, "channelId": &ddbtypes.AttributeValueMemberS{Value: channelID}, - "createdAt": &ddbtypes.AttributeValueMemberS{Value: d.now().UTC().Format(time.RFC3339)}, + "createdAt": &ddbtypes.AttributeValueMemberS{Value: d.Now().UTC().Format(time.RFC3339)}, "ttl": &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%d", ttl)}, }, }) @@ -166,7 +165,7 @@ func saveThreadTs(ctx context.Context, d *Deps, pipelineID, scheduleID, date, th } } -// FormatAlertText builds the Slack message text from event detail. +// Deprecated: Use alert.FormatAlertText instead. Retained for test compatibility. 
func FormatAlertText(detailType string, detail types.InterlockEvent) string { emoji := alertEmoji(detailType) header := fmt.Sprintf("%s *%s* | %s | %s", emoji, detailType, detail.PipelineID, detail.Date) diff --git a/internal/lambda/config.go b/internal/lambda/config.go new file mode 100644 index 0000000..85d70d8 --- /dev/null +++ b/internal/lambda/config.go @@ -0,0 +1,29 @@ +package lambda + +import ( + "context" + + "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" +) + +// GetValidatedConfig loads a pipeline config and validates its retry/timeout +// fields. Returns nil (with a warning log) if validation fails, signalling the +// caller to skip processing for this pipeline. +func GetValidatedConfig(ctx context.Context, d *Deps, pipelineID string) (*types.PipelineConfig, error) { + cfg, err := d.ConfigCache.Get(ctx, pipelineID) + if err != nil { + return nil, err + } + if cfg == nil { + return nil, nil + } + if errs := validation.ValidatePipelineConfig(cfg); len(errs) > 0 { + d.Logger.Warn("invalid pipeline config, skipping", + "pipelineId", pipelineID, + "errors", errs, + ) + return nil, nil + } + return cfg, nil +} diff --git a/internal/lambda/date.go b/internal/lambda/date.go new file mode 100644 index 0000000..51cbb7c --- /dev/null +++ b/internal/lambda/date.go @@ -0,0 +1,116 @@ +package lambda + +import ( + "sort" + "strconv" + "strings" + "time" +) + +// ResolveExecutionDate builds the execution date from sensor data fields. +// If both "date" and "hour" are present, returns "YYYY-MM-DDThh". +// If only "date", returns "YYYY-MM-DD". Falls back to today's date. +func ResolveExecutionDate(sensorData map[string]interface{}, now time.Time) string { + dateStr, _ := sensorData["date"].(string) + hourStr, _ := sensorData["hour"].(string) + + if dateStr == "" { + return now.Format("2006-01-02") + } + + normalized := normalizeDate(dateStr) + // Validate YYYY-MM-DD format. 
+ if _, err := time.Parse("2006-01-02", normalized); err != nil { + return now.Format("2006-01-02") + } + + if hourStr != "" { + // Validate hour is 2-digit 00-23. + if len(hourStr) == 2 { + if h, err := strconv.Atoi(hourStr); err == nil && h >= 0 && h <= 23 { + return normalized + "T" + hourStr + } + } + return normalized + } + return normalized +} + +// normalizeDate converts YYYYMMDD to YYYY-MM-DD. Already-dashed dates pass through. +func normalizeDate(s string) string { + if len(s) == 8 && !strings.Contains(s, "-") { + return s[:4] + "-" + s[4:6] + "-" + s[6:8] + } + return s +} + +// ParseExecutionDate splits a composite date into date and hour parts. +// "2026-03-03T10" -> ("2026-03-03", "10") +// "2026-03-03" -> ("2026-03-03", "") +func ParseExecutionDate(date string) (datePart, hourPart string) { + if idx := strings.Index(date, "T"); idx >= 0 { + return date[:idx], date[idx+1:] + } + return date, "" +} + +// ResolveTimezone loads the time.Location for the given timezone name. +// Returns time.UTC if tz is empty or cannot be loaded. +func ResolveTimezone(tz string) *time.Location { + if tz == "" { + return time.UTC + } + if loc, err := time.LoadLocation(tz); err == nil { + return loc + } + return time.UTC +} + +// MostRecentInclusionDate returns the most recent date from dates that is on +// or before now (comparing date only, ignoring time of day). Dates must be +// YYYY-MM-DD strings; unparseable entries are silently skipped. Returns +// ("", false) if no dates qualify. +func MostRecentInclusionDate(dates []string, now time.Time) (string, bool) { + nowDate := now.Format("2006-01-02") + best := "" + found := false + for _, d := range dates { + if _, err := time.Parse("2006-01-02", d); err != nil { + continue + } + if d <= nowDate && d > best { + best = d + found = true + } + } + return best, found +} + +// maxInclusionLookback is the maximum number of past inclusion dates to check. 
+// Caps DynamoDB reads when the watchdog has been down for an extended period. +const maxInclusionLookback = 3 + +// PastInclusionDates returns dates from the list that are on or before now, +// sorted most recent first and capped at maxInclusionLookback (3) entries. +// The cap bounds DynamoDB reads when the watchdog has been down for an +// extended period. Dates must be YYYY-MM-DD strings; unparseable entries +// are silently skipped. Returns nil if no dates qualify. +func PastInclusionDates(dates []string, now time.Time) []string { + nowDate := now.Format("2006-01-02") + var past []string + for _, d := range dates { + if _, err := time.Parse("2006-01-02", d); err != nil { + continue + } + if d <= nowDate { + past = append(past, d) + } + } + // Sort descending (most recent first) using string comparison on YYYY-MM-DD. + sort.Sort(sort.Reverse(sort.StringSlice(past))) + // Cap to maxInclusionLookback to bound downstream DynamoDB reads. + if len(past) > maxInclusionLookback { + past = past[:maxInclusionLookback] + } + return past +} diff --git a/internal/lambda/deps.go b/internal/lambda/deps.go index 43d5b5a..2704423 100644 --- a/internal/lambda/deps.go +++ b/internal/lambda/deps.go @@ -30,8 +30,8 @@ type Deps struct { Logger *slog.Logger } -// now returns the current time using NowFunc if set, otherwise time.Now. -func (d *Deps) now() time.Time { +// Now returns the current time using NowFunc if set, otherwise time.Now. +func (d *Deps) Now() time.Time { if d.NowFunc != nil { return d.NowFunc() } diff --git a/internal/lambda/dryrun.go b/internal/lambda/dryrun.go index eff6514..3e2c03d 100644 --- a/internal/lambda/dryrun.go +++ b/internal/lambda/dryrun.go @@ -7,8 +7,8 @@ import ( "strings" "time" - "github.com/dwsmith1983/interlock/internal/validation" "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" ) // handleDryRunTrigger processes a sensor event for a dry-run pipeline. 
@@ -37,7 +37,7 @@ func handleDryRunTrigger(ctx context.Context, d *Deps, cfg *types.PipelineConfig } lateBy := now.Sub(triggeredAt) - if pubErr := publishEvent(ctx, d, string(types.EventDryRunLateData), pipelineID, scheduleID, date, + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunLateData), pipelineID, scheduleID, date, fmt.Sprintf("dry-run: late data arrived %.0fm after trigger point for %s", lateBy.Minutes(), pipelineID), map[string]interface{}{ "triggeredAt": triggeredAtStr, @@ -83,7 +83,7 @@ func handleDryRunTrigger(ctx context.Context, d *Deps, cfg *types.PipelineConfig } // Publish WOULD_TRIGGER event. - if pubErr := publishEvent(ctx, d, string(types.EventDryRunWouldTrigger), pipelineID, scheduleID, date, + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunWouldTrigger), pipelineID, scheduleID, date, fmt.Sprintf("dry-run: would trigger %s at %s", pipelineID, now.Format(time.RFC3339)), map[string]interface{}{ "triggeredAt": now.UTC().Format(time.RFC3339), @@ -112,7 +112,7 @@ func handleDryRunTrigger(ctx context.Context, d *Deps, cfg *types.PipelineConfig completedDetail["slaStatus"] = "n/a" } - if pubErr := publishEvent(ctx, d, string(types.EventDryRunCompleted), pipelineID, scheduleID, date, + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunCompleted), pipelineID, scheduleID, date, fmt.Sprintf("dry-run: observation complete for %s/%s", pipelineID, date), completedDetail); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunCompleted, "error", pubErr) @@ -196,7 +196,7 @@ func publishDryRunSLAProjection(ctx context.Context, d *Deps, cfg *types.Pipelin detail["status"] = verdict.Status - if pubErr := publishEvent(ctx, d, string(types.EventDryRunSLAProjection), pipelineID, scheduleID, date, message, detail); pubErr != nil { + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunSLAProjection), pipelineID, scheduleID, date, message, detail); pubErr != nil { d.Logger.WarnContext(ctx, 
"failed to publish event", "type", types.EventDryRunSLAProjection, "error", pubErr) } @@ -206,8 +206,8 @@ func publishDryRunSLAProjection(ctx context.Context, d *Deps, cfg *types.Pipelin // handleDryRunPostRunSensor handles post-run sensor events for dry-run pipelines. // Compares sensor data against the baseline captured at WOULD_TRIGGER time. func handleDryRunPostRunSensor(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, sensorKey string, sensorData map[string]interface{}) error { - scheduleID := resolveScheduleID(cfg) - date := ResolveExecutionDate(sensorData, d.now()) + scheduleID := ResolveScheduleID(cfg) + date := ResolveExecutionDate(sensorData, d.Now()) // Check DRY_RUN# marker — if nil, no trigger happened yet. marker, err := d.Store.GetDryRunMarker(ctx, pipelineID, scheduleID, date) @@ -243,14 +243,14 @@ func handleDryRunPostRunSensor(ctx context.Context, d *Deps, cfg *types.Pipeline } // Compare drift. - driftField := resolveDriftField(cfg.PostRun) + driftField := ResolveDriftField(cfg.PostRun) threshold := 0.0 if cfg.PostRun.DriftThreshold != nil { threshold = *cfg.PostRun.DriftThreshold } dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) if dr.Drifted { - if pubErr := publishEvent(ctx, d, string(types.EventDryRunDrift), pipelineID, scheduleID, date, + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunDrift), pipelineID, scheduleID, date, fmt.Sprintf("dry-run: drift detected for %s: %.0f → %.0f — would re-run", pipelineID, dr.Previous, dr.Current), map[string]interface{}{ "previousCount": dr.Previous, diff --git a/internal/lambda/dynstream.go b/internal/lambda/dynstream.go deleted file mode 100644 index 5d45ae1..0000000 --- a/internal/lambda/dynstream.go +++ /dev/null @@ -1,301 +0,0 @@ -package lambda - -import ( - "context" - "encoding/json" - "fmt" - "sort" - "strconv" - "strings" - "time" - - "github.com/aws/aws-lambda-go/events" - "github.com/aws/aws-sdk-go-v2/service/eventbridge" - ebTypes 
"github.com/aws/aws-sdk-go-v2/service/eventbridge/types" - "github.com/dwsmith1983/interlock/pkg/types" -) - -// extractKeys returns the PK and SK string values from a DynamoDB stream record. -func extractKeys(record events.DynamoDBEventRecord) (pk, sk string) { - keys := record.Change.Keys - if pkAttr, ok := keys["PK"]; ok && pkAttr.DataType() == events.DataTypeString { - pk = pkAttr.String() - } - if skAttr, ok := keys["SK"]; ok && skAttr.DataType() == events.DataTypeString { - sk = skAttr.String() - } - return pk, sk -} - -// extractSensorData converts a DynamoDB stream NewImage to a plain map -// suitable for validation rule evaluation. If the item uses the canonical -// ControlRecord format (sensor fields nested inside a "data" map attribute), -// the "data" map is unwrapped so fields are accessible at the top level. -func extractSensorData(newImage map[string]events.DynamoDBAttributeValue) map[string]interface{} { - if newImage == nil { - return nil - } - - skipKeys := map[string]bool{"PK": true, "SK": true, "ttl": true} - result := make(map[string]interface{}, len(newImage)) - - for k, av := range newImage { - if skipKeys[k] { - continue - } - result[k] = convertAttributeValue(av) - } - - // Unwrap the "data" map if present (canonical ControlRecord sensor format). - if dataMap, ok := result["data"].(map[string]interface{}); ok { - return dataMap - } - return result -} - -// convertAttributeValue converts a DynamoDB stream attribute value to a Go native type. -func convertAttributeValue(av events.DynamoDBAttributeValue) interface{} { - switch av.DataType() { - case events.DataTypeString: - return av.String() - case events.DataTypeNumber: - // Try int first, fall back to float. 
- if i, err := strconv.ParseInt(av.Number(), 10, 64); err == nil { - return float64(i) - } - if f, err := strconv.ParseFloat(av.Number(), 64); err == nil { - return f - } - return av.Number() - case events.DataTypeBoolean: - return av.Boolean() - case events.DataTypeNull: - return nil - case events.DataTypeMap: - m := av.Map() - out := make(map[string]interface{}, len(m)) - for k, v := range m { - out[k] = convertAttributeValue(v) - } - return out - case events.DataTypeList: - l := av.List() - out := make([]interface{}, len(l)) - for i, v := range l { - out[i] = convertAttributeValue(v) - } - return out - default: - return nil - } -} - -// ResolveExecutionDate builds the execution date from sensor data fields. -// If both "date" and "hour" are present, returns "YYYY-MM-DDThh". -// If only "date", returns "YYYY-MM-DD". Falls back to today's date. -func ResolveExecutionDate(sensorData map[string]interface{}, now time.Time) string { - dateStr, _ := sensorData["date"].(string) - hourStr, _ := sensorData["hour"].(string) - - if dateStr == "" { - return now.Format("2006-01-02") - } - - normalized := normalizeDate(dateStr) - // Validate YYYY-MM-DD format. - if _, err := time.Parse("2006-01-02", normalized); err != nil { - return now.Format("2006-01-02") - } - - if hourStr != "" { - // Validate hour is 2-digit 00-23. - if len(hourStr) == 2 { - if h, err := strconv.Atoi(hourStr); err == nil && h >= 0 && h <= 23 { - return normalized + "T" + hourStr - } - } - return normalized - } - return normalized -} - -// normalizeDate converts YYYYMMDD to YYYY-MM-DD. Already-dashed dates pass through. -func normalizeDate(s string) string { - if len(s) == 8 && !strings.Contains(s, "-") { - return s[:4] + "-" + s[4:6] + "-" + s[6:8] - } - return s -} - -// resolveScheduleID returns "cron" if the pipeline uses a cron schedule, -// otherwise returns "stream". 
-func resolveScheduleID(cfg *types.PipelineConfig) string { - if cfg.Schedule.Cron != "" { - return "cron" - } - return "stream" -} - -// publishEvent sends an event to EventBridge. It is safe to call when -// EventBridge is nil or EventBusName is empty (returns nil with no action). -func publishEvent(ctx context.Context, d *Deps, eventType, pipelineID, schedule, date, message string, detail ...map[string]interface{}) error { - if d.EventBridge == nil || d.EventBusName == "" { - return nil - } - - evt := types.InterlockEvent{ - PipelineID: pipelineID, - ScheduleID: schedule, - Date: date, - Message: message, - Timestamp: d.now(), - } - if len(detail) > 0 && detail[0] != nil { - evt.Detail = detail[0] - } - detailJSON, err := json.Marshal(evt) - if err != nil { - return fmt.Errorf("marshal event detail: %w", err) - } - - source := types.EventSource - detailStr := string(detailJSON) - - out, err := d.EventBridge.PutEvents(ctx, &eventbridge.PutEventsInput{ - Entries: []ebTypes.PutEventsRequestEntry{ - { - Source: &source, - DetailType: &eventType, - Detail: &detailStr, - EventBusName: &d.EventBusName, - }, - }, - }) - if err != nil { - return fmt.Errorf("publish %s event: %w", eventType, err) - } - if out.FailedEntryCount > 0 { - code, msg := "", "" - if len(out.Entries) > 0 && out.Entries[0].ErrorCode != nil { - code = *out.Entries[0].ErrorCode - if out.Entries[0].ErrorMessage != nil { - msg = *out.Entries[0].ErrorMessage - } - } - return fmt.Errorf("publish %s event: partial failure (code=%s, message=%s)", eventType, code, msg) - } - return nil -} - -// resolveTimezone loads the time.Location for the given timezone name. -// Returns time.UTC if tz is empty or cannot be loaded. 
-func resolveTimezone(tz string) *time.Location { - if tz == "" { - return time.UTC - } - if loc, err := time.LoadLocation(tz); err == nil { - return loc - } - return time.UTC -} - -// MostRecentInclusionDate returns the most recent date from dates that is on -// or before now (comparing date only, ignoring time of day). Dates must be -// YYYY-MM-DD strings; unparseable entries are silently skipped. Returns -// ("", false) if no dates qualify. -func MostRecentInclusionDate(dates []string, now time.Time) (string, bool) { - nowDate := now.Format("2006-01-02") - best := "" - found := false - for _, d := range dates { - if _, err := time.Parse("2006-01-02", d); err != nil { - continue - } - if d <= nowDate && d > best { - best = d - found = true - } - } - return best, found -} - -// maxInclusionLookback is the maximum number of past inclusion dates to check. -// Caps DynamoDB reads when the watchdog has been down for an extended period. -const maxInclusionLookback = 3 - -// PastInclusionDates returns dates from the list that are on or before now, -// sorted most recent first and capped at maxInclusionLookback (3) entries. -// The cap bounds DynamoDB reads when the watchdog has been down for an -// extended period. Dates must be YYYY-MM-DD strings; unparseable entries -// are silently skipped. Returns nil if no dates qualify. -func PastInclusionDates(dates []string, now time.Time) []string { - nowDate := now.Format("2006-01-02") - var past []string - for _, d := range dates { - if _, err := time.Parse("2006-01-02", d); err != nil { - continue - } - if d <= nowDate { - past = append(past, d) - } - } - // Sort descending (most recent first) using string comparison on YYYY-MM-DD. - sort.Sort(sort.Reverse(sort.StringSlice(past))) - // Cap to maxInclusionLookback to bound downstream DynamoDB reads. - if len(past) > maxInclusionLookback { - past = past[:maxInclusionLookback] - } - return past -} - -// isExcludedTime is the core calendar exclusion check. 
It evaluates -// whether the given time falls on a weekend or a specifically excluded date. -func isExcludedTime(excl *types.ExclusionConfig, t time.Time) bool { - if excl == nil { - return false - } - if excl.Weekends { - day := t.Weekday() - if day == time.Saturday || day == time.Sunday { - return true - } - } - dateStr := t.Format("2006-01-02") - for _, d := range excl.Dates { - if d == dateStr { - return true - } - } - return false -} - -// isExcludedDate checks calendar exclusions against a job's execution date -// (not wall-clock time). dateStr supports "YYYY-MM-DD" and "YYYY-MM-DDTHH". -func isExcludedDate(cfg *types.PipelineConfig, dateStr string) bool { - excl := cfg.Schedule.Exclude - if excl == nil || len(dateStr) < 10 { - return false - } - loc := resolveTimezone(cfg.Schedule.Timezone) - t, err := time.ParseInLocation("2006-01-02", dateStr[:10], loc) - if err != nil { - return false - } - return isExcludedTime(excl, t) -} - -// isExcluded checks whether the pipeline should be excluded from running -// based on calendar exclusions (weekends and specific dates). -// When no timezone is configured, now is used as-is (preserving its -// original location, which is UTC in AWS Lambda). 
-func isExcluded(cfg *types.PipelineConfig, now time.Time) bool { - excl := cfg.Schedule.Exclude - if excl == nil { - return false - } - t := now - if cfg.Schedule.Timezone != "" { - t = now.In(resolveTimezone(cfg.Schedule.Timezone)) - } - return isExcludedTime(excl, t) -} diff --git a/internal/lambda/dynstream_test.go b/internal/lambda/dynstream_test.go index 73ebeaa..45a0743 100644 --- a/internal/lambda/dynstream_test.go +++ b/internal/lambda/dynstream_test.go @@ -36,7 +36,7 @@ func TestPublishEvent_PartialFailure(t *testing.T) { NowFunc: func() time.Time { return time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) }, } - err := publishEvent(context.Background(), d, "test.event", "pipeline-1", "cron", "2025-01-01", "test message") + err := PublishEvent(context.Background(), d, "test.event", "pipeline-1", "cron", "2025-01-01", "test message") if err == nil { t.Fatal("expected error for partial failure, got nil") } diff --git a/internal/lambda/event_sink.go b/internal/lambda/event_sink.go index 90ae99e..9c2d3aa 100644 --- a/internal/lambda/event_sink.go +++ b/internal/lambda/event_sink.go @@ -11,7 +11,7 @@ import ( "github.com/dwsmith1983/interlock/pkg/types" ) -// HandleEventSink writes an EventBridge event to the centralized events table. +// Deprecated: Use sink.HandleEventSink instead. Retained for test compatibility. 
func HandleEventSink(ctx context.Context, d *Deps, input EventBridgeInput) error { var detail types.InterlockEvent if err := json.Unmarshal(input.Detail, &detail); err != nil { @@ -23,10 +23,10 @@ func HandleEventSink(ctx context.Context, d *Deps, input EventBridgeInput) error if !detail.Timestamp.IsZero() { tsMillis = detail.Timestamp.UnixMilli() } else { - tsMillis = d.now().UnixMilli() + tsMillis = d.Now().UnixMilli() } - now := d.now() + now := d.Now() ttlDays := d.EventsTTLDays if ttlDays <= 0 { ttlDays = 90 diff --git a/internal/lambda/exclusion.go b/internal/lambda/exclusion.go new file mode 100644 index 0000000..5b2c64f --- /dev/null +++ b/internal/lambda/exclusion.go @@ -0,0 +1,59 @@ +package lambda + +import ( + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// IsExcludedTime is the core calendar exclusion check. It evaluates +// whether the given time falls on a weekend or a specifically excluded date. +func IsExcludedTime(excl *types.ExclusionConfig, t time.Time) bool { + if excl == nil { + return false + } + if excl.Weekends { + day := t.Weekday() + if day == time.Saturday || day == time.Sunday { + return true + } + } + dateStr := t.Format("2006-01-02") + for _, d := range excl.Dates { + if d == dateStr { + return true + } + } + return false +} + +// IsExcludedDate checks calendar exclusions against a job's execution date +// (not wall-clock time). dateStr supports "YYYY-MM-DD" and "YYYY-MM-DDTHH". +func IsExcludedDate(cfg *types.PipelineConfig, dateStr string) bool { + excl := cfg.Schedule.Exclude + if excl == nil || len(dateStr) < 10 { + return false + } + loc := ResolveTimezone(cfg.Schedule.Timezone) + t, err := time.ParseInLocation("2006-01-02", dateStr[:10], loc) + if err != nil { + return false + } + return IsExcludedTime(excl, t) +} + +// IsExcluded checks whether the pipeline should be excluded from running +// based on calendar exclusions (weekends and specific dates). 
+// When no timezone is configured, now is used as-is (preserving its +// original location, which is UTC in AWS Lambda). +func IsExcluded(cfg *types.PipelineConfig, now time.Time) bool { + excl := cfg.Schedule.Exclude + if excl == nil { + return false + } + t := now + if cfg.Schedule.Timezone != "" { + t = now.In(ResolveTimezone(cfg.Schedule.Timezone)) + } + return IsExcludedTime(excl, t) +} diff --git a/internal/lambda/export_test.go b/internal/lambda/export_test.go index 3c5f829..d1f3ead 100644 --- a/internal/lambda/export_test.go +++ b/internal/lambda/export_test.go @@ -5,14 +5,8 @@ package lambda import ( "time" - - "github.com/dwsmith1983/interlock/pkg/types" ) -// IsExcludedDate re-exports isExcludedDate for white-box unit testing from -// the external test package (package lambda_test). -var IsExcludedDate func(cfg *types.PipelineConfig, dateStr string) bool = isExcludedDate - -// ResolveTriggerDeadlineTime re-exports resolveTriggerDeadlineTime for +// ExportedResolveTriggerDeadlineTime re-exports resolveTriggerDeadlineTime for // white-box unit testing from the external test package (package lambda_test). -var ResolveTriggerDeadlineTime func(deadline, date, timezone string) time.Time = resolveTriggerDeadlineTime +var ExportedResolveTriggerDeadlineTime func(deadline, date, timezone string) time.Time = resolveTriggerDeadlineTime diff --git a/internal/lambda/orchestrator.go b/internal/lambda/orchestrator.go index 3035d90..5832c28 100644 --- a/internal/lambda/orchestrator.go +++ b/internal/lambda/orchestrator.go @@ -8,12 +8,14 @@ import ( "strings" "github.com/dwsmith1983/interlock/internal/store" - "github.com/dwsmith1983/interlock/internal/validation" "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" ) // HandleOrchestrator is the entry point for the orchestrator Lambda. -// It dispatches to one of five modes: evaluate, trigger, check-job, post-run, validation-exhausted. 
+// Retained for backward compatibility with existing tests. +// +// Deprecated: Production callers should use orchestrator.HandleOrchestrator. func HandleOrchestrator(ctx context.Context, d *Deps, input OrchestratorInput) (OrchestratorOutput, error) { switch input.Mode { case "evaluate": @@ -53,10 +55,10 @@ func handleEvaluate(ctx context.Context, d *Deps, input OrchestratorInput) (Orch RemapPerPeriodSensors(sensors, input.Date) - result := validation.EvaluateRules(cfg.Validation.Trigger, cfg.Validation.Rules, sensors, d.now()) + result := validation.EvaluateRules(cfg.Validation.Trigger, cfg.Validation.Rules, sensors, d.Now()) if result.Passed { - if err := publishEvent(ctx, d, string(types.EventValidationPassed), input.PipelineID, input.ScheduleID, input.Date, "all validation rules passed"); err != nil { + if err := PublishEvent(ctx, d, string(types.EventValidationPassed), input.PipelineID, input.ScheduleID, input.Date, "all validation rules passed"); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventValidationPassed, "error", err) } } @@ -93,8 +95,6 @@ func handleTrigger(ctx context.Context, d *Deps, input OrchestratorInput) (Orche metadata, err := d.TriggerRunner.Execute(ctx, &triggerCfg) if err != nil { errMsg := fmt.Sprintf("trigger execute: %v", err) - // Log infra failure to joblog for audit trail, then return Lambda error - // so Step Functions Retry handles exponential backoff. 
if writeErr := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, types.JobEventInfraTriggerFailure, "", 0, errMsg); writeErr != nil { d.Logger.WarnContext(ctx, "failed to write infra trigger failure to joblog", "error", writeErr, "pipeline", input.PipelineID) } @@ -103,13 +103,10 @@ func handleTrigger(ctx context.Context, d *Deps, input OrchestratorInput) (Orche runID := extractRunID(metadata) - if err := publishEvent(ctx, d, string(types.EventJobTriggered), input.PipelineID, input.ScheduleID, input.Date, fmt.Sprintf("triggered %s job", cfg.Job.Type)); err != nil { + if err := PublishEvent(ctx, d, string(types.EventJobTriggered), input.PipelineID, input.ScheduleID, input.Date, fmt.Sprintf("triggered %s job", cfg.Job.Type)); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventJobTriggered, "error", err) } - // Non-polling triggers (http, command, lambda) complete synchronously - // during Execute. Write success to joblog immediately and set a sentinel - // runId so the Step Functions CheckJob JSONPath resolves. if metadata == nil { if err := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, types.JobEventSuccess, "sync", 0, fmt.Sprintf("%s trigger completed synchronously", cfg.Job.Type)); err != nil { @@ -127,9 +124,7 @@ func handleTrigger(ctx context.Context, d *Deps, input OrchestratorInput) (Orche }, nil } -// handleCheckJob queries the job log for the latest event. If no event exists -// and a StatusChecker is configured, it polls the trigger API directly and -// writes terminal results (succeeded/failed) to the job log. +// handleCheckJob queries the job log for the latest event. 
func handleCheckJob(ctx context.Context, d *Deps, input OrchestratorInput) (OrchestratorOutput, error) { record, err := d.Store.GetLatestJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date) if err != nil { @@ -137,8 +132,6 @@ func handleCheckJob(ctx context.Context, d *Deps, input OrchestratorInput) (Orch } if record != nil { - // Only return terminal events; skip intermediate events like - // infra-trigger-failure so the StatusChecker can poll actual job status. switch record.Event { case types.JobEventSuccess, types.JobEventFail, types.JobEventTimeout: return OrchestratorOutput{ @@ -148,7 +141,6 @@ func handleCheckJob(ctx context.Context, d *Deps, input OrchestratorInput) (Orch } } - // No terminal joblog entry — try polling the trigger API directly. if d.StatusChecker == nil || len(input.Metadata) == 0 { return OrchestratorOutput{Mode: "check-job"}, nil } @@ -172,9 +164,6 @@ func handleCheckJob(ctx context.Context, d *Deps, input OrchestratorInput) (Orch if err := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, types.JobEventSuccess, input.RunID, 0, ""); err != nil { d.Logger.Warn("failed to write polled job success joblog", "error", err, "pipeline", input.PipelineID, "schedule", input.ScheduleID, "date", input.Date) } - // JOB_COMPLETED is published by the stream-router when the JOB# - // record arrives via DynamoDB stream (handleJobSuccess). Publishing - // here as well would cause duplicate alerts for polled jobs. 
return OrchestratorOutput{Mode: "check-job", Event: "success"}, nil case "failed": var writeOpts []store.JobEventOption @@ -184,12 +173,11 @@ func handleCheckJob(ctx context.Context, d *Deps, input OrchestratorInput) (Orch if err := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, types.JobEventFail, input.RunID, 0, result.Message, writeOpts...); err != nil { d.Logger.Warn("failed to write polled job failure joblog", "error", err, "pipeline", input.PipelineID, "schedule", input.ScheduleID, "date", input.Date) } - if err := publishEvent(ctx, d, string(types.EventJobFailed), input.PipelineID, input.ScheduleID, input.Date, "job failed: "+result.Message); err != nil { + if err := PublishEvent(ctx, d, string(types.EventJobFailed), input.PipelineID, input.ScheduleID, input.Date, "job failed: "+result.Message); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventJobFailed, "error", err) } return OrchestratorOutput{Mode: "check-job", Event: "fail"}, nil default: - // Still running — return no event so SFN loops back to WaitForJob. 
return OrchestratorOutput{Mode: "check-job"}, nil } } @@ -219,7 +207,7 @@ func handleValidationExhausted(ctx context.Context, d *Deps, input OrchestratorI return OrchestratorOutput{}, fmt.Errorf("write validation-exhausted joblog: %w", err) } - if err := publishEvent(ctx, d, string(types.EventValidationExhausted), input.PipelineID, input.ScheduleID, input.Date, "evaluation window exhausted without passing"); err != nil { + if err := PublishEvent(ctx, d, string(types.EventValidationExhausted), input.PipelineID, input.ScheduleID, input.Date, "evaluation window exhausted without passing"); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventValidationExhausted, "error", err) } @@ -237,7 +225,7 @@ func handleJobPollExhausted(ctx context.Context, d *Deps, input OrchestratorInpu return OrchestratorOutput{}, fmt.Errorf("write job-poll-exhausted joblog: %w", err) } - if err := publishEvent(ctx, d, string(types.EventJobPollExhausted), input.PipelineID, input.ScheduleID, input.Date, + if err := PublishEvent(ctx, d, string(types.EventJobPollExhausted), input.PipelineID, input.ScheduleID, input.Date, "job poll window exhausted"); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventJobPollExhausted, "error", err) } @@ -248,36 +236,6 @@ func handleJobPollExhausted(ctx context.Context, d *Deps, input OrchestratorInpu }, nil } -// RemapPerPeriodSensors adds base-key aliases for per-period sensor keys. -// For example, sensor "hourly-status#20260303T07" becomes accessible under -// key "hourly-status" when the execution date is "2026-03-03T07". This allows -// validation rules with key "hourly-status" to match per-period sensor records. -// Handles both normalized (2026-03-03) and compact (20260303) date formats. -func RemapPerPeriodSensors(sensors map[string]map[string]interface{}, date string) { - if date == "" { - return - } - // Build candidate suffixes: the normalized date and compact form. 
- suffixes := []string{"#" + date} - compact := strings.ReplaceAll(date, "-", "") - if compact != date { - suffixes = append(suffixes, "#"+compact) - } - additions := make(map[string]map[string]interface{}) - for key, data := range sensors { - for _, suffix := range suffixes { - if strings.HasSuffix(key, suffix) { - base := strings.TrimSuffix(key, suffix) - additions[base] = data - break - } - } - } - for k, v := range additions { - sensors[k] = v - } -} - // handleTriggerExhausted publishes RETRY_EXHAUSTED when trigger retries are // exhausted, writes a joblog entry for audit, and releases the trigger lock // so the pipeline can be re-triggered. @@ -287,18 +245,16 @@ func handleTriggerExhausted(ctx context.Context, d *Deps, input OrchestratorInpu errMsg = cause } - // Dual-write: joblog entry (audit) + EventBridge event (alerting). if err := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, types.JobEventInfraTriggerExhausted, "", 0, errMsg); err != nil { return OrchestratorOutput{}, fmt.Errorf("write trigger-exhausted joblog: %w", err) } - if err := publishEvent(ctx, d, string(types.EventRetryExhausted), input.PipelineID, input.ScheduleID, input.Date, + if err := PublishEvent(ctx, d, string(types.EventRetryExhausted), input.PipelineID, input.ScheduleID, input.Date, fmt.Sprintf("trigger retries exhausted for %s: %s", input.PipelineID, errMsg)); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventRetryExhausted, "error", err) } - // Release lock so pipeline can be re-triggered. if err := d.Store.ReleaseTriggerLock(ctx, input.PipelineID, input.ScheduleID, input.Date); err != nil { d.Logger.WarnContext(ctx, "failed to release trigger lock after exhaustion", "pipeline", input.PipelineID, "error", err) @@ -311,9 +267,6 @@ func handleTriggerExhausted(ctx context.Context, d *Deps, input OrchestratorInpu } // handleCompleteTrigger sets the trigger row to its terminal status. 
-// Success → COMPLETED; fail/timeout → FAILED_FINAL. -// On success with PostRun configured, captures a date-scoped baseline snapshot -// of all sensors for later drift comparison by the stream-based post-run evaluator. func handleCompleteTrigger(ctx context.Context, d *Deps, input OrchestratorInput) (OrchestratorOutput, error) { status := types.TriggerStatusCompleted if input.Event != types.JobEventSuccess { @@ -324,12 +277,11 @@ func handleCompleteTrigger(ctx context.Context, d *Deps, input OrchestratorInput return OrchestratorOutput{}, fmt.Errorf("set trigger status: %w", err) } - // On success, capture post-run baseline for drift detection. if input.Event == types.JobEventSuccess { if err := capturePostRunBaseline(ctx, d, input.PipelineID, input.ScheduleID, input.Date); err != nil { d.Logger.WarnContext(ctx, "failed to capture post-run baseline", "pipeline", input.PipelineID, "error", err) - if pubErr := publishEvent(ctx, d, string(types.EventBaselineCaptureFailed), input.PipelineID, input.ScheduleID, input.Date, + if pubErr := PublishEvent(ctx, d, string(types.EventBaselineCaptureFailed), input.PipelineID, input.ScheduleID, input.Date, fmt.Sprintf("baseline capture failed for %s: %v", input.PipelineID, err)); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish baseline capture failure event", "error", pubErr) } @@ -343,8 +295,7 @@ func handleCompleteTrigger(ctx context.Context, d *Deps, input OrchestratorInput } // capturePostRunBaseline reads all sensors and writes a date-scoped baseline -// snapshot if the pipeline has PostRun config. The baseline is stored as -// "postrun-baseline#" so drift detection can compare against it. +// snapshot if the pipeline has PostRun config. 
func capturePostRunBaseline(ctx context.Context, d *Deps, pipelineID, scheduleID, date string) error { cfg, err := d.Store.GetConfig(ctx, pipelineID) if err != nil { @@ -361,8 +312,6 @@ func capturePostRunBaseline(ctx context.Context, d *Deps, pipelineID, scheduleID RemapPerPeriodSensors(sensors, date) - // Build baseline from post-run rule keys, namespaced by rule key - // to prevent field name collisions between different sensors. baseline := make(map[string]interface{}) for _, rule := range cfg.PostRun.Rules { if data, ok := sensors[rule.Key]; ok { @@ -379,7 +328,7 @@ func capturePostRunBaseline(ctx context.Context, d *Deps, pipelineID, scheduleID return fmt.Errorf("write baseline: %w", err) } - if err := publishEvent(ctx, d, string(types.EventPostRunBaselineCaptured), pipelineID, scheduleID, date, + if err := PublishEvent(ctx, d, string(types.EventPostRunBaselineCaptured), pipelineID, scheduleID, date, fmt.Sprintf("post-run baseline captured for %s", pipelineID)); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunBaselineCaptured, "error", err) } @@ -387,8 +336,9 @@ func capturePostRunBaseline(ctx context.Context, d *Deps, pipelineID, scheduleID return nil } -// buildTriggerConfig converts a JobConfig into a TriggerConfig by -// JSON-marshalling the config map and unmarshalling it into the typed sub-struct. +// buildTriggerConfig converts a JobConfig into a TriggerConfig. +// It delegates to the canonical TriggerUnmarshalers registry defined in +// trigger_registry.go. 
func buildTriggerConfig(job types.JobConfig) (types.TriggerConfig, error) { tc := types.TriggerConfig{Type: job.Type} @@ -401,64 +351,13 @@ func buildTriggerConfig(job types.JobConfig) (types.TriggerConfig, error) { return tc, fmt.Errorf("marshal job config: %w", err) } - switch job.Type { - case types.TriggerHTTP: - var c types.HTTPTriggerConfig - if err := json.Unmarshal(data, &c); err != nil { - return tc, fmt.Errorf("unmarshal http config: %w", err) - } - tc.HTTP = &c - case types.TriggerCommand: - var c types.CommandTriggerConfig - if err := json.Unmarshal(data, &c); err != nil { - return tc, fmt.Errorf("unmarshal command config: %w", err) - } - tc.Command = &c - case types.TriggerAirflow: - var c types.AirflowTriggerConfig - if err := json.Unmarshal(data, &c); err != nil { - return tc, fmt.Errorf("unmarshal airflow config: %w", err) - } - tc.Airflow = &c - case types.TriggerGlue: - var c types.GlueTriggerConfig - if err := json.Unmarshal(data, &c); err != nil { - return tc, fmt.Errorf("unmarshal glue config: %w", err) - } - tc.Glue = &c - case types.TriggerEMR: - var c types.EMRTriggerConfig - if err := json.Unmarshal(data, &c); err != nil { - return tc, fmt.Errorf("unmarshal emr config: %w", err) - } - tc.EMR = &c - case types.TriggerEMRServerless: - var c types.EMRServerlessTriggerConfig - if err := json.Unmarshal(data, &c); err != nil { - return tc, fmt.Errorf("unmarshal emr-serverless config: %w", err) - } - tc.EMRServerless = &c - case types.TriggerStepFunction: - var c types.StepFunctionTriggerConfig - if err := json.Unmarshal(data, &c); err != nil { - return tc, fmt.Errorf("unmarshal step-function config: %w", err) - } - tc.StepFunction = &c - case types.TriggerDatabricks: - var c types.DatabricksTriggerConfig - if err := json.Unmarshal(data, &c); err != nil { - return tc, fmt.Errorf("unmarshal databricks config: %w", err) - } - tc.Databricks = &c - case types.TriggerLambda: - var c types.LambdaTriggerConfig - if err := json.Unmarshal(data, &c); err != 
nil { - return tc, fmt.Errorf("unmarshal lambda config: %w", err) - } - tc.Lambda = &c - default: + unmarshal, ok := TriggerUnmarshalers[job.Type] + if !ok { return tc, fmt.Errorf("unsupported trigger type: %s", job.Type) } + if err := unmarshal(data, &tc); err != nil { + return tc, fmt.Errorf("unmarshal %s config: %w", job.Type, err) + } return tc, nil } @@ -468,7 +367,6 @@ func extractRunID(metadata map[string]interface{}) string { if metadata == nil { return "" } - // Priority order of common identifier keys across trigger types. for _, key := range []string{"runId", "jobRunId", "glue_job_run_id", "executionArn", "stepId", "dagRunId"} { if v, ok := metadata[key]; ok { if s, ok := v.(string); ok && s != "" { @@ -480,8 +378,10 @@ func extractRunID(metadata map[string]interface{}) string { } // InjectDateArgs parses the execution date and injects --par_day (and --par_hour -// for hourly dates) into Glue trigger arguments. For HTTP triggers with no -// explicit body, injects a JSON body with par_day and par_hour. +// for hourly dates) into Glue trigger arguments. +// Retained for backward compatibility with existing tests. +// +// Deprecated: Production callers should use orchestrator.InjectDateArgs. func InjectDateArgs(tc *types.TriggerConfig, date string) { datePart, hourPart := ParseExecutionDate(date) parDay := strings.ReplaceAll(datePart, "-", "") @@ -501,17 +401,7 @@ func InjectDateArgs(tc *types.TriggerConfig, date string) { if hourPart != "" { payload["par_hour"] = hourPart } - b, _ := json.Marshal(payload) // json.Marshal is infallible for map[string]string (no channels, funcs, or complex types) + b, _ := json.Marshal(payload) tc.HTTP.Body = string(b) } } - -// ParseExecutionDate splits a composite date into date and hour parts. 
-// "2026-03-03T10" -> ("2026-03-03", "10") -// "2026-03-03" -> ("2026-03-03", "") -func ParseExecutionDate(date string) (datePart, hourPart string) { - if idx := strings.Index(date, "T"); idx >= 0 { - return date[:idx], date[idx+1:] - } - return date, "" -} diff --git a/internal/lambda/orchestrator/checkjob.go b/internal/lambda/orchestrator/checkjob.go new file mode 100644 index 0000000..79f4e01 --- /dev/null +++ b/internal/lambda/orchestrator/checkjob.go @@ -0,0 +1,77 @@ +package orchestrator + +import ( + "context" + + "github.com/dwsmith1983/interlock/internal/store" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// handleCheckJob queries the job log for the latest event. If no event exists +// and a StatusChecker is configured, it polls the trigger API directly and +// writes terminal results (succeeded/failed) to the job log. +func handleCheckJob(ctx context.Context, d *lambda.Deps, input lambda.OrchestratorInput) (lambda.OrchestratorOutput, error) { + record, err := d.Store.GetLatestJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date) + if err != nil { + return lambda.OrchestratorOutput{Mode: "check-job", Error: err.Error()}, nil + } + + if record != nil { + // Only return terminal events; skip intermediate events like + // infra-trigger-failure so the StatusChecker can poll actual job status. + switch record.Event { + case types.JobEventSuccess, types.JobEventFail, types.JobEventTimeout: + return lambda.OrchestratorOutput{ + Mode: "check-job", + Event: record.Event, + }, nil + } + } + + // No terminal joblog entry — try polling the trigger API directly. 
+ if d.StatusChecker == nil || len(input.Metadata) == 0 { + return lambda.OrchestratorOutput{Mode: "check-job"}, nil + } + + cfg, err := d.Store.GetConfig(ctx, input.PipelineID) + if err != nil { + return lambda.OrchestratorOutput{Mode: "check-job", Error: err.Error()}, nil + } + if cfg == nil { + return lambda.OrchestratorOutput{Mode: "check-job"}, nil + } + + result, err := d.StatusChecker.CheckStatus(ctx, cfg.Job.Type, input.Metadata, nil) + if err != nil { + d.Logger.WarnContext(ctx, "status check failed", "error", err, "pipeline", input.PipelineID) + return lambda.OrchestratorOutput{Mode: "check-job"}, nil + } + + switch result.State { + case "succeeded": + if err := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, types.JobEventSuccess, input.RunID, 0, ""); err != nil { + d.Logger.Warn("failed to write polled job success joblog", "error", err, "pipeline", input.PipelineID, "schedule", input.ScheduleID, "date", input.Date) + } + // JOB_COMPLETED is published by the stream-router when the JOB# + // record arrives via DynamoDB stream (handleJobSuccess). Publishing + // here as well would cause duplicate alerts for polled jobs. 
+ return lambda.OrchestratorOutput{Mode: "check-job", Event: "success"}, nil + case "failed": + var writeOpts []store.JobEventOption + if result.FailureCategory != "" { + writeOpts = append(writeOpts, store.WithFailureCategory(result.FailureCategory)) + } + if err := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, types.JobEventFail, input.RunID, 0, result.Message, writeOpts...); err != nil { + d.Logger.Warn("failed to write polled job failure joblog", "error", err, "pipeline", input.PipelineID, "schedule", input.ScheduleID, "date", input.Date) + } + if err := lambda.PublishEvent(ctx, d, string(types.EventJobFailed), input.PipelineID, input.ScheduleID, input.Date, "job failed: "+result.Message); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventJobFailed, "error", err) + } + return lambda.OrchestratorOutput{Mode: "check-job", Event: "fail"}, nil + default: + // Still running — return no event so SFN loops back to WaitForJob. + return lambda.OrchestratorOutput{Mode: "check-job"}, nil + } +} diff --git a/internal/lambda/orchestrator/complete.go b/internal/lambda/orchestrator/complete.go new file mode 100644 index 0000000..350be17 --- /dev/null +++ b/internal/lambda/orchestrator/complete.go @@ -0,0 +1,41 @@ +package orchestrator + +import ( + "context" + "fmt" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// handleCompleteTrigger sets the trigger row to its terminal status. +// Success -> COMPLETED; fail/timeout -> FAILED_FINAL. +// On success with PostRun configured, captures a date-scoped baseline snapshot +// of all sensors for later drift comparison by the stream-based post-run evaluator. 
+func handleCompleteTrigger(ctx context.Context, d *lambda.Deps, input lambda.OrchestratorInput) (lambda.OrchestratorOutput, error) { + status := types.TriggerStatusCompleted + if input.Event != types.JobEventSuccess { + status = types.TriggerStatusFailedFinal + } + + if err := d.Store.SetTriggerStatus(ctx, input.PipelineID, input.ScheduleID, input.Date, status); err != nil { + return lambda.OrchestratorOutput{}, fmt.Errorf("set trigger status: %w", err) + } + + // On success, capture post-run baseline for drift detection. + if input.Event == types.JobEventSuccess { + if err := lambda.CapturePostRunBaseline(ctx, d, input.PipelineID, input.ScheduleID, input.Date); err != nil { + d.Logger.WarnContext(ctx, "failed to capture post-run baseline", + "pipeline", input.PipelineID, "error", err) + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventBaselineCaptureFailed), input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("baseline capture failed for %s: %v", input.PipelineID, err)); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish baseline capture failure event", "error", pubErr) + } + } + } + + return lambda.OrchestratorOutput{ + Mode: "complete-trigger", + Status: status, + }, nil +} diff --git a/internal/lambda/orchestrator/evaluate.go b/internal/lambda/orchestrator/evaluate.go new file mode 100644 index 0000000..284cb5e --- /dev/null +++ b/internal/lambda/orchestrator/evaluate.go @@ -0,0 +1,48 @@ +package orchestrator + +import ( + "context" + "fmt" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" +) + +// handleEvaluate fetches config and sensors, evaluates validation rules, and +// optionally publishes a VALIDATION_PASSED event. 
+func handleEvaluate(ctx context.Context, d *lambda.Deps, input lambda.OrchestratorInput) (lambda.OrchestratorOutput, error) { + cfg, err := d.Store.GetConfig(ctx, input.PipelineID) + if err != nil { + return lambda.OrchestratorOutput{Mode: "evaluate", Error: err.Error()}, nil + } + if cfg == nil { + return lambda.OrchestratorOutput{Mode: "evaluate", Error: fmt.Sprintf("config not found for pipeline %q", input.PipelineID)}, nil + } + + sensors, err := d.Store.GetAllSensors(ctx, input.PipelineID) + if err != nil { + return lambda.OrchestratorOutput{Mode: "evaluate", Error: err.Error()}, nil + } + + lambda.RemapPerPeriodSensors(sensors, input.Date) + + result := validation.EvaluateRules(cfg.Validation.Trigger, cfg.Validation.Rules, sensors, d.Now()) + + if result.Passed { + if err := lambda.PublishEvent(ctx, d, string(types.EventValidationPassed), input.PipelineID, input.ScheduleID, input.Date, "all validation rules passed"); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventValidationPassed, "error", err) + } + } + + status := "not_ready" + if result.Passed { + status = "passed" + } + + return lambda.OrchestratorOutput{ + Mode: "evaluate", + Status: status, + Results: result.Results, + }, nil +} diff --git a/internal/lambda/orchestrator/exhausted.go b/internal/lambda/orchestrator/exhausted.go new file mode 100644 index 0000000..9bacecf --- /dev/null +++ b/internal/lambda/orchestrator/exhausted.go @@ -0,0 +1,77 @@ +package orchestrator + +import ( + "context" + "fmt" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// handleValidationExhausted publishes a VALIDATION_EXHAUSTED event when +// the evaluation window closes without all rules passing. 
+func handleValidationExhausted(ctx context.Context, d *lambda.Deps, input lambda.OrchestratorInput) (lambda.OrchestratorOutput, error) { + if err := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, types.JobEventValidationExhausted, "", 0, "evaluation window exhausted without passing"); err != nil { + return lambda.OrchestratorOutput{}, fmt.Errorf("write validation-exhausted joblog: %w", err) + } + + if err := lambda.PublishEvent(ctx, d, string(types.EventValidationExhausted), input.PipelineID, input.ScheduleID, input.Date, "evaluation window exhausted without passing"); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventValidationExhausted, "error", err) + } + + return lambda.OrchestratorOutput{ + Mode: "validation-exhausted", + Status: "exhausted", + }, nil +} + +// handleJobPollExhausted publishes a JOB_POLL_EXHAUSTED event when the +// job poll window closes without the job reaching a terminal state. +func handleJobPollExhausted(ctx context.Context, d *lambda.Deps, input lambda.OrchestratorInput) (lambda.OrchestratorOutput, error) { + if err := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, + types.JobEventJobPollExhausted, input.RunID, 0, "job poll window exhausted"); err != nil { + return lambda.OrchestratorOutput{}, fmt.Errorf("write job-poll-exhausted joblog: %w", err) + } + + if err := lambda.PublishEvent(ctx, d, string(types.EventJobPollExhausted), input.PipelineID, input.ScheduleID, input.Date, + "job poll window exhausted"); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventJobPollExhausted, "error", err) + } + + return lambda.OrchestratorOutput{ + Mode: "job-poll-exhausted", + Status: "exhausted", + }, nil +} + +// handleTriggerExhausted publishes RETRY_EXHAUSTED when trigger retries are +// exhausted, writes a joblog entry for audit, and releases the trigger lock +// so the pipeline can be re-triggered. 
+func handleTriggerExhausted(ctx context.Context, d *lambda.Deps, input lambda.OrchestratorInput) (lambda.OrchestratorOutput, error) { + errMsg := "" + if cause, ok := input.ErrorInfo["Cause"].(string); ok { + errMsg = cause + } + + // Dual-write: joblog entry (audit) + EventBridge event (alerting). + if err := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, + types.JobEventInfraTriggerExhausted, "", 0, errMsg); err != nil { + return lambda.OrchestratorOutput{}, fmt.Errorf("write trigger-exhausted joblog: %w", err) + } + + if err := lambda.PublishEvent(ctx, d, string(types.EventRetryExhausted), input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("trigger retries exhausted for %s: %s", input.PipelineID, errMsg)); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventRetryExhausted, "error", err) + } + + // Release lock so pipeline can be re-triggered. + if err := d.Store.ReleaseTriggerLock(ctx, input.PipelineID, input.ScheduleID, input.Date); err != nil { + d.Logger.WarnContext(ctx, "failed to release trigger lock after exhaustion", + "pipeline", input.PipelineID, "error", err) + } + + return lambda.OrchestratorOutput{ + Mode: "trigger-exhausted", + Status: "exhausted", + }, nil +} diff --git a/internal/lambda/orchestrator/handler.go b/internal/lambda/orchestrator/handler.go new file mode 100644 index 0000000..61a0f38 --- /dev/null +++ b/internal/lambda/orchestrator/handler.go @@ -0,0 +1,34 @@ +// Package orchestrator implements the multi-mode orchestrator Lambda handler. +// It dispatches to evaluate, trigger, check-job, and lifecycle management modes +// delegated by the Step Function state machine. +package orchestrator + +import ( + "context" + "fmt" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" +) + +// HandleOrchestrator is the entry point for the orchestrator Lambda. +// It dispatches to one of seven modes: evaluate, trigger, check-job, validation-exhausted, trigger-exhausted, complete-trigger, job-poll-exhausted. 
+func HandleOrchestrator(ctx context.Context, d *lambda.Deps, input lambda.OrchestratorInput) (lambda.OrchestratorOutput, error) { + switch input.Mode { + case "evaluate": + return handleEvaluate(ctx, d, input) + case "trigger": + return handleTrigger(ctx, d, input) + case "check-job": + return handleCheckJob(ctx, d, input) + case "validation-exhausted": + return handleValidationExhausted(ctx, d, input) + case "trigger-exhausted": + return handleTriggerExhausted(ctx, d, input) + case "complete-trigger": + return handleCompleteTrigger(ctx, d, input) + case "job-poll-exhausted": + return handleJobPollExhausted(ctx, d, input) + default: + return lambda.OrchestratorOutput{}, fmt.Errorf("unknown orchestrator mode: %q", input.Mode) + } +} diff --git a/internal/lambda/orchestrator/trigger.go b/internal/lambda/orchestrator/trigger.go new file mode 100644 index 0000000..b330541 --- /dev/null +++ b/internal/lambda/orchestrator/trigger.go @@ -0,0 +1,135 @@ +package orchestrator + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// handleTrigger builds a TriggerConfig from the JobConfig, executes it, +// publishes JOB_TRIGGERED, and returns the run ID. 
+func handleTrigger(ctx context.Context, d *lambda.Deps, input lambda.OrchestratorInput) (lambda.OrchestratorOutput, error) { + cfg, err := d.Store.GetConfig(ctx, input.PipelineID) + if err != nil { + return lambda.OrchestratorOutput{Mode: "trigger", Error: err.Error()}, nil + } + if cfg == nil { + return lambda.OrchestratorOutput{Mode: "trigger", Error: fmt.Sprintf("config not found for pipeline %q", input.PipelineID)}, nil + } + + triggerCfg, err := BuildTriggerConfig(cfg.Job) + if err != nil { + return lambda.OrchestratorOutput{Mode: "trigger", Error: fmt.Sprintf("build trigger config: %v", err)}, nil + } + InjectDateArgs(&triggerCfg, input.Date) + + metadata, err := d.TriggerRunner.Execute(ctx, &triggerCfg) + if err != nil { + errMsg := fmt.Sprintf("trigger execute: %v", err) + // Log infra failure to joblog for audit trail, then return Lambda error + // so Step Functions Retry handles exponential backoff. + if writeErr := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, types.JobEventInfraTriggerFailure, "", 0, errMsg); writeErr != nil { + d.Logger.WarnContext(ctx, "failed to write infra trigger failure to joblog", "error", writeErr, "pipeline", input.PipelineID) + } + return lambda.OrchestratorOutput{}, fmt.Errorf("%s", errMsg) + } + + runID := ExtractRunID(metadata) + + if err := lambda.PublishEvent(ctx, d, string(types.EventJobTriggered), input.PipelineID, input.ScheduleID, input.Date, fmt.Sprintf("triggered %s job", cfg.Job.Type)); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventJobTriggered, "error", err) + } + + // Non-polling triggers (http, command, lambda) complete synchronously + // during Execute. Write success to joblog immediately and set a sentinel + // runId so the Step Functions CheckJob JSONPath resolves. 
+ if metadata == nil { + if err := d.Store.WriteJobEvent(ctx, input.PipelineID, input.ScheduleID, input.Date, + types.JobEventSuccess, "sync", 0, fmt.Sprintf("%s trigger completed synchronously", cfg.Job.Type)); err != nil { + d.Logger.Warn("failed to write sync job success joblog", "error", err, "pipeline", input.PipelineID, "schedule", input.ScheduleID, "date", input.Date) + } + runID = "sync" + metadata = map[string]interface{}{"completedSync": true} + } + + return lambda.OrchestratorOutput{ + Mode: "trigger", + RunID: runID, + JobType: string(cfg.Job.Type), + Metadata: metadata, + }, nil +} + +// BuildTriggerConfig converts a JobConfig into a TriggerConfig by +// JSON-marshalling the config map and unmarshalling it into the typed sub-struct. +// It delegates to the canonical TriggerUnmarshalers registry in the parent +// lambda package to avoid duplication. +func BuildTriggerConfig(job types.JobConfig) (types.TriggerConfig, error) { + tc := types.TriggerConfig{Type: job.Type} + + if len(job.Config) == 0 { + return tc, nil + } + + data, err := json.Marshal(job.Config) + if err != nil { + return tc, fmt.Errorf("marshal job config: %w", err) + } + + unmarshal, ok := lambda.TriggerUnmarshalers[job.Type] + if !ok { + return tc, fmt.Errorf("unsupported trigger type: %s", job.Type) + } + if err := unmarshal(data, &tc); err != nil { + return tc, fmt.Errorf("unmarshal %s config: %w", job.Type, err) + } + + return tc, nil +} + +// ExtractRunID searches trigger metadata for a recognisable run identifier. +func ExtractRunID(metadata map[string]interface{}) string { + if metadata == nil { + return "" + } + // Priority order of common identifier keys across trigger types. 
+ for _, key := range []string{"runId", "jobRunId", "glue_job_run_id", "executionArn", "stepId", "dagRunId"} { + if v, ok := metadata[key]; ok { + if s, ok := v.(string); ok && s != "" { + return s + } + } + } + return "" +} + +// InjectDateArgs parses the execution date and injects --par_day (and --par_hour +// for hourly dates) into Glue trigger arguments. For HTTP triggers with no +// explicit body, injects a JSON body with par_day and par_hour. +func InjectDateArgs(tc *types.TriggerConfig, date string) { + datePart, hourPart := lambda.ParseExecutionDate(date) + parDay := strings.ReplaceAll(datePart, "-", "") + + if tc.Glue != nil { + if tc.Glue.Arguments == nil { + tc.Glue.Arguments = make(map[string]string) + } + tc.Glue.Arguments["--par_day"] = parDay + if hourPart != "" { + tc.Glue.Arguments["--par_hour"] = hourPart + } + } + + if tc.HTTP != nil && tc.HTTP.Body == "" { + payload := map[string]string{"par_day": parDay} + if hourPart != "" { + payload["par_hour"] = hourPart + } + b, _ := json.Marshal(payload) // json.Marshal is infallible for map[string]string (no channels, funcs, or complex types) + tc.HTTP.Body = string(b) + } +} diff --git a/internal/lambda/postrun.go b/internal/lambda/postrun.go index 727badd..9bf0cbe 100644 --- a/internal/lambda/postrun.go +++ b/internal/lambda/postrun.go @@ -5,24 +5,24 @@ import ( "fmt" "strings" - "github.com/dwsmith1983/interlock/internal/validation" "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" ) // defaultDriftField is the sensor field used for drift comparison when // PostRunConfig.DriftField is not set. 
const defaultDriftField = "sensor_count" -func resolveDriftField(cfg *types.PostRunConfig) string { +func ResolveDriftField(cfg *types.PostRunConfig) string { if cfg.DriftField != "" { return cfg.DriftField } return defaultDriftField } -// matchesPostRunRule returns true if the sensor key matches any post-run rule key +// MatchesPostRunRule returns true if the sensor key matches any post-run rule key // (prefix match to support per-period sensor keys). -func matchesPostRunRule(sensorKey string, rules []types.ValidationRule) bool { +func MatchesPostRunRule(sensorKey string, rules []types.ValidationRule) bool { for _, rule := range rules { if strings.HasPrefix(sensorKey, rule.Key) { return true @@ -40,8 +40,8 @@ func handlePostRunSensorEvent(ctx context.Context, d *Deps, cfg *types.PipelineC return handleDryRunPostRunSensor(ctx, d, cfg, pipelineID, sensorKey, sensorData) } - scheduleID := resolveScheduleID(cfg) - date := ResolveExecutionDate(sensorData, d.now()) + scheduleID := ResolveScheduleID(cfg) + date := ResolveExecutionDate(sensorData, d.Now()) // Consistent read to handle race where sensor stream event arrives // before SFN sets trigger to COMPLETED. @@ -95,14 +95,14 @@ func handlePostRunInflight(ctx context.Context, d *Deps, cfg *types.PipelineConf return nil // No baseline for this rule (stale or first run). 
} - driftField := resolveDriftField(cfg.PostRun) + driftField := ResolveDriftField(cfg.PostRun) threshold := 0.0 if cfg.PostRun.DriftThreshold != nil { threshold = *cfg.PostRun.DriftThreshold } dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) if dr.Drifted { - if err := publishEvent(ctx, d, string(types.EventPostRunDriftInflight), pipelineID, scheduleID, date, + if err := PublishEvent(ctx, d, string(types.EventPostRunDriftInflight), pipelineID, scheduleID, date, fmt.Sprintf("inflight drift detected for %s: %.0f → %.0f (informational)", pipelineID, dr.Previous, dr.Current), map[string]interface{}{ "previousCount": dr.Previous, @@ -144,14 +144,14 @@ func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineCon } if ruleBaseline != nil { - driftField := resolveDriftField(cfg.PostRun) + driftField := ResolveDriftField(cfg.PostRun) threshold := 0.0 if cfg.PostRun.DriftThreshold != nil { threshold = *cfg.PostRun.DriftThreshold } dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) if dr.Drifted { - if err := publishEvent(ctx, d, string(types.EventPostRunDrift), pipelineID, scheduleID, date, + if err := PublishEvent(ctx, d, string(types.EventPostRunDrift), pipelineID, scheduleID, date, fmt.Sprintf("post-run drift detected for %s: %.0f → %.0f records", pipelineID, dr.Previous, dr.Current), map[string]interface{}{ "previousCount": dr.Previous, @@ -167,8 +167,8 @@ func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineCon // Trigger rerun via the existing circuit breaker path only if the // execution date is not excluded by the pipeline's calendar config. 
- if isExcludedDate(cfg, date) { - if pubErr := publishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleID, date, + if IsExcludedDate(cfg, date) { + if pubErr := PublishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleID, date, fmt.Sprintf("post-run drift rerun skipped for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) } @@ -192,15 +192,15 @@ func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineCon } RemapPerPeriodSensors(sensors, date) - result := validation.EvaluateRules("ALL", cfg.PostRun.Rules, sensors, d.now()) + result := validation.EvaluateRules("ALL", cfg.PostRun.Rules, sensors, d.Now()) if result.Passed { - if err := publishEvent(ctx, d, string(types.EventPostRunPassed), pipelineID, scheduleID, date, + if err := PublishEvent(ctx, d, string(types.EventPostRunPassed), pipelineID, scheduleID, date, fmt.Sprintf("post-run validation passed for %s", pipelineID)); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunPassed, "error", err) } } else { - if err := publishEvent(ctx, d, string(types.EventPostRunFailed), pipelineID, scheduleID, date, + if err := PublishEvent(ctx, d, string(types.EventPostRunFailed), pipelineID, scheduleID, date, fmt.Sprintf("post-run validation failed for %s", pipelineID)); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunFailed, "error", err) } diff --git a/internal/lambda/postrun_baseline.go b/internal/lambda/postrun_baseline.go new file mode 100644 index 0000000..ea7ab24 --- /dev/null +++ b/internal/lambda/postrun_baseline.go @@ -0,0 +1,53 @@ +package lambda + +import ( + "context" + "fmt" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// CapturePostRunBaseline reads all sensors and writes a date-scoped baseline +// snapshot if the pipeline 
has PostRun config. The baseline is stored as +// "postrun-baseline#" so drift detection can compare against it. +func CapturePostRunBaseline(ctx context.Context, d *Deps, pipelineID, scheduleID, date string) error { + cfg, err := d.Store.GetConfig(ctx, pipelineID) + if err != nil { + return fmt.Errorf("get config: %w", err) + } + if cfg == nil || cfg.PostRun == nil || len(cfg.PostRun.Rules) == 0 { + return nil + } + + sensors, err := d.Store.GetAllSensors(ctx, pipelineID) + if err != nil { + return fmt.Errorf("get sensors: %w", err) + } + + RemapPerPeriodSensors(sensors, date) + + // Build baseline from post-run rule keys, namespaced by rule key + // to prevent field name collisions between different sensors. + baseline := make(map[string]interface{}) + for _, rule := range cfg.PostRun.Rules { + if data, ok := sensors[rule.Key]; ok { + baseline[rule.Key] = data + } + } + + if len(baseline) == 0 { + return nil + } + + baselineKey := "postrun-baseline#" + date + if err := d.Store.WriteSensor(ctx, pipelineID, baselineKey, baseline); err != nil { + return fmt.Errorf("write baseline: %w", err) + } + + if err := PublishEvent(ctx, d, string(types.EventPostRunBaselineCaptured), pipelineID, scheduleID, date, + fmt.Sprintf("post-run baseline captured for %s", pipelineID)); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunBaselineCaptured, "error", err) + } + + return nil +} diff --git a/internal/lambda/publish.go b/internal/lambda/publish.go new file mode 100644 index 0000000..34e5b49 --- /dev/null +++ b/internal/lambda/publish.go @@ -0,0 +1,62 @@ +package lambda + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/aws/aws-sdk-go-v2/service/eventbridge" + ebTypes "github.com/aws/aws-sdk-go-v2/service/eventbridge/types" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// PublishEvent sends an event to EventBridge. It is safe to call when +// EventBridge is nil or EventBusName is empty (returns nil with no action). 
+func PublishEvent(ctx context.Context, d *Deps, eventType, pipelineID, schedule, date, message string, detail ...map[string]interface{}) error { + if d.EventBridge == nil || d.EventBusName == "" { + return nil + } + + evt := types.InterlockEvent{ + PipelineID: pipelineID, + ScheduleID: schedule, + Date: date, + Message: message, + Timestamp: d.Now(), + } + if len(detail) > 0 && detail[0] != nil { + evt.Detail = detail[0] + } + detailJSON, err := json.Marshal(evt) + if err != nil { + return fmt.Errorf("marshal event detail: %w", err) + } + + source := types.EventSource + detailStr := string(detailJSON) + + out, err := d.EventBridge.PutEvents(ctx, &eventbridge.PutEventsInput{ + Entries: []ebTypes.PutEventsRequestEntry{ + { + Source: &source, + DetailType: &eventType, + Detail: &detailStr, + EventBusName: &d.EventBusName, + }, + }, + }) + if err != nil { + return fmt.Errorf("publish %s event: %w", eventType, err) + } + if out.FailedEntryCount > 0 { + code, msg := "", "" + if len(out.Entries) > 0 && out.Entries[0].ErrorCode != nil { + code = *out.Entries[0].ErrorCode + if out.Entries[0].ErrorMessage != nil { + msg = *out.Entries[0].ErrorMessage + } + } + return fmt.Errorf("publish %s event: partial failure (code=%s, message=%s)", eventType, code, msg) + } + return nil +} diff --git a/internal/lambda/rerun.go b/internal/lambda/rerun.go index cb94e76..0b45dd7 100644 --- a/internal/lambda/rerun.go +++ b/internal/lambda/rerun.go @@ -25,7 +25,7 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even return err } - cfg, err := getValidatedConfig(ctx, d, pipelineID) + cfg, err := GetValidatedConfig(ctx, d, pipelineID) if err != nil { return fmt.Errorf("load config for %q: %w", pipelineID, err) } @@ -41,11 +41,11 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even } // --- Calendar exclusion check (execution date) --- - if isExcludedDate(cfg, date) { + if IsExcludedDate(cfg, date) { if err := 
d.Store.WriteJobEvent(ctx, pipelineID, schedule, date, types.JobEventRerunRejected, "", 0, "excluded by calendar"); err != nil { d.Logger.Warn("failed to write rerun-rejected joblog for calendar exclusion", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) } - if pubErr := publishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, schedule, date, + if pubErr := PublishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, schedule, date, fmt.Sprintf("rerun blocked for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) } @@ -87,7 +87,7 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even types.JobEventRerunRejected, "", 0, limitLabel); err != nil { d.Logger.Warn("failed to write rerun-rejected joblog for limit exceeded", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) } - if err := publishEvent(ctx, d, string(types.EventRerunRejected), pipelineID, schedule, date, + if err := PublishEvent(ctx, d, string(types.EventRerunRejected), pipelineID, schedule, date, fmt.Sprintf("rerun rejected for %s: %s", pipelineID, limitLabel)); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventRerunRejected, "error", err) } @@ -121,7 +121,7 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even types.JobEventRerunRejected, "", 0, rejectReason); err != nil { d.Logger.Warn("failed to write rerun-rejected joblog for circuit breaker", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) } - if err := publishEvent(ctx, d, string(types.EventRerunRejected), pipelineID, schedule, date, + if err := PublishEvent(ctx, d, string(types.EventRerunRejected), pipelineID, schedule, date, fmt.Sprintf("rerun rejected for %s: %s", pipelineID, rejectReason)); err != nil { 
d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventRerunRejected, "error", err) } @@ -137,7 +137,7 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even return fmt.Errorf("reset trigger lock for %q: %w", pipelineID, err) } if !acquired { - if pubErr := publishEvent(ctx, d, string(types.EventInfraFailure), pipelineID, schedule, date, + if pubErr := PublishEvent(ctx, d, string(types.EventInfraFailure), pipelineID, schedule, date, fmt.Sprintf("lock reset failed for rerun of %s", pipelineID)); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish event", "error", pubErr) } @@ -168,13 +168,13 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even d.Logger.Warn("failed to write rerun-accepted joblog", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) } - if pubErr := publishEvent(ctx, d, string(types.EventRerunAccepted), pipelineID, schedule, date, + if pubErr := PublishEvent(ctx, d, string(types.EventRerunAccepted), pipelineID, schedule, date, fmt.Sprintf("rerun accepted for %s (reason: %s)", pipelineID, reason)); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventRerunAccepted, "error", pubErr) } - execName := truncateExecName(fmt.Sprintf("%s-%s-%s-%s-rerun-%d", pipelineID, schedule, date, reason, d.now().Unix())) - if err := startSFNWithName(ctx, d, cfg, pipelineID, schedule, date, execName); err != nil { + execName := TruncateExecName(fmt.Sprintf("%s-%s-%s-%s-rerun-%d", pipelineID, schedule, date, reason, d.Now().Unix())) + if err := StartSFNWithName(ctx, d, cfg, pipelineID, schedule, date, execName); err != nil { if relErr := d.Store.ReleaseTriggerLock(ctx, pipelineID, schedule, date); relErr != nil { d.Logger.Warn("failed to release lock after SFN start failure", "error", relErr) } @@ -200,7 +200,7 @@ func parseRerunRequestSK(sk string) (schedule, date string, err error) { // handleJobFailure processes a job failure or 
timeout by either re-running // the pipeline (if under the retry limit) or marking it as permanently failed. func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, jobEvent string) error { - cfg, err := getValidatedConfig(ctx, d, pipelineID) + cfg, err := GetValidatedConfig(ctx, d, pipelineID) if err != nil { return fmt.Errorf("load config for %q: %w", pipelineID, err) } @@ -237,7 +237,7 @@ func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, if rerunCount >= maxRetries { // Retry limit reached — publish exhaustion event and mark as final failure. - if err := publishEvent(ctx, d, string(types.EventRetryExhausted), pipelineID, schedule, date, + if err := PublishEvent(ctx, d, string(types.EventRetryExhausted), pipelineID, schedule, date, fmt.Sprintf("retry limit reached (%d/%d) for %s", rerunCount, maxRetries, pipelineID)); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventRetryExhausted, "error", err) } @@ -258,11 +258,11 @@ func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, // Calendar exclusion check: skip retry if the execution date is excluded. // Mark trigger as terminal so the lock doesn't silently expire via TTL. 
- if isExcludedDate(cfg, date) { + if IsExcludedDate(cfg, date) { if err := d.Store.SetTriggerStatus(ctx, pipelineID, schedule, date, types.TriggerStatusFailedFinal); err != nil { d.Logger.WarnContext(ctx, "failed to set trigger status after calendar exclusion", "error", err) } - if pubErr := publishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, schedule, date, + if pubErr := PublishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, schedule, date, fmt.Sprintf("job failure retry skipped for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) } @@ -286,8 +286,8 @@ func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, } // Use a unique execution name that includes the rerun attempt number. - execName := truncateExecName(fmt.Sprintf("%s-%s-%s-rerun-%d", pipelineID, schedule, date, attempt)) - if err := startSFNWithName(ctx, d, cfg, pipelineID, schedule, date, execName); err != nil { + execName := TruncateExecName(fmt.Sprintf("%s-%s-%s-rerun-%d", pipelineID, schedule, date, attempt)) + if err := StartSFNWithName(ctx, d, cfg, pipelineID, schedule, date, execName); err != nil { if relErr := d.Store.ReleaseTriggerLock(ctx, pipelineID, schedule, date); relErr != nil { d.Logger.Warn("failed to release lock after SFN start failure", "error", relErr) } @@ -308,8 +308,8 @@ func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, // side effects. Mirrors the production handleRerunRequest logic. func handleDryRunRerunRequest(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, schedule, date string, record events.DynamoDBEventRecord) error { // Calendar exclusion check. 
- if isExcludedDate(cfg, date) { - if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + if IsExcludedDate(cfg, date) { + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, fmt.Sprintf("dry-run: rerun rejected for %s: execution date %s excluded by calendar", pipelineID, date), map[string]interface{}{ "reason": "excluded by calendar", @@ -347,7 +347,7 @@ func handleDryRunRerunRequest(ctx context.Context, d *Deps, cfg *types.PipelineC } if count >= budget { - if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, fmt.Sprintf("dry-run: rerun rejected for %s: limit exceeded (%d/%d)", pipelineID, count, budget), map[string]interface{}{ "reason": "limit exceeded", @@ -374,7 +374,7 @@ func handleDryRunRerunRequest(ctx context.Context, d *Deps, cfg *types.PipelineC return fmt.Errorf("dry-run: check sensor freshness for %q: %w", pipelineID, freshErr) } if !fresh { - if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, fmt.Sprintf("dry-run: rerun rejected for %s: previous run succeeded and no sensor data has changed", pipelineID), map[string]interface{}{ "reason": "circuit breaker", @@ -387,7 +387,7 @@ func handleDryRunRerunRequest(ctx context.Context, d *Deps, cfg *types.PipelineC } // All checks pass — publish would-rerun event. 
- if pubErr := publishEvent(ctx, d, string(types.EventDryRunWouldRerun), pipelineID, schedule, date, + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunWouldRerun), pipelineID, schedule, date, fmt.Sprintf("dry-run: would rerun %s (reason: %s)", pipelineID, reason), map[string]interface{}{ "reason": reason, @@ -427,7 +427,7 @@ func handleDryRunJobFailure(ctx context.Context, d *Deps, cfg *types.PipelineCon } if rerunCount >= maxRetries { - if pubErr := publishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, fmt.Sprintf("dry-run: retry limit reached (%d/%d) for %s", rerunCount, maxRetries, pipelineID), map[string]interface{}{ "retries": rerunCount, @@ -439,8 +439,8 @@ func handleDryRunJobFailure(ctx context.Context, d *Deps, cfg *types.PipelineCon } // Calendar exclusion check. - if isExcludedDate(cfg, date) { - if pubErr := publishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, + if IsExcludedDate(cfg, date) { + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, fmt.Sprintf("dry-run: retry skipped for %s: execution date %s excluded by calendar", pipelineID, date), map[string]interface{}{ "reason": "excluded by calendar", @@ -453,7 +453,7 @@ func handleDryRunJobFailure(ctx context.Context, d *Deps, cfg *types.PipelineCon } // Under budget — publish would-retry event. 
- if pubErr := publishEvent(ctx, d, string(types.EventDryRunWouldRetry), pipelineID, schedule, date, + if pubErr := PublishEvent(ctx, d, string(types.EventDryRunWouldRetry), pipelineID, schedule, date, fmt.Sprintf("dry-run: would retry %s (%d/%d)", pipelineID, rerunCount, maxRetries), map[string]interface{}{ "retries": rerunCount, @@ -569,7 +569,7 @@ func checkLateDataArrival(ctx context.Context, d *Deps, pipelineID, schedule, da d.Logger.Warn("failed to write late-data-arrival joblog", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) } - if err := publishEvent(ctx, d, string(types.EventLateDataArrival), pipelineID, schedule, date, + if err := PublishEvent(ctx, d, string(types.EventLateDataArrival), pipelineID, schedule, date, fmt.Sprintf("late data arrival for %s: sensor updated after job completion", pipelineID)); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventLateDataArrival, "error", err) } diff --git a/internal/lambda/schedule.go b/internal/lambda/schedule.go new file mode 100644 index 0000000..ff2ec42 --- /dev/null +++ b/internal/lambda/schedule.go @@ -0,0 +1,33 @@ +package lambda + +import ( + "os" + "strconv" + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// ResolveScheduleID returns "cron" if the pipeline uses a cron schedule, +// otherwise returns "stream". +func ResolveScheduleID(cfg *types.PipelineConfig) string { + if cfg.Schedule.Cron != "" { + return "cron" + } + return "stream" +} + +// ResolveTriggerLockTTL returns the trigger lock TTL based on the +// SFN_TIMEOUT_SECONDS env var plus a 30-minute buffer. Defaults to +// 4h30m if the env var is not set or invalid. 
+func ResolveTriggerLockTTL() time.Duration { + s := os.Getenv("SFN_TIMEOUT_SECONDS") + if s == "" { + return DefaultTriggerLockTTL + } + sec, err := strconv.Atoi(s) + if err != nil || sec <= 0 { + return DefaultTriggerLockTTL + } + return time.Duration(sec)*time.Second + TriggerLockBuffer +} diff --git a/internal/lambda/sensor.go b/internal/lambda/sensor.go new file mode 100644 index 0000000..d717b9f --- /dev/null +++ b/internal/lambda/sensor.go @@ -0,0 +1,113 @@ +package lambda + +import ( + "strconv" + "strings" + + "github.com/aws/aws-lambda-go/events" +) + +// ExtractKeys returns the PK and SK string values from a DynamoDB stream record. +func ExtractKeys(record events.DynamoDBEventRecord) (pk, sk string) { + keys := record.Change.Keys + if pkAttr, ok := keys["PK"]; ok && pkAttr.DataType() == events.DataTypeString { + pk = pkAttr.String() + } + if skAttr, ok := keys["SK"]; ok && skAttr.DataType() == events.DataTypeString { + sk = skAttr.String() + } + return pk, sk +} + +// ExtractSensorData converts a DynamoDB stream NewImage to a plain map +// suitable for validation rule evaluation. If the item uses the canonical +// ControlRecord format (sensor fields nested inside a "data" map attribute), +// the "data" map is unwrapped so fields are accessible at the top level. +func ExtractSensorData(newImage map[string]events.DynamoDBAttributeValue) map[string]interface{} { + if newImage == nil { + return nil + } + + skipKeys := map[string]bool{"PK": true, "SK": true, "ttl": true} + result := make(map[string]interface{}, len(newImage)) + + for k, av := range newImage { + if skipKeys[k] { + continue + } + result[k] = ConvertAttributeValue(av) + } + + // Unwrap the "data" map if present (canonical ControlRecord sensor format). + if dataMap, ok := result["data"].(map[string]interface{}); ok { + return dataMap + } + return result +} + +// ConvertAttributeValue converts a DynamoDB stream attribute value to a Go native type. 
+func ConvertAttributeValue(av events.DynamoDBAttributeValue) interface{} { + switch av.DataType() { + case events.DataTypeString: + return av.String() + case events.DataTypeNumber: + // Numbers normalize to float64 (consistent with encoding/json); unparseable values fall through as the raw string. + if i, err := strconv.ParseInt(av.Number(), 10, 64); err == nil { + return float64(i) + } + if f, err := strconv.ParseFloat(av.Number(), 64); err == nil { + return f + } + return av.Number() + case events.DataTypeBoolean: + return av.Boolean() + case events.DataTypeNull: + return nil + case events.DataTypeMap: + m := av.Map() + out := make(map[string]interface{}, len(m)) + for k, v := range m { + out[k] = ConvertAttributeValue(v) + } + return out + case events.DataTypeList: + l := av.List() + out := make([]interface{}, len(l)) + for i, v := range l { + out[i] = ConvertAttributeValue(v) + } + return out + default: + return nil + } +} + +// RemapPerPeriodSensors adds base-key aliases for per-period sensor keys. +// For example, sensor "hourly-status#20260303T07" becomes accessible under +// key "hourly-status" when the execution date is "2026-03-03T07". This allows +// validation rules with key "hourly-status" to match per-period sensor records. +// Handles both normalized (2026-03-03) and compact (20260303) date formats. +func RemapPerPeriodSensors(sensors map[string]map[string]interface{}, date string) { + if date == "" { + return + } + // Build candidate suffixes: the normalized date and compact form. 
+ suffixes := []string{"#" + date} + compact := strings.ReplaceAll(date, "-", "") + if compact != date { + suffixes = append(suffixes, "#"+compact) + } + additions := make(map[string]map[string]interface{}) + for key, data := range sensors { + for _, suffix := range suffixes { + if strings.HasSuffix(key, suffix) { + base := strings.TrimSuffix(key, suffix) + additions[base] = data + break + } + } + } + for k, v := range additions { + sensors[k] = v + } +} diff --git a/internal/lambda/sfn.go b/internal/lambda/sfn.go index 2942e9d..4901f4b 100644 --- a/internal/lambda/sfn.go +++ b/internal/lambda/sfn.go @@ -29,8 +29,8 @@ type sfnConfig struct { SLA *types.SLAConfig `json:"sla,omitempty"` } -// buildSFNConfig converts a PipelineConfig into the config block for the SFN input. -func buildSFNConfig(cfg *types.PipelineConfig) sfnConfig { +// BuildSFNConfig converts a PipelineConfig into the config block for the SFN input. +func BuildSFNConfig(cfg *types.PipelineConfig) sfnConfig { sc := sfnConfig{ EvaluationIntervalSeconds: DefaultEvalIntervalSec, EvaluationWindowSeconds: DefaultEvalWindowSec, @@ -60,34 +60,34 @@ func buildSFNConfig(cfg *types.PipelineConfig) sfnConfig { return sc } -// truncateExecName ensures an SFN execution name does not exceed the 80-character +// TruncateExecName ensures an SFN execution name does not exceed the 80-character // AWS limit. When truncation is needed the suffix (date + timestamp) is preserved // by trimming characters from the beginning of the name. -func truncateExecName(name string) string { +func TruncateExecName(name string) string { if len(name) <= SFNExecNameMaxLen { return name } return name[len(name)-SFNExecNameMaxLen:] } -// startSFN starts a Step Function execution with a unique execution name. +// StartSFN starts a Step Function execution with a unique execution name. // The name includes a Unix timestamp suffix to avoid ExecutionAlreadyExists // errors when a previous execution for the same pipeline/schedule/date failed. 
-func startSFN(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date string) error { - name := truncateExecName(fmt.Sprintf("%s-%s-%s-%d", pipelineID, scheduleID, date, d.now().Unix())) - return startSFNWithName(ctx, d, cfg, pipelineID, scheduleID, date, name) +func StartSFN(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date string) error { + name := TruncateExecName(fmt.Sprintf("%s-%s-%s-%d", pipelineID, scheduleID, date, d.Now().Unix())) + return StartSFNWithName(ctx, d, cfg, pipelineID, scheduleID, date, name) } -// startSFNWithName starts a Step Function execution with a custom execution name. +// StartSFNWithName starts a Step Function execution with a custom execution name. // Defense-in-depth: refuses to start if the pipeline is in dry-run mode. -func startSFNWithName(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date, name string) error { +func StartSFNWithName(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date, name string) error { if cfg.DryRun { - d.Logger.Warn("startSFNWithName called for dry-run pipeline, suppressing execution", + d.Logger.Warn("StartSFNWithName called for dry-run pipeline, suppressing execution", "pipelineId", pipelineID, "schedule", scheduleID, "date", date) return nil } - sc := buildSFNConfig(cfg) + sc := BuildSFNConfig(cfg) // Warn if the sum of evaluation + poll windows exceeds the SFN timeout. totalWindowSec := sc.EvaluationWindowSeconds + sc.JobPollWindowSeconds diff --git a/internal/lambda/sink/handler.go b/internal/lambda/sink/handler.go new file mode 100644 index 0000000..6082038 --- /dev/null +++ b/internal/lambda/sink/handler.go @@ -0,0 +1,62 @@ +// Package sink implements the event-sink Lambda handler. +// It writes EventBridge events to DynamoDB for audit trail. 
+package sink + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/aws/aws-sdk-go-v2/service/dynamodb" + ddbtypes "github.com/aws/aws-sdk-go-v2/service/dynamodb/types" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// HandleEventSink writes an EventBridge event to the centralized events table. +func HandleEventSink(ctx context.Context, d *lambda.Deps, input lambda.EventBridgeInput) error { + var detail types.InterlockEvent + if err := json.Unmarshal(input.Detail, &detail); err != nil { + return fmt.Errorf("unmarshal event detail: %w", err) + } + + var tsMillis int64 + if !detail.Timestamp.IsZero() { + tsMillis = detail.Timestamp.UnixMilli() + } else { + tsMillis = d.Now().UnixMilli() + } + + now := d.Now() + ttlDays := d.EventsTTLDays + if ttlDays <= 0 { + ttlDays = 90 + } + ttl := now.Add(time.Duration(ttlDays) * 24 * time.Hour).Unix() + sk := fmt.Sprintf("%d#%s", tsMillis, input.DetailType) + + item := map[string]ddbtypes.AttributeValue{ + "PK": &ddbtypes.AttributeValueMemberS{Value: types.PipelinePK(detail.PipelineID)}, + "SK": &ddbtypes.AttributeValueMemberS{Value: sk}, + "eventType": &ddbtypes.AttributeValueMemberS{Value: input.DetailType}, + "pipelineId": &ddbtypes.AttributeValueMemberS{Value: detail.PipelineID}, + "scheduleId": &ddbtypes.AttributeValueMemberS{Value: detail.ScheduleID}, + "date": &ddbtypes.AttributeValueMemberS{Value: detail.Date}, + "message": &ddbtypes.AttributeValueMemberS{Value: detail.Message}, + "timestamp": &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%d", tsMillis)}, + "ttl": &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%d", ttl)}, + } + + _, err := d.Store.Client.PutItem(ctx, &dynamodb.PutItemInput{ + TableName: &d.Store.EventsTable, + Item: item, + }) + if err != nil { + return fmt.Errorf("write event to events table (pipeline=%s type=%s): %w", detail.PipelineID, input.DetailType, err) + } + + d.Logger.InfoContext(ctx, "event 
written", "pipeline", detail.PipelineID, "eventType", input.DetailType, "sk", sk) + return nil +} diff --git a/internal/lambda/sla/alert.go b/internal/lambda/sla/alert.go new file mode 100644 index 0000000..e6ac5cd --- /dev/null +++ b/internal/lambda/sla/alert.go @@ -0,0 +1,81 @@ +package sla + +import ( + "context" + "fmt" + "time" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// handleSLAFireAlert publishes an SLA alert event to EventBridge. +func handleSLAFireAlert(ctx context.Context, d *lambda.Deps, input lambda.SLAMonitorInput) (lambda.SLAMonitorOutput, error) { + var tr *types.ControlRecord + if d.Store != nil { + suppressed := false + var err error + tr, err = d.Store.GetTrigger(ctx, input.PipelineID, input.ScheduleID, input.Date) + switch { + case err != nil: + d.Logger.WarnContext(ctx, "trigger lookup failed in fire-alert, proceeding with alert", + "pipeline", input.PipelineID, "error", err) + case tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal): + d.Logger.InfoContext(ctx, "suppressing SLA alert (pipeline already finished)", + "pipeline", input.PipelineID, "date", input.Date, "triggerStatus", tr.Status, "alertType", input.AlertType) + suppressed = true + case lambda.IsJobTerminal(ctx, d, input.PipelineID, input.ScheduleID, input.Date): + d.Logger.InfoContext(ctx, "suppressing SLA alert (terminal joblog event found)", + "pipeline", input.PipelineID, "date", input.Date, "alertType", input.AlertType) + suppressed = true + } + if suppressed { + return lambda.SLAMonitorOutput{AlertType: input.AlertType, FiredAt: d.Now().UTC().Format(time.RFC3339)}, nil + } + } + + if input.AlertType == "SLA_WARNING" && input.BreachAt != "" { + breachAt, err := time.Parse(time.RFC3339, input.BreachAt) + if err == nil && !d.Now().UTC().Before(breachAt) { + d.Logger.InfoContext(ctx, "suppressing SLA_WARNING (past breach time)", + "pipeline", 
input.PipelineID, "breachAt", input.BreachAt) + return lambda.SLAMonitorOutput{AlertType: input.AlertType, FiredAt: d.Now().UTC().Format(time.RFC3339)}, nil + } + } + + status := "not started" + if tr != nil { + status = tr.Status + } + source := "schedule" + actionHint := "pipeline not started — check sensor data" + switch { + case status == types.TriggerStatusRunning: + actionHint = "pipeline running — may complete before breach" + case status == "not started" && input.AlertType == "SLA_BREACH": + actionHint = "pipeline not started — investigate trigger" + } + + alertDetail := map[string]interface{}{ + "status": status, + "source": source, + "actionHint": actionHint, + } + if input.BreachAt != "" { + alertDetail["breachAt"] = input.BreachAt + } + if input.Deadline != "" { + alertDetail["deadline"] = input.Deadline + } + + msg := fmt.Sprintf("pipeline %s: %s", input.PipelineID, input.AlertType) + + if err := lambda.PublishEvent(ctx, d, input.AlertType, input.PipelineID, input.ScheduleID, input.Date, msg, alertDetail); err != nil { + return lambda.SLAMonitorOutput{}, fmt.Errorf("publish SLA event: %w", err) + } + + return lambda.SLAMonitorOutput{ + AlertType: input.AlertType, + FiredAt: d.Now().UTC().Format(time.RFC3339), + }, nil +} diff --git a/internal/lambda/sla/calculate.go b/internal/lambda/sla/calculate.go new file mode 100644 index 0000000..e68d741 --- /dev/null +++ b/internal/lambda/sla/calculate.go @@ -0,0 +1,41 @@ +package sla + +import ( + "time" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + pkgsla "github.com/dwsmith1983/interlock/pkg/sla" +) + +// handleSLACalculate computes warning and breach times. Supports two modes: +// +// 1. Schedule-based (deadline): delegates to pkgsla.CalculateAbsoluteDeadline. +// 2. Relative (maxDuration + sensorArrivalAt): delegates to pkgsla.CalculateRelativeDeadline. 
+func handleSLACalculate(input lambda.SLAMonitorInput, now time.Time) (lambda.SLAMonitorOutput, error) { + if input.MaxDuration != "" && input.SensorArrivalAt != "" { + return handleRelativeSLACalculate(input) + } + breach, warning, err := pkgsla.CalculateAbsoluteDeadline( + input.Date, input.Deadline, input.ExpectedDuration, input.Timezone, now) + if err != nil { + return lambda.SLAMonitorOutput{}, err + } + return lambda.SLAMonitorOutput{ + WarningAt: warning.UTC().Format(time.RFC3339), + BreachAt: breach.UTC().Format(time.RFC3339), + }, nil +} + +// handleRelativeSLACalculate computes warning and breach times from +// sensorArrivalAt + maxDuration, delegating to pkgsla.CalculateRelativeDeadline. +func handleRelativeSLACalculate(input lambda.SLAMonitorInput) (lambda.SLAMonitorOutput, error) { + breach, warning, err := pkgsla.CalculateRelativeDeadline( + input.SensorArrivalAt, input.MaxDuration, input.ExpectedDuration) + if err != nil { + return lambda.SLAMonitorOutput{}, err + } + return lambda.SLAMonitorOutput{ + WarningAt: warning.UTC().Format(time.RFC3339), + BreachAt: breach.UTC().Format(time.RFC3339), + }, nil +} diff --git a/internal/lambda/sla/cancel.go b/internal/lambda/sla/cancel.go new file mode 100644 index 0000000..945b9af --- /dev/null +++ b/internal/lambda/sla/cancel.go @@ -0,0 +1,95 @@ +package sla + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/scheduler" + schedulerTypes "github.com/aws/aws-sdk-go-v2/service/scheduler/types" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// handleSLACancel deletes unfired SLA schedules and determines the final SLA outcome. 
+func handleSLACancel(ctx context.Context, d *lambda.Deps, input lambda.SLAMonitorInput) (lambda.SLAMonitorOutput, error) { + if input.WarningAt == "" && input.BreachAt == "" { + if input.MaxDuration != "" && input.SensorArrivalAt != "" { + calc, err := handleRelativeSLACalculate(input) + if err != nil { + return lambda.SLAMonitorOutput{}, fmt.Errorf("cancel recalculate (relative): %w", err) + } + input.WarningAt = calc.WarningAt + input.BreachAt = calc.BreachAt + } else if input.Deadline != "" { + calc, err := handleSLACalculate(input, d.Now()) + if err != nil { + return lambda.SLAMonitorOutput{}, fmt.Errorf("cancel recalculate: %w", err) + } + input.WarningAt = calc.WarningAt + input.BreachAt = calc.BreachAt + } + } + + if d.Scheduler != nil { + for _, suffix := range []string{"warning", "breach"} { + name := SLAScheduleName(input.PipelineID, input.ScheduleID, input.Date, suffix) + _, err := d.Scheduler.DeleteSchedule(ctx, &scheduler.DeleteScheduleInput{ + Name: aws.String(name), + GroupName: aws.String(d.SchedulerGroupName), + }) + if err != nil { + var rnf *schedulerTypes.ResourceNotFoundException + if !errors.As(err, &rnf) { + d.Logger.WarnContext(ctx, "delete schedule failed", "name", name, "error", err) + } + } + } + } + + now := d.Now().UTC() + alertType := string(types.EventSLAMet) + if input.BreachAt != "" { + breachAt, _ := time.Parse(time.RFC3339, input.BreachAt) + if !breachAt.IsZero() && (now.After(breachAt) || now.Equal(breachAt)) { + alertType = string(types.EventSLABreach) + } + } + + publish := true + if d.Store != nil { + tr, err := d.Store.GetTrigger(ctx, input.PipelineID, input.ScheduleID, input.Date) + if err != nil { + d.Logger.WarnContext(ctx, "trigger lookup failed in cancel, proceeding with verdict", + "pipeline", input.PipelineID, "error", err) + } else if tr == nil { + d.Logger.InfoContext(ctx, "skipping SLA verdict — pipeline was never triggered", + "pipeline", input.PipelineID, "date", input.Date, "alertType", alertType) + publish = 
false + } + } + + d.Logger.InfoContext(ctx, "cancelled SLA schedules", + "pipeline", input.PipelineID, "alertType", alertType) + if publish { + if err := lambda.PublishEvent(ctx, d, alertType, input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: %s", input.PipelineID, alertType)); err != nil { + return lambda.SLAMonitorOutput{}, fmt.Errorf("publish SLA cancel verdict: %w", err) + } + } + + return lambda.SLAMonitorOutput{ + AlertType: alertType, + WarningAt: input.WarningAt, + BreachAt: input.BreachAt, + FiredAt: now.Format(time.RFC3339), + }, nil +} + +// SLAScheduleName returns a deterministic EventBridge Scheduler name for an SLA alert. +func SLAScheduleName(pipelineID, scheduleID, date, suffix string) string { + return fmt.Sprintf("%s-%s-%s-sla-%s", pipelineID, scheduleID, date, suffix) +} diff --git a/internal/lambda/sla/handler.go b/internal/lambda/sla/handler.go new file mode 100644 index 0000000..1c3b795 --- /dev/null +++ b/internal/lambda/sla/handler.go @@ -0,0 +1,29 @@ +// Package sla implements the SLA monitor Lambda handler. +// It calculates deadlines, schedules/cancels EventBridge Scheduler entries, +// fires warning/breach alerts, and reconciles missed alerts. +package sla + +import ( + "context" + "fmt" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" +) + +// HandleSLAMonitor processes SLA monitor requests from Step Functions, dispatching on input.Mode (calculate, fire-alert, schedule, cancel, reconcile).
+func HandleSLAMonitor(ctx context.Context, d *lambda.Deps, input lambda.SLAMonitorInput) (lambda.SLAMonitorOutput, error) { + switch input.Mode { + case "calculate": + return handleSLACalculate(input, d.Now()) + case "fire-alert": + return handleSLAFireAlert(ctx, d, input) + case "schedule": + return handleSLASchedule(ctx, d, input) + case "cancel": + return handleSLACancel(ctx, d, input) + case "reconcile": + return handleSLAReconcile(ctx, d, input) + default: + return lambda.SLAMonitorOutput{}, fmt.Errorf("unknown SLA monitor mode: %q", input.Mode) + } +} diff --git a/internal/lambda/sla/reconcile.go b/internal/lambda/sla/reconcile.go new file mode 100644 index 0000000..29a9abb --- /dev/null +++ b/internal/lambda/sla/reconcile.go @@ -0,0 +1,54 @@ +package sla + +import ( + "context" + "fmt" + "time" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" +) + +// handleSLAReconcile calculates deadlines and fires any alerts for deadlines +// that have already passed. +func handleSLAReconcile(ctx context.Context, d *lambda.Deps, input lambda.SLAMonitorInput) (lambda.SLAMonitorOutput, error) { + calc, err := handleSLACalculate(input, d.Now()) + if err != nil { + return lambda.SLAMonitorOutput{}, fmt.Errorf("reconcile: %w", err) + } + + now := d.Now().UTC() + warningAt, _ := time.Parse(time.RFC3339, calc.WarningAt) + breachAt, _ := time.Parse(time.RFC3339, calc.BreachAt) + + reconcileDetail := map[string]interface{}{ + "source": "reconciliation", + "warningAt": calc.WarningAt, + "breachAt": calc.BreachAt, + "actionHint": "fired by reconciliation fallback — check Scheduler health", + } + + var alertType string + switch { + case now.After(breachAt) || now.Equal(breachAt): + if err := lambda.PublishEvent(ctx, d, "SLA_BREACH", input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: SLA_BREACH", input.PipelineID), reconcileDetail); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", "SLA_BREACH", "error", err) + } + 
alertType = "SLA_BREACH" + case now.After(warningAt) || now.Equal(warningAt): + if err := lambda.PublishEvent(ctx, d, "SLA_WARNING", input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: SLA_WARNING", input.PipelineID), reconcileDetail); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", "SLA_WARNING", "error", err) + } + alertType = "SLA_WARNING" + default: + alertType = "SLA_MET" + } + + return lambda.SLAMonitorOutput{ + AlertType: alertType, + WarningAt: calc.WarningAt, + BreachAt: calc.BreachAt, + FiredAt: now.Format(time.RFC3339), + }, nil +} diff --git a/internal/lambda/sla/schedule.go b/internal/lambda/sla/schedule.go new file mode 100644 index 0000000..28af348 --- /dev/null +++ b/internal/lambda/sla/schedule.go @@ -0,0 +1,35 @@ +package sla + +import ( + "context" + "fmt" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" +) + +// handleSLASchedule creates one-time EventBridge Scheduler entries for the +// SLA warning and breach times. 
+func handleSLASchedule(ctx context.Context, d *lambda.Deps, input lambda.SLAMonitorInput) (lambda.SLAMonitorOutput, error) { + calc, err := handleSLACalculate(input, d.Now()) + if err != nil { + return lambda.SLAMonitorOutput{}, fmt.Errorf("schedule: %w", err) + } + + if d.Scheduler == nil { + d.Logger.WarnContext(ctx, "scheduler not configured, skipping SLA schedule creation", + "pipeline", input.PipelineID) + return calc, nil + } + + if err := lambda.CreateSLASchedules(ctx, d, input.PipelineID, input.ScheduleID, input.Date, calc, false); err != nil { + return lambda.SLAMonitorOutput{}, err + } + + d.Logger.InfoContext(ctx, "scheduled SLA alerts", + "pipeline", input.PipelineID, + "warningAt", calc.WarningAt, + "breachAt", calc.BreachAt, + ) + + return calc, nil +} diff --git a/internal/lambda/sla_monitor.go b/internal/lambda/sla_monitor.go index 5093fa8..76c0117 100644 --- a/internal/lambda/sla_monitor.go +++ b/internal/lambda/sla_monitor.go @@ -5,27 +5,21 @@ import ( "encoding/json" "errors" "fmt" - "strconv" "strings" "time" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/scheduler" schedulerTypes "github.com/aws/aws-sdk-go-v2/service/scheduler/types" + pkgsla "github.com/dwsmith1983/interlock/pkg/sla" "github.com/dwsmith1983/interlock/pkg/types" ) -// HandleSLAMonitor processes SLA monitor requests from Step Functions. -// It supports five modes: -// - "calculate": computes warning and breach times from schedule config -// - "fire-alert": publishes an SLA alert event to EventBridge -// - "schedule": creates one-time EventBridge Scheduler entries for warning/breach -// - "cancel": deletes unfired schedules and publishes SLA_MET if applicable -// - "reconcile": computes deadlines and fires any that have already passed (fallback) +// Deprecated: Use sla.HandleSLAMonitor instead. Retained for test compatibility. 
func HandleSLAMonitor(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAMonitorOutput, error) { switch input.Mode { case "calculate": - return handleSLACalculate(input, d.now()) + return handleSLACalculate(input, d.Now()) case "fire-alert": return handleSLAFireAlert(ctx, d, input) case "schedule": @@ -39,6 +33,12 @@ func HandleSLAMonitor(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAM } } +// HandleSLACalculate is the exported entry point for SLA calculation. +// Used by the stream sub-package for dry-run SLA projections. +func HandleSLACalculate(input SLAMonitorInput, now time.Time) (SLAMonitorOutput, error) { + return handleSLACalculate(input, now) +} + // handleSLACalculate computes warning and breach times. Supports two modes: // // 1. Schedule-based (deadline): breachAt = deadline, warningAt = deadline - expectedDuration. @@ -47,86 +47,17 @@ func HandleSLAMonitor(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAM // // Returns full ISO 8601 timestamps required by Step Functions TimestampPath. func handleSLACalculate(input SLAMonitorInput, now time.Time) (SLAMonitorOutput, error) { - // Relative SLA path: maxDuration + sensorArrivalAt. if input.MaxDuration != "" && input.SensorArrivalAt != "" { return handleRelativeSLACalculate(input) } - - dur, err := time.ParseDuration(input.ExpectedDuration) + breach, warning, err := pkgsla.CalculateAbsoluteDeadline( + input.Date, input.Deadline, input.ExpectedDuration, input.Timezone, now) if err != nil { - return SLAMonitorOutput{}, fmt.Errorf("parse expectedDuration %q: %w", input.ExpectedDuration, err) - } - - loc := time.UTC - if input.Timezone != "" { - loc, err = time.LoadLocation(input.Timezone) - if err != nil { - return SLAMonitorOutput{}, fmt.Errorf("load timezone %q: %w", input.Timezone, err) - } - } - - now = now.In(loc) - - // Parse the execution date. 
Supports: - // "2006-01-02" — daily - // "2006-01-02T15" — hourly (hour encoded in date) - baseDate := now - baseHour := -1 // -1 means "use current hour" for relative deadlines - if input.Date != "" { - datePart, hourPart := ParseExecutionDate(input.Date) - parsed, err := time.Parse("2006-01-02", datePart) - if err == nil { - if hourPart != "" { - h := 0 - if parsed, atoiErr := strconv.Atoi(hourPart); atoiErr == nil { - h = parsed - baseHour = h - } - baseDate = time.Date(parsed.Year(), parsed.Month(), parsed.Day(), - h, 0, 0, 0, loc) - } else { - baseDate = time.Date(parsed.Year(), parsed.Month(), parsed.Day(), - now.Hour(), now.Minute(), 0, 0, loc) - } - } - } - - // Parse deadline. Supports two formats: - // "HH:MM" — absolute time of day (e.g., "02:00" for daily pipelines) - // ":MM" — minutes past current hour (e.g., ":30" for hourly pipelines) - var breachAt time.Time - dl := input.Deadline - if strings.HasPrefix(dl, ":") { - deadline, err := time.Parse("04", strings.TrimPrefix(dl, ":")) - if err != nil { - return SLAMonitorOutput{}, fmt.Errorf("parse deadline %q: %w", dl, err) - } - hour := baseDate.Hour() - breachAt = time.Date(baseDate.Year(), baseDate.Month(), baseDate.Day(), - hour, deadline.Minute(), 0, 0, loc) - if baseHour >= 0 { - // Hourly pipeline: data for hour H is processed in hour H+1, - // so ":MM" means MM minutes into the processing window (H+1). 
- breachAt = breachAt.Add(time.Hour) - } else if breachAt.Before(now) { - breachAt = breachAt.Add(time.Hour) - } - } else { - deadline, err := time.Parse("15:04", dl) - if err != nil { - return SLAMonitorOutput{}, fmt.Errorf("parse deadline %q: %w", dl, err) - } - breachAt = time.Date(baseDate.Year(), baseDate.Month(), baseDate.Day(), - deadline.Hour(), deadline.Minute(), 0, 0, loc) - if breachAt.Before(now) { - breachAt = breachAt.Add(24 * time.Hour) - } + return SLAMonitorOutput{}, err } - warningAt := breachAt.Add(-dur) - return SLAMonitorOutput{ - WarningAt: warningAt.UTC().Format(time.RFC3339), - BreachAt: breachAt.UTC().Format(time.RFC3339), + WarningAt: warning.UTC().Format(time.RFC3339), + BreachAt: breach.UTC().Format(time.RFC3339), }, nil } @@ -135,33 +66,14 @@ func handleSLACalculate(input SLAMonitorInput, now time.Time) (SLAMonitorOutput, // if provided, otherwise defaults to 25% of maxDuration (i.e. warning // fires at 75% of the total allowed time). func handleRelativeSLACalculate(input SLAMonitorInput) (SLAMonitorOutput, error) { - maxDur, err := time.ParseDuration(input.MaxDuration) + breach, warning, err := pkgsla.CalculateRelativeDeadline( + input.SensorArrivalAt, input.MaxDuration, input.ExpectedDuration) if err != nil { - return SLAMonitorOutput{}, fmt.Errorf("parse maxDuration %q: %w", input.MaxDuration, err) - } - - arrivalAt, err := time.Parse(time.RFC3339, input.SensorArrivalAt) - if err != nil { - return SLAMonitorOutput{}, fmt.Errorf("parse sensorArrivalAt %q: %w", input.SensorArrivalAt, err) - } - - breachAt := arrivalAt.Add(maxDur) - - // Warning offset: use expectedDuration if provided, otherwise 25% of maxDuration. 
- var warningOffset time.Duration - if input.ExpectedDuration != "" { - warningOffset, err = time.ParseDuration(input.ExpectedDuration) - if err != nil { - return SLAMonitorOutput{}, fmt.Errorf("parse expectedDuration %q: %w", input.ExpectedDuration, err) - } - } else { - warningOffset = maxDur / 4 + return SLAMonitorOutput{}, err } - warningAt := breachAt.Add(-warningOffset) - return SLAMonitorOutput{ - WarningAt: warningAt.UTC().Format(time.RFC3339), - BreachAt: breachAt.UTC().Format(time.RFC3339), + WarningAt: warning.UTC().Format(time.RFC3339), + BreachAt: breach.UTC().Format(time.RFC3339), }, nil } @@ -182,7 +94,7 @@ func handleSLAFireAlert(ctx context.Context, d *Deps, input SLAMonitorInput) (SL d.Logger.InfoContext(ctx, "suppressing SLA alert (pipeline already finished)", "pipeline", input.PipelineID, "date", input.Date, "triggerStatus", tr.Status, "alertType", input.AlertType) suppressed = true - case isJobTerminal(ctx, d, input.PipelineID, input.ScheduleID, input.Date): + case IsJobTerminal(ctx, d, input.PipelineID, input.ScheduleID, input.Date): // Joblog fallback: trigger row may be nil (cron pipeline), RUNNING // (not yet updated), or TTL-expired. Check joblog as secondary signal. 
d.Logger.InfoContext(ctx, "suppressing SLA alert (terminal joblog event found)", @@ -190,16 +102,16 @@ func handleSLAFireAlert(ctx context.Context, d *Deps, input SLAMonitorInput) (SL suppressed = true } if suppressed { - return SLAMonitorOutput{AlertType: input.AlertType, FiredAt: d.now().UTC().Format(time.RFC3339)}, nil + return SLAMonitorOutput{AlertType: input.AlertType, FiredAt: d.Now().UTC().Format(time.RFC3339)}, nil } } if input.AlertType == "SLA_WARNING" && input.BreachAt != "" { breachAt, err := time.Parse(time.RFC3339, input.BreachAt) - if err == nil && !d.now().UTC().Before(breachAt) { + if err == nil && !d.Now().UTC().Before(breachAt) { d.Logger.InfoContext(ctx, "suppressing SLA_WARNING (past breach time)", "pipeline", input.PipelineID, "breachAt", input.BreachAt) - return SLAMonitorOutput{AlertType: input.AlertType, FiredAt: d.now().UTC().Format(time.RFC3339)}, nil + return SLAMonitorOutput{AlertType: input.AlertType, FiredAt: d.Now().UTC().Format(time.RFC3339)}, nil } } @@ -231,13 +143,13 @@ func handleSLAFireAlert(ctx context.Context, d *Deps, input SLAMonitorInput) (SL msg := fmt.Sprintf("pipeline %s: %s", input.PipelineID, input.AlertType) - if err := publishEvent(ctx, d, input.AlertType, input.PipelineID, input.ScheduleID, input.Date, msg, alertDetail); err != nil { + if err := PublishEvent(ctx, d, input.AlertType, input.PipelineID, input.ScheduleID, input.Date, msg, alertDetail); err != nil { return SLAMonitorOutput{}, fmt.Errorf("publish SLA event: %w", err) } return SLAMonitorOutput{ AlertType: input.AlertType, - FiredAt: d.now().UTC().Format(time.RFC3339), + FiredAt: d.Now().UTC().Format(time.RFC3339), }, nil } @@ -245,7 +157,7 @@ func handleSLAFireAlert(ctx context.Context, d *Deps, input SLAMonitorInput) (SL // SLA warning and breach times. Each schedule invokes this Lambda with // mode "fire-alert" at the exact timestamp, then auto-deletes. 
func handleSLASchedule(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAMonitorOutput, error) { - calc, err := handleSLACalculate(input, d.now()) + calc, err := handleSLACalculate(input, d.Now()) if err != nil { return SLAMonitorOutput{}, fmt.Errorf("schedule: %w", err) } @@ -284,7 +196,7 @@ func handleSLACancel(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAMo input.WarningAt = calc.WarningAt input.BreachAt = calc.BreachAt } else if input.Deadline != "" { - calc, err := handleSLACalculate(input, d.now()) + calc, err := handleSLACalculate(input, d.Now()) if err != nil { return SLAMonitorOutput{}, fmt.Errorf("cancel recalculate: %w", err) } @@ -313,7 +225,7 @@ func handleSLACancel(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAMo // Determine final SLA status: binary MET or BREACH. // WARNING is not a valid completion outcome — if the job finished, it either // beat the breach deadline (MET) or missed it (BREACH). - now := d.now().UTC() + now := d.Now().UTC() alertType := string(types.EventSLAMet) if input.BreachAt != "" { breachAt, _ := time.Parse(time.RFC3339, input.BreachAt) @@ -343,7 +255,7 @@ func handleSLACancel(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAMo "alertType", alertType, ) if publish { - if err := publishEvent(ctx, d, alertType, input.PipelineID, input.ScheduleID, input.Date, + if err := PublishEvent(ctx, d, alertType, input.PipelineID, input.ScheduleID, input.Date, fmt.Sprintf("pipeline %s: %s", input.PipelineID, alertType)); err != nil { return SLAMonitorOutput{}, fmt.Errorf("publish SLA cancel verdict: %w", err) } @@ -394,6 +306,12 @@ func createOneTimeSchedule(ctx context.Context, d *Deps, name, timestamp string, return nil } +// CreateSLASchedules is the exported entry point for creating SLA schedules. +// Used by the watchdog sub-package for proactive SLA scheduling. 
+func CreateSLASchedules(ctx context.Context, d *Deps, pipelineID, scheduleID, date string, calc SLAMonitorOutput, onConflictSkip bool) error { + return createSLASchedules(ctx, d, pipelineID, scheduleID, date, calc, onConflictSkip) +} + // createSLASchedules creates warning and breach one-time schedules. // Returns an error on the first schedule creation failure. If onConflictSkip // is true, ConflictException errors are silently skipped (idempotent retries). @@ -434,12 +352,12 @@ func createSLASchedules(ctx context.Context, d *Deps, pipelineID, scheduleID, da // that have already passed. Fallback for environments without EventBridge // Scheduler configured. func handleSLAReconcile(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAMonitorOutput, error) { - calc, err := handleSLACalculate(input, d.now()) + calc, err := handleSLACalculate(input, d.Now()) if err != nil { return SLAMonitorOutput{}, fmt.Errorf("reconcile: %w", err) } - now := d.now().UTC() + now := d.Now().UTC() warningAt, _ := time.Parse(time.RFC3339, calc.WarningAt) breachAt, _ := time.Parse(time.RFC3339, calc.BreachAt) @@ -453,14 +371,14 @@ func handleSLAReconcile(ctx context.Context, d *Deps, input SLAMonitorInput) (SL var alertType string switch { case now.After(breachAt) || now.Equal(breachAt): - if err := publishEvent(ctx, d, "SLA_BREACH", input.PipelineID, input.ScheduleID, input.Date, + if err := PublishEvent(ctx, d, "SLA_BREACH", input.PipelineID, input.ScheduleID, input.Date, fmt.Sprintf("pipeline %s: SLA_BREACH", input.PipelineID), reconcileDetail); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", "SLA_BREACH", "error", err) } alertType = "SLA_BREACH" case now.After(warningAt) || now.Equal(warningAt): // Past warning but before breach — fire warning only - if err := publishEvent(ctx, d, "SLA_WARNING", input.PipelineID, input.ScheduleID, input.Date, + if err := PublishEvent(ctx, d, "SLA_WARNING", input.PipelineID, input.ScheduleID, input.Date, 
fmt.Sprintf("pipeline %s: SLA_WARNING", input.PipelineID), reconcileDetail); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", "SLA_WARNING", "error", err) } @@ -476,25 +394,3 @@ func handleSLAReconcile(ctx context.Context, d *Deps, input SLAMonitorInput) (SL FiredAt: now.Format(time.RFC3339), }, nil } - -// isJobTerminal checks the joblog for a terminal event (success, fail, timeout). -// Returns true if the pipeline has finished processing for the given date. -func isJobTerminal(ctx context.Context, d *Deps, pipelineID, scheduleID, date string) bool { - rec, err := d.Store.GetLatestJobEvent(ctx, pipelineID, scheduleID, date) - if err != nil { - d.Logger.WarnContext(ctx, "joblog lookup failed, not suppressing", - "pipeline", pipelineID, "error", err) - return false - } - if rec == nil { - return false - } - switch rec.Event { - case types.JobEventSuccess, types.JobEventFail, types.JobEventTimeout, - types.JobEventInfraTriggerExhausted, types.JobEventValidationExhausted, - types.JobEventJobPollExhausted: - return true - default: - return false - } -} diff --git a/internal/lambda/stream/dryrun.go b/internal/lambda/stream/dryrun.go new file mode 100644 index 0000000..facaa06 --- /dev/null +++ b/internal/lambda/stream/dryrun.go @@ -0,0 +1,396 @@ +package stream + +import ( + "context" + "fmt" + "math" + "strings" + "time" + + "github.com/aws/aws-lambda-go/events" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" +) + +// handleDryRunTrigger processes a sensor event for a dry-run pipeline. 
+func handleDryRunTrigger(ctx context.Context, d *lambda.Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date string, now time.Time) error { + marker, err := d.Store.GetDryRunMarker(ctx, pipelineID, scheduleID, date) + if err != nil { + return fmt.Errorf("get dry-run marker for %q: %w", pipelineID, err) + } + + if marker != nil { + triggeredAtStr, ok := marker.Data["triggeredAt"].(string) + if !ok || triggeredAtStr == "" { + d.Logger.WarnContext(ctx, "dry-run marker missing triggeredAt", "pipelineId", pipelineID) + return nil + } + triggeredAt, parseErr := time.Parse(time.RFC3339, triggeredAtStr) + if parseErr != nil { + d.Logger.WarnContext(ctx, "dry-run marker has invalid triggeredAt", + "pipelineId", pipelineID, "value", triggeredAtStr, "error", parseErr) + return nil + } + lateBy := now.Sub(triggeredAt) + + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunLateData), pipelineID, scheduleID, date, + fmt.Sprintf("dry-run: late data arrived %.0fm after trigger point for %s", lateBy.Minutes(), pipelineID), + map[string]interface{}{ + "triggeredAt": triggeredAtStr, + "lateBy": lateBy.String(), + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunLateData, "error", pubErr) + } + return nil + } + + sensors, err := d.Store.GetAllSensors(ctx, pipelineID) + if err != nil { + return fmt.Errorf("get sensors for dry-run %q: %w", pipelineID, err) + } + lambda.RemapPerPeriodSensors(sensors, date) + + result := validation.EvaluateRules(cfg.Validation.Trigger, cfg.Validation.Rules, sensors, now) + if !result.Passed { + d.Logger.Info("dry-run: trigger condition met but validation rules not satisfied", + "pipelineId", pipelineID, "date", date) + return nil + } + + written, err := d.Store.WriteDryRunMarker(ctx, pipelineID, scheduleID, date, now) + if err != nil { + return fmt.Errorf("write dry-run marker for %q: %w", pipelineID, err) + } + if !written { + return nil + } + + if cfg.PostRun != nil && 
len(cfg.PostRun.Rules) > 0 { + if baselineErr := lambda.CapturePostRunBaseline(ctx, d, pipelineID, scheduleID, date); baselineErr != nil { + d.Logger.WarnContext(ctx, "dry-run: failed to capture post-run baseline", + "pipelineId", pipelineID, "error", baselineErr) + } + } + + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunWouldTrigger), pipelineID, scheduleID, date, + fmt.Sprintf("dry-run: would trigger %s at %s", pipelineID, now.Format(time.RFC3339)), + map[string]interface{}{ + "triggeredAt": now.UTC().Format(time.RFC3339), + "rulesEvaluated": len(cfg.Validation.Rules), + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunWouldTrigger, "error", pubErr) + } + + var slaVerdict *dryRunSLAVerdict + if cfg.SLA != nil && cfg.SLA.ExpectedDuration != "" { + slaVerdict = publishDryRunSLAProjection(ctx, d, cfg, pipelineID, scheduleID, date, now) + } + + completedDetail := map[string]interface{}{ + "triggeredAt": now.UTC().Format(time.RFC3339), + } + if slaVerdict != nil { + completedDetail["slaStatus"] = slaVerdict.Status + completedDetail["estimatedCompletion"] = slaVerdict.EstimatedCompletion + if slaVerdict.Deadline != "" { + completedDetail["deadline"] = slaVerdict.Deadline + } + } else { + completedDetail["slaStatus"] = "n/a" + } + + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunCompleted), pipelineID, scheduleID, date, + fmt.Sprintf("dry-run: observation complete for %s/%s", pipelineID, date), + completedDetail); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunCompleted, "error", pubErr) + } + + d.Logger.Info("dry-run: would trigger", + "pipelineId", pipelineID, "schedule", scheduleID, "date", date) + return nil +} + +// dryRunSLAVerdict holds the SLA projection result. 
+type dryRunSLAVerdict struct { + Status string + EstimatedCompletion string + Deadline string +} + +// publishDryRunSLAProjection computes and publishes an SLA projection event. +func publishDryRunSLAProjection(ctx context.Context, d *lambda.Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date string, triggeredAt time.Time) *dryRunSLAVerdict { + expectedDur, err := time.ParseDuration(cfg.SLA.ExpectedDuration) + if err != nil { + d.Logger.WarnContext(ctx, "dry-run: invalid expectedDuration", "error", err) + return nil + } + + estimatedCompletion := triggeredAt.Add(expectedDur) + detail := map[string]interface{}{ + "triggeredAt": triggeredAt.UTC().Format(time.RFC3339), + "estimatedCompletion": estimatedCompletion.UTC().Format(time.RFC3339), + "expectedDuration": cfg.SLA.ExpectedDuration, + } + + verdict := &dryRunSLAVerdict{ + Status: "met", + EstimatedCompletion: estimatedCompletion.UTC().Format(time.RFC3339), + } + + message := fmt.Sprintf("dry-run: SLA projection for %s — estimated completion %s", + pipelineID, estimatedCompletion.Format(time.RFC3339)) + + if cfg.SLA.Deadline != "" { + slaInput := lambda.SLAMonitorInput{ + Mode: "calculate", + PipelineID: pipelineID, + ScheduleID: scheduleID, + Date: date, + Deadline: cfg.SLA.Deadline, + ExpectedDuration: cfg.SLA.ExpectedDuration, + Timezone: cfg.SLA.Timezone, + } + slaOutput, calcErr := lambda.HandleSLACalculate(slaInput, triggeredAt) + if calcErr != nil { + d.Logger.WarnContext(ctx, "dry-run: SLA deadline resolution failed", "error", calcErr) + } else if slaOutput.BreachAt != "" { + breachAt, parseErr := time.Parse(time.RFC3339, slaOutput.BreachAt) + if parseErr == nil { + detail["deadline"] = slaOutput.BreachAt + verdict.Deadline = slaOutput.BreachAt + margin := breachAt.Sub(estimatedCompletion) + detail["marginSeconds"] = margin.Seconds() + if estimatedCompletion.After(breachAt) { + verdict.Status = "breach" + message = fmt.Sprintf("dry-run: SLA projection for %s — would breach by %.0fm", + 
pipelineID, math.Abs(margin.Minutes())) + } else { + message = fmt.Sprintf("dry-run: SLA projection for %s — SLA met with %.0fm margin", + pipelineID, margin.Minutes()) + } + } + } + } + + detail["status"] = verdict.Status + + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunSLAProjection), pipelineID, scheduleID, date, message, detail); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunSLAProjection, "error", pubErr) + } + + return verdict +} + +// handleDryRunPostRunSensor handles post-run sensor events for dry-run pipelines. +func handleDryRunPostRunSensor(ctx context.Context, d *lambda.Deps, cfg *types.PipelineConfig, pipelineID, sensorKey string, sensorData map[string]interface{}) error { + scheduleID := lambda.ResolveScheduleID(cfg) + date := lambda.ResolveExecutionDate(sensorData, d.Now()) + + marker, err := d.Store.GetDryRunMarker(ctx, pipelineID, scheduleID, date) + if err != nil { + return fmt.Errorf("get dry-run marker for post-run %q: %w", pipelineID, err) + } + if marker == nil { + return nil + } + + baselineKey := "postrun-baseline#" + date + baseline, err := d.Store.GetSensorData(ctx, pipelineID, baselineKey) + if err != nil { + return fmt.Errorf("get baseline for dry-run post-run: %w", err) + } + if baseline == nil { + return nil + } + + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break + } + } + if ruleBaseline == nil { + return nil + } + + driftField := lambda.ResolveDriftField(cfg.PostRun) + threshold := 0.0 + if cfg.PostRun.DriftThreshold != nil { + threshold = *cfg.PostRun.DriftThreshold + } + dr := lambda.DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunDrift), pipelineID, scheduleID, date, + 
fmt.Sprintf("dry-run: drift detected for %s: %.0f → %.0f — would re-run", pipelineID, dr.Previous, dr.Current), + map[string]interface{}{ + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, + "driftThreshold": threshold, + "driftField": driftField, + "sensorKey": sensorKey, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunDrift, "error", pubErr) + } + } + + return nil +} + +// handleDryRunRerunRequest evaluates all rerun checks for dry-run pipelines. +func handleDryRunRerunRequest(ctx context.Context, d *lambda.Deps, cfg *types.PipelineConfig, pipelineID, schedule, date string, record events.DynamoDBEventRecord) error { + if lambda.IsExcludedDate(cfg, date) { + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: execution date %s excluded by calendar", pipelineID, date), + map[string]interface{}{ + "reason": "excluded by calendar", + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + + reason := "manual" + if img := record.Change.NewImage; img != nil { + if r, ok := img["reason"]; ok && r.DataType() == events.DataTypeString { + if v := r.String(); v != "" { + reason = v + } + } + } + + var budget int + var sources []string + switch reason { + case "data-drift", "late-data": + budget = types.IntOrDefault(cfg.Job.MaxDriftReruns, 1) + sources = []string{"data-drift", "late-data"} + default: + budget = types.IntOrDefault(cfg.Job.MaxManualReruns, 1) + sources = []string{reason} + } + + count, err := d.Store.CountRerunsBySource(ctx, pipelineID, schedule, date, sources) + if err != nil { + return fmt.Errorf("dry-run: count reruns by source for %q: %w", pipelineID, err) + } + + if count >= budget { + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunRerunRejected), 
pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: limit exceeded (%d/%d)", pipelineID, count, budget), + map[string]interface{}{ + "reason": "limit exceeded", + "rerunCount": count, + "budget": budget, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + + cbStatus := "passed" + job, err := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if err != nil { + return fmt.Errorf("dry-run: get latest job event for %q/%s/%s: %w", pipelineID, schedule, date, err) + } + + if job == nil { + cbStatus = "skipped (no job history)" + } else if job.Event == types.JobEventSuccess { + fresh, freshErr := checkSensorFreshness(ctx, d, pipelineID, job.SK) + if freshErr != nil { + return fmt.Errorf("dry-run: check sensor freshness for %q: %w", pipelineID, freshErr) + } + if !fresh { + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: previous run succeeded and no sensor data has changed", pipelineID), + map[string]interface{}{ + "reason": "circuit breaker", + "circuitBreaker": "rejected", + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + } + + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunWouldRerun), pipelineID, schedule, date, + fmt.Sprintf("dry-run: would rerun %s (reason: %s)", pipelineID, reason), + map[string]interface{}{ + "reason": reason, + "circuitBreaker": cbStatus, + "rerunCount": count, + "budget": budget, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunWouldRerun, "error", pubErr) + } + + d.Logger.Info("dry-run: would rerun", + "pipelineId", pipelineID, "schedule", schedule, "date", date, "reason", reason) + return nil +} + +// handleDryRunJobFailure 
evaluates retry logic for a dry-run pipeline. +func handleDryRunJobFailure(ctx context.Context, d *lambda.Deps, cfg *types.PipelineConfig, pipelineID, schedule, date string) error { + maxRetries := cfg.Job.MaxRetries + + latestJob, jobErr := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if jobErr != nil { + d.Logger.WarnContext(ctx, "dry-run: could not read latest job event for failure category", + "pipelineId", pipelineID, "error", jobErr) + } + if latestJob != nil { + if types.FailureCategory(latestJob.Category) == types.FailurePermanent { + maxRetries = types.IntOrDefault(cfg.Job.MaxCodeRetries, 1) + } + } + + rerunCount, err := d.Store.CountRerunsBySource(ctx, pipelineID, schedule, date, []string{"job-fail-retry"}) + if err != nil { + return fmt.Errorf("dry-run: count reruns for %q/%s/%s: %w", pipelineID, schedule, date, err) + } + + if rerunCount >= maxRetries { + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, + fmt.Sprintf("dry-run: retry limit reached (%d/%d) for %s", rerunCount, maxRetries, pipelineID), + map[string]interface{}{ + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRetryExhausted, "error", pubErr) + } + return nil + } + + if lambda.IsExcludedDate(cfg, date) { + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, + fmt.Sprintf("dry-run: retry skipped for %s: execution date %s excluded by calendar", pipelineID, date), + map[string]interface{}{ + "reason": "excluded by calendar", + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRetryExhausted, "error", pubErr) + } + return nil + } + + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventDryRunWouldRetry), pipelineID, schedule, date, + 
fmt.Sprintf("dry-run: would retry %s (%d/%d)", pipelineID, rerunCount, maxRetries), + map[string]interface{}{ + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunWouldRetry, "error", pubErr) + } + + d.Logger.Info("dry-run: would retry", + "pipelineId", pipelineID, "schedule", schedule, "date", date, + "retries", rerunCount, "maxRetries", maxRetries) + return nil +} diff --git a/internal/lambda/stream/handler.go b/internal/lambda/stream/handler.go new file mode 100644 index 0000000..6044494 --- /dev/null +++ b/internal/lambda/stream/handler.go @@ -0,0 +1,58 @@ +// Package stream implements the DynamoDB stream-router Lambda handler. +// It processes stream events and routes each record to the appropriate +// handler based on the SK prefix. +package stream + +import ( + "context" + "fmt" + "strings" + + "github.com/aws/aws-lambda-go/events" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// HandleStreamEvent processes a DynamoDB stream event, routing each record +// to the appropriate handler based on the SK prefix. Per-record errors are +// collected as BatchItemFailures so the Lambda runtime can use DynamoDB's +// ReportBatchItemFailures to retry only the failed records. +func HandleStreamEvent(ctx context.Context, d *lambda.Deps, event lambda.StreamEvent) (events.DynamoDBEventResponse, error) { + var resp events.DynamoDBEventResponse + for i := range event.Records { + if err := handleRecord(ctx, d, event.Records[i]); err != nil { + d.Logger.Error("stream record error", + "error", err, + "eventID", event.Records[i].EventID, + ) + resp.BatchItemFailures = append(resp.BatchItemFailures, events.DynamoDBBatchItemFailure{ + ItemIdentifier: event.Records[i].EventID, + }) + } + } + return resp, nil +} + +// handleRecord extracts PK/SK and routes to the appropriate handler. 
+func handleRecord(ctx context.Context, d *lambda.Deps, record events.DynamoDBEventRecord) error { + pk, sk := lambda.ExtractKeys(record) + if pk == "" || sk == "" { + return fmt.Errorf("record missing PK or SK") + } + + switch { + case strings.HasPrefix(sk, "SENSOR#"): + return handleSensorEvent(ctx, d, pk, sk, record) + case sk == types.ConfigSK: + d.Logger.Info("config changed, invalidating cache", "pk", pk) + d.ConfigCache.Invalidate() + return nil + case strings.HasPrefix(sk, "JOB#"): + return handleJobLogEvent(ctx, d, pk, sk, record) + case strings.HasPrefix(sk, "RERUN_REQUEST#"): + return handleRerunRequest(ctx, d, pk, sk, record) + default: + return nil + } +} diff --git a/internal/lambda/stream/joblog.go b/internal/lambda/stream/joblog.go new file mode 100644 index 0000000..ab910ab --- /dev/null +++ b/internal/lambda/stream/joblog.go @@ -0,0 +1,62 @@ +package stream + +import ( + "context" + "fmt" + "strings" + + "github.com/aws/aws-lambda-go/events" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// handleJobLogEvent processes a JOB# stream record, routing to failure +// re-run logic or success notification based on the job event outcome. +func handleJobLogEvent(ctx context.Context, d *lambda.Deps, pk, sk string, record events.DynamoDBEventRecord) error { + pipelineID := strings.TrimPrefix(pk, "PIPELINE#") + if pipelineID == pk { + return fmt.Errorf("unexpected PK format: %q", pk) + } + + // Extract the "event" attribute from NewImage (success/fail/timeout). 
+ eventAttr, ok := record.Change.NewImage["event"] + if !ok || eventAttr.DataType() != events.DataTypeString { + d.Logger.Warn("JOB record missing event attribute", "pk", pk, "sk", sk) + return nil + } + jobEvent := eventAttr.String() + + // Parse schedule and date from SK: JOB### + schedule, date, err := parseJobSK(sk) + if err != nil { + return err + } + + switch jobEvent { + case types.JobEventFail, types.JobEventTimeout: + return handleJobFailure(ctx, d, pipelineID, schedule, date, jobEvent) + case types.JobEventSuccess: + return handleJobSuccess(ctx, d, pipelineID, schedule, date) + default: + d.Logger.Warn("unknown job event", "event", jobEvent, "pipelineId", pipelineID) + return nil + } +} + +// parseJobSK extracts schedule and date from a JOB# sort key. +// Expected format: JOB### +func parseJobSK(sk string) (schedule, date string, err error) { + trimmed := strings.TrimPrefix(sk, "JOB#") + parts := strings.SplitN(trimmed, "#", 3) + if len(parts) < 3 { + return "", "", fmt.Errorf("invalid JOB SK format: %q", sk) + } + return parts[0], parts[1], nil +} + +// handleJobSuccess publishes a job-completed event to EventBridge. +func handleJobSuccess(ctx context.Context, d *lambda.Deps, pipelineID, schedule, date string) error { + return lambda.PublishEvent(ctx, d, string(types.EventJobCompleted), pipelineID, schedule, date, + fmt.Sprintf("job completed for %s", pipelineID)) +} diff --git a/internal/lambda/stream/postrun.go b/internal/lambda/stream/postrun.go new file mode 100644 index 0000000..7c037d2 --- /dev/null +++ b/internal/lambda/stream/postrun.go @@ -0,0 +1,170 @@ +package stream + +import ( + "context" + "fmt" + "strings" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" +) + +// handlePostRunSensorEvent evaluates post-run rules reactively when a sensor +// arrives via DynamoDB Stream. 
+func handlePostRunSensorEvent(ctx context.Context, d *lambda.Deps, cfg *types.PipelineConfig, pipelineID, sensorKey string, sensorData map[string]interface{}) error { + if cfg.DryRun { + return handleDryRunPostRunSensor(ctx, d, cfg, pipelineID, sensorKey, sensorData) + } + + scheduleID := lambda.ResolveScheduleID(cfg) + date := lambda.ResolveExecutionDate(sensorData, d.Now()) + + trigger, err := d.Store.GetTrigger(ctx, pipelineID, scheduleID, date) + if err != nil { + return fmt.Errorf("get trigger for post-run: %w", err) + } + if trigger == nil { + return nil + } + + switch trigger.Status { + case types.TriggerStatusRunning: + return handlePostRunInflight(ctx, d, cfg, pipelineID, scheduleID, date, sensorKey, sensorData) + case types.TriggerStatusCompleted: + return handlePostRunCompleted(ctx, d, cfg, pipelineID, scheduleID, date, sensorKey, sensorData) + default: + return nil + } +} + +// handlePostRunInflight evaluates post-run rules while the job is still running. +func handlePostRunInflight(ctx context.Context, d *lambda.Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date, sensorKey string, sensorData map[string]interface{}) error { + baselineKey := "postrun-baseline#" + date + baseline, err := d.Store.GetSensorData(ctx, pipelineID, baselineKey) + if err != nil { + return fmt.Errorf("get baseline for inflight check: %w", err) + } + if baseline == nil { + return nil + } + + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break + } + } + if ruleBaseline == nil { + return nil + } + + driftField := lambda.ResolveDriftField(cfg.PostRun) + threshold := 0.0 + if cfg.PostRun.DriftThreshold != nil { + threshold = *cfg.PostRun.DriftThreshold + } + dr := lambda.DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { + if err := lambda.PublishEvent(ctx, d, 
string(types.EventPostRunDriftInflight), pipelineID, scheduleID, date, + fmt.Sprintf("inflight drift detected for %s: %.0f → %.0f (informational)", pipelineID, dr.Previous, dr.Current), + map[string]interface{}{ + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, + "driftThreshold": threshold, + "driftField": driftField, + "sensorKey": sensorKey, + "source": "post-run-stream", + }); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunDriftInflight, "error", err) + } + } + return nil +} + +// handlePostRunCompleted evaluates post-run rules after the job has completed. +func handlePostRunCompleted(ctx context.Context, d *lambda.Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date, sensorKey string, sensorData map[string]interface{}) error { + baselineKey := "postrun-baseline#" + date + baseline, err := d.Store.GetSensorData(ctx, pipelineID, baselineKey) + if err != nil { + return fmt.Errorf("get baseline for post-run: %w", err) + } + + if baseline != nil { + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break + } + } + + if ruleBaseline != nil { + driftField := lambda.ResolveDriftField(cfg.PostRun) + threshold := 0.0 + if cfg.PostRun.DriftThreshold != nil { + threshold = *cfg.PostRun.DriftThreshold + } + dr := lambda.DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { + if err := lambda.PublishEvent(ctx, d, string(types.EventPostRunDrift), pipelineID, scheduleID, date, + fmt.Sprintf("post-run drift detected for %s: %.0f → %.0f records", pipelineID, dr.Previous, dr.Current), + map[string]interface{}{ + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, + "driftThreshold": threshold, + "driftField": driftField, + "sensorKey": sensorKey, + "source": 
"post-run-stream", + }); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunDrift, "error", err) + } + + if lambda.IsExcludedDate(cfg, date) { + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleID, date, + fmt.Sprintf("post-run drift rerun skipped for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) + } + d.Logger.InfoContext(ctx, "post-run drift rerun skipped: execution date excluded by calendar", + "pipelineId", pipelineID, "date", date) + } else { + if writeErr := d.Store.WriteRerunRequest(ctx, pipelineID, scheduleID, date, "data-drift"); writeErr != nil { + d.Logger.WarnContext(ctx, "failed to write rerun request on post-run drift", + "pipelineId", pipelineID, "error", writeErr) + } + } + return nil + } + } + } + + // Evaluate post-run validation rules. + sensors, err := d.Store.GetAllSensors(ctx, pipelineID) + if err != nil { + return fmt.Errorf("get sensors for post-run rules: %w", err) + } + lambda.RemapPerPeriodSensors(sensors, date) + + result := validation.EvaluateRules("ALL", cfg.PostRun.Rules, sensors, d.Now()) + + if result.Passed { + if err := lambda.PublishEvent(ctx, d, string(types.EventPostRunPassed), pipelineID, scheduleID, date, + fmt.Sprintf("post-run validation passed for %s", pipelineID)); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunPassed, "error", err) + } + } else { + if err := lambda.PublishEvent(ctx, d, string(types.EventPostRunFailed), pipelineID, scheduleID, date, + fmt.Sprintf("post-run validation failed for %s", pipelineID)); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunFailed, "error", err) + } + } + + return nil +} diff --git a/internal/lambda/stream/rerun.go b/internal/lambda/stream/rerun.go new file mode 
100644 index 0000000..ec91c0b --- /dev/null +++ b/internal/lambda/stream/rerun.go @@ -0,0 +1,377 @@ +package stream + +import ( + "context" + "fmt" + "strconv" + "strings" + + "github.com/aws/aws-lambda-go/events" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// handleRerunRequest processes a RERUN_REQUEST# stream record. +func handleRerunRequest(ctx context.Context, d *lambda.Deps, pk, sk string, record events.DynamoDBEventRecord) error { + pipelineID := strings.TrimPrefix(pk, "PIPELINE#") + if pipelineID == pk { + return fmt.Errorf("unexpected PK format: %q", pk) + } + + schedule, date, err := parseRerunRequestSK(sk) + if err != nil { + return err + } + + cfg, err := lambda.GetValidatedConfig(ctx, d, pipelineID) + if err != nil { + return fmt.Errorf("load config for %q: %w", pipelineID, err) + } + if cfg == nil { + d.Logger.Warn("no config found for pipeline, skipping rerun request", "pipelineId", pipelineID) + return nil + } + + if cfg.DryRun { + return handleDryRunRerunRequest(ctx, d, cfg, pipelineID, schedule, date, record) + } + + // Calendar exclusion check (execution date). + if lambda.IsExcludedDate(cfg, date) { + if err := d.Store.WriteJobEvent(ctx, pipelineID, schedule, date, types.JobEventRerunRejected, "", 0, "excluded by calendar"); err != nil { + d.Logger.Warn("failed to write rerun-rejected joblog for calendar exclusion", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) + } + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, schedule, date, + fmt.Sprintf("rerun blocked for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) + } + return nil + } + + // Extract reason from stream record NewImage. Default to "manual". 
+ reason := "manual" + if img := record.Change.NewImage; img != nil { + if r, ok := img["reason"]; ok && r.DataType() == events.DataTypeString { + if v := r.String(); v != "" { + reason = v + } + } + } + + // Rerun limit check. + var budget int + var sources []string + var limitLabel string + switch reason { + case "data-drift", "late-data": + budget = types.IntOrDefault(cfg.Job.MaxDriftReruns, 1) + sources = []string{"data-drift", "late-data"} + limitLabel = "drift rerun limit exceeded" + default: + budget = types.IntOrDefault(cfg.Job.MaxManualReruns, 1) + sources = []string{reason} + limitLabel = "manual rerun limit exceeded" + } + + count, err := d.Store.CountRerunsBySource(ctx, pipelineID, schedule, date, sources) + if err != nil { + return fmt.Errorf("count reruns by source for %q: %w", pipelineID, err) + } + + if count >= budget { + if err := d.Store.WriteJobEvent(ctx, pipelineID, schedule, date, + types.JobEventRerunRejected, "", 0, limitLabel); err != nil { + d.Logger.Warn("failed to write rerun-rejected joblog for limit exceeded", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) + } + if err := lambda.PublishEvent(ctx, d, string(types.EventRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("rerun rejected for %s: %s", pipelineID, limitLabel)); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventRerunRejected, "error", err) + } + d.Logger.Info("rerun request rejected (limit exceeded)", + "pipelineId", pipelineID, "schedule", schedule, "date", date, + "reason", reason, "count", count, "budget", budget) + return nil + } + + // Circuit breaker (sensor freshness). 
+ job, err := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if err != nil { + return fmt.Errorf("get latest job event for %q/%s/%s: %w", pipelineID, schedule, date, err) + } + + allowed := true + rejectReason := "" + if job != nil && job.Event == types.JobEventSuccess { + fresh, err := checkSensorFreshness(ctx, d, pipelineID, job.SK) + if err != nil { + return fmt.Errorf("check sensor freshness for %q: %w", pipelineID, err) + } + if !fresh { + allowed = false + rejectReason = "previous run succeeded and no sensor data has changed" + } + } + + if !allowed { + if err := d.Store.WriteJobEvent(ctx, pipelineID, schedule, date, + types.JobEventRerunRejected, "", 0, rejectReason); err != nil { + d.Logger.Warn("failed to write rerun-rejected joblog for circuit breaker", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) + } + if err := lambda.PublishEvent(ctx, d, string(types.EventRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("rerun rejected for %s: %s", pipelineID, rejectReason)); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventRerunRejected, "error", err) + } + d.Logger.Info("rerun request rejected", + "pipelineId", pipelineID, "schedule", schedule, "date", date, + "reason", rejectReason) + return nil + } + + // Acceptance: acquire lock FIRST (before writing rerun). 
+ acquired, err := d.Store.ResetTriggerLock(ctx, pipelineID, schedule, date, lambda.ResolveTriggerLockTTL()) + if err != nil { + return fmt.Errorf("reset trigger lock for %q: %w", pipelineID, err) + } + if !acquired { + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventInfraFailure), pipelineID, schedule, date, + fmt.Sprintf("lock reset failed for rerun of %s", pipelineID)); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "error", pubErr) + } + d.Logger.Warn("failed to reset trigger lock for rerun", + "pipelineId", pipelineID, "schedule", schedule, "date", date) + return nil + } + + // Delete date-scoped postrun-baseline so re-run captures fresh baseline. + if cfg.PostRun != nil { + if err := d.Store.DeleteSensor(ctx, pipelineID, "postrun-baseline#"+date); err != nil { + d.Logger.Warn("failed to delete postrun-baseline sensor", "error", err, "pipeline", pipelineID, "date", date) + } + } + + // Write rerun record AFTER lock is confirmed. + if _, err := d.Store.WriteRerun(ctx, pipelineID, schedule, date, reason, ""); err != nil { + if relErr := d.Store.ReleaseTriggerLock(ctx, pipelineID, schedule, date); relErr != nil { + d.Logger.Warn("failed to release lock after rerun write failure", "error", relErr) + } + return fmt.Errorf("write rerun for %q: %w", pipelineID, err) + } + + if err := d.Store.WriteJobEvent(ctx, pipelineID, schedule, date, + types.JobEventRerunAccepted, "", 0, ""); err != nil { + d.Logger.Warn("failed to write rerun-accepted joblog", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) + } + + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventRerunAccepted), pipelineID, schedule, date, + fmt.Sprintf("rerun accepted for %s (reason: %s)", pipelineID, reason)); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventRerunAccepted, "error", pubErr) + } + + execName := lambda.TruncateExecName(fmt.Sprintf("%s-%s-%s-%s-rerun-%d", pipelineID, schedule, date, 
reason, d.Now().Unix())) + if err := lambda.StartSFNWithName(ctx, d, cfg, pipelineID, schedule, date, execName); err != nil { + if relErr := d.Store.ReleaseTriggerLock(ctx, pipelineID, schedule, date); relErr != nil { + d.Logger.Warn("failed to release lock after SFN start failure", "error", relErr) + } + return fmt.Errorf("start SFN rerun for %q: %w", pipelineID, err) + } + + d.Logger.Info("started rerun", + "pipelineId", pipelineID, "schedule", schedule, "date", date, "reason", reason) + return nil +} + +// parseRerunRequestSK extracts schedule and date from a RERUN_REQUEST# sort key. +func parseRerunRequestSK(sk string) (schedule, date string, err error) { + trimmed := strings.TrimPrefix(sk, "RERUN_REQUEST#") + parts := strings.SplitN(trimmed, "#", 2) + if len(parts) < 2 { + return "", "", fmt.Errorf("invalid RERUN_REQUEST SK format: %q", sk) + } + return parts[0], parts[1], nil +} + +// handleJobFailure processes a job failure or timeout. +func handleJobFailure(ctx context.Context, d *lambda.Deps, pipelineID, schedule, date, jobEvent string) error { + cfg, err := lambda.GetValidatedConfig(ctx, d, pipelineID) + if err != nil { + return fmt.Errorf("load config for %q: %w", pipelineID, err) + } + if cfg == nil { + d.Logger.Warn("no config found for pipeline, skipping rerun", "pipelineId", pipelineID) + return nil + } + + if cfg.DryRun { + return handleDryRunJobFailure(ctx, d, cfg, pipelineID, schedule, date) + } + + maxRetries := cfg.Job.MaxRetries + + latestJob, jobErr := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if jobErr != nil { + d.Logger.Warn("could not read latest job event for failure category", + "pipelineId", pipelineID, "error", jobErr) + } + if latestJob != nil { + if types.FailureCategory(latestJob.Category) == types.FailurePermanent { + maxRetries = types.IntOrDefault(cfg.Job.MaxCodeRetries, 1) + } + } + + rerunCount, err := d.Store.CountRerunsBySource(ctx, pipelineID, schedule, date, []string{"job-fail-retry"}) + if err != nil { + 
return fmt.Errorf("count reruns for %q/%s/%s: %w", pipelineID, schedule, date, err) + } + + if rerunCount >= maxRetries { + if err := lambda.PublishEvent(ctx, d, string(types.EventRetryExhausted), pipelineID, schedule, date, + fmt.Sprintf("retry limit reached (%d/%d) for %s", rerunCount, maxRetries, pipelineID)); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventRetryExhausted, "error", err) + } + + if err := d.Store.SetTriggerStatus(ctx, pipelineID, schedule, date, types.TriggerStatusFailedFinal); err != nil { + return fmt.Errorf("set trigger status FAILED_FINAL for %q: %w", pipelineID, err) + } + + d.Logger.Info("retry limit reached", + "pipelineId", pipelineID, "schedule", schedule, "date", date, + "reruns", rerunCount, "maxRetries", maxRetries) + return nil + } + + if lambda.IsExcludedDate(cfg, date) { + if err := d.Store.SetTriggerStatus(ctx, pipelineID, schedule, date, types.TriggerStatusFailedFinal); err != nil { + d.Logger.WarnContext(ctx, "failed to set trigger status after calendar exclusion", "error", err) + } + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, schedule, date, + fmt.Sprintf("job failure retry skipped for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) + } + return nil + } + + attempt, err := d.Store.WriteRerun(ctx, pipelineID, schedule, date, "job-fail-retry", jobEvent) + if err != nil { + return fmt.Errorf("write rerun for %q: %w", pipelineID, err) + } + + acquired, err := d.Store.ResetTriggerLock(ctx, pipelineID, schedule, date, lambda.ResolveTriggerLockTTL()) + if err != nil { + return fmt.Errorf("reset trigger lock for %q: %w", pipelineID, err) + } + if !acquired { + d.Logger.Warn("failed to reset trigger lock, skipping rerun", + "pipelineId", pipelineID, "schedule", schedule, "date", date) + return nil + } + + 
execName := lambda.TruncateExecName(fmt.Sprintf("%s-%s-%s-rerun-%d", pipelineID, schedule, date, attempt)) + if err := lambda.StartSFNWithName(ctx, d, cfg, pipelineID, schedule, date, execName); err != nil { + if relErr := d.Store.ReleaseTriggerLock(ctx, pipelineID, schedule, date); relErr != nil { + d.Logger.Warn("failed to release lock after SFN start failure", "error", relErr) + } + return fmt.Errorf("start SFN rerun for %q: %w", pipelineID, err) + } + + d.Logger.Info("started rerun", + "pipelineId", pipelineID, "schedule", schedule, "date", date, "attempt", attempt) + return nil +} + +// checkSensorFreshness determines whether any sensor data has been updated +// after the given job completed. +func checkSensorFreshness(ctx context.Context, d *lambda.Deps, pipelineID, jobSK string) (bool, error) { + parts := strings.Split(jobSK, "#") + if len(parts) < 4 { + return true, nil + } + jobTimestamp, err := strconv.ParseInt(parts[len(parts)-1], 10, 64) + if err != nil { + return true, nil + } + + sensors, err := d.Store.GetAllSensors(ctx, pipelineID) + if err != nil { + return false, fmt.Errorf("get sensors for %q: %w", pipelineID, err) + } + if len(sensors) == 0 { + return true, nil + } + + hasAnyUpdatedAt := false + for _, data := range sensors { + updatedAt, ok := data["updatedAt"] + if !ok { + continue + } + hasAnyUpdatedAt = true + + var ts int64 + switch v := updatedAt.(type) { + case float64: + ts = int64(v) + case int64: + ts = v + case string: + ts, err = strconv.ParseInt(v, 10, 64) + if err != nil { + continue + } + default: + continue + } + + if ts > 0 && ts < 1e12 { + ts *= 1000 + } + + if ts > jobTimestamp { + return true, nil + } + } + + if !hasAnyUpdatedAt { + return true, nil + } + + return false, nil +} + +// checkLateDataArrival detects sensor updates after a pipeline has completed. 
+func checkLateDataArrival(ctx context.Context, d *lambda.Deps, pipelineID, schedule, date string) error { + trigger, err := d.Store.GetTrigger(ctx, pipelineID, schedule, date) + if err != nil || trigger == nil { + return err + } + + if trigger.Status != types.TriggerStatusCompleted { + return nil + } + + job, err := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if err != nil || job == nil { + return err + } + + if job.Event != types.JobEventSuccess { + return nil + } + + if err := d.Store.WriteJobEvent(ctx, pipelineID, schedule, date, + types.JobEventLateDataArrival, "", 0, + "sensor updated after pipeline completed successfully"); err != nil { + d.Logger.Warn("failed to write late-data-arrival joblog", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) + } + + if err := lambda.PublishEvent(ctx, d, string(types.EventLateDataArrival), pipelineID, schedule, date, + fmt.Sprintf("late data arrival for %s: sensor updated after job completion", pipelineID)); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventLateDataArrival, "error", err) + } + + if writeErr := d.Store.WriteRerunRequest(ctx, pipelineID, schedule, date, "late-data"); writeErr != nil { + d.Logger.WarnContext(ctx, "failed to write rerun request on late data", "pipelineId", pipelineID, "error", writeErr) + } + + return nil +} diff --git a/internal/lambda/stream/sensor.go b/internal/lambda/stream/sensor.go new file mode 100644 index 0000000..f86c9fa --- /dev/null +++ b/internal/lambda/stream/sensor.go @@ -0,0 +1,146 @@ +package stream + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/aws/aws-lambda-go/events" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" +) + +// handleSensorEvent evaluates the trigger condition for a sensor write +// and starts the Step Function execution if all conditions are met. 
+func handleSensorEvent(ctx context.Context, d *lambda.Deps, pk, sk string, record events.DynamoDBEventRecord) error { + pipelineID := strings.TrimPrefix(pk, "PIPELINE#") + if pipelineID == pk { + return fmt.Errorf("unexpected PK format: %q", pk) + } + + cfg, err := lambda.GetValidatedConfig(ctx, d, pipelineID) + if err != nil { + return fmt.Errorf("load config for %q: %w", pipelineID, err) + } + if cfg == nil { + d.Logger.Warn("no config found for pipeline", "pipelineId", pipelineID) + return nil + } + + // Only process if the pipeline has a stream trigger condition. + trigger := cfg.Schedule.Trigger + if trigger == nil { + return nil + } + + // Check if this sensor key matches the trigger condition (prefix match + // allows per-period sensor keys like "hourly-status#2026-03-03T18"). + sensorKey := strings.TrimPrefix(sk, "SENSOR#") + if !strings.HasPrefix(sensorKey, trigger.Key) { + // Trigger key doesn't match — check if this sensor matches a post-run rule. + if cfg.PostRun != nil && lambda.MatchesPostRunRule(sensorKey, cfg.PostRun.Rules) { + sensorData := lambda.ExtractSensorData(record.Change.NewImage) + return handlePostRunSensorEvent(ctx, d, cfg, pipelineID, sensorKey, sensorData) + } + return nil + } + + // Extract sensor data from the stream record's NewImage. + sensorData := lambda.ExtractSensorData(record.Change.NewImage) + + // Capture current time once for consistent use across rule evaluation, + // calendar checks, and execution date resolution. + now := d.Now() + + // Build a validation rule from the trigger condition and evaluate it. + rule := types.ValidationRule{ + Key: trigger.Key, + Check: trigger.Check, + Field: trigger.Field, + Value: trigger.Value, + } + result := validation.EvaluateRule(rule, sensorData, now) + if !result.Passed { + d.Logger.Info("trigger condition not met", + "pipelineId", pipelineID, + "sensor", sensorKey, + "reason", result.Reason, + ) + return nil + } + + // Check calendar exclusions (wall-clock date). 
+ if lambda.IsExcluded(cfg, now) { + d.Logger.Info("pipeline excluded by calendar", + "pipelineId", pipelineID, + "date", now.Format("2006-01-02"), + ) + scheduleIDForEvent := lambda.ResolveScheduleID(cfg) + dateForEvent := lambda.ResolveExecutionDate(sensorData, now) + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleIDForEvent, dateForEvent, + fmt.Sprintf("sensor trigger suppressed for %s: wall-clock date excluded by calendar", pipelineID)); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) + } + return nil + } + + // Resolve schedule ID and date. + scheduleID := lambda.ResolveScheduleID(cfg) + date := lambda.ResolveExecutionDate(sensorData, now) + + // Dry-run mode: observe and record what would happen, but never start SFN. + if cfg.DryRun { + return handleDryRunTrigger(ctx, d, cfg, pipelineID, scheduleID, date, now) + } + + // Acquire trigger lock to prevent duplicate executions. + acquired, err := d.Store.AcquireTriggerLock(ctx, pipelineID, scheduleID, date, lambda.ResolveTriggerLockTTL()) + if err != nil { + return fmt.Errorf("acquire trigger lock for %q: %w", pipelineID, err) + } + if !acquired { + // Check if this is late data arriving after a completed pipeline. + if err := checkLateDataArrival(ctx, d, pipelineID, scheduleID, date); err != nil { + d.Logger.WarnContext(ctx, "late data check failed", "error", err) + } + d.Logger.InfoContext(ctx, "trigger lock already held", + "pipelineId", pipelineID, + "schedule", scheduleID, + "date", date, + ) + return nil + } + + // Record first sensor arrival time (idempotent — only writes if absent). + // This timestamp serves as T=0 for relative SLA calculation. 
+ arrivalKey := "first-sensor-arrival#" + date + if _, writeErr := d.Store.WriteSensorIfAbsent(ctx, pipelineID, arrivalKey, map[string]interface{}{ + "arrivedAt": now.UTC().Format(time.RFC3339), + }); writeErr != nil { + d.Logger.WarnContext(ctx, "failed to write first-sensor-arrival", + "pipelineId", pipelineID, "date", date, "error", writeErr) + } + + // Start Step Function execution. + if err := lambda.StartSFN(ctx, d, cfg, pipelineID, scheduleID, date); err != nil { + if relErr := d.Store.ReleaseTriggerLock(ctx, pipelineID, scheduleID, date); relErr != nil { + d.Logger.Warn("failed to release lock after SFN start failure", "error", relErr) + } + return fmt.Errorf("start SFN for %q: %w", pipelineID, err) + } + + if err := lambda.PublishEvent(ctx, d, string(types.EventJobTriggered), pipelineID, scheduleID, date, + fmt.Sprintf("stream trigger fired for %s", pipelineID)); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventJobTriggered, "error", err) + } + + d.Logger.Info("started step function execution", + "pipelineId", pipelineID, + "schedule", scheduleID, + "date", date, + ) + return nil +} diff --git a/internal/lambda/stream_router.go b/internal/lambda/stream_router.go index 0344928..22b92d9 100644 --- a/internal/lambda/stream_router.go +++ b/internal/lambda/stream_router.go @@ -3,56 +3,15 @@ package lambda import ( "context" "fmt" - "os" - "strconv" "strings" "time" "github.com/aws/aws-lambda-go/events" - "github.com/dwsmith1983/interlock/internal/validation" "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" ) -// ResolveTriggerLockTTL returns the trigger lock TTL based on the -// SFN_TIMEOUT_SECONDS env var plus a 30-minute buffer. Defaults to -// 4h30m if the env var is not set or invalid. 
-func ResolveTriggerLockTTL() time.Duration { - s := os.Getenv("SFN_TIMEOUT_SECONDS") - if s == "" { - return DefaultTriggerLockTTL - } - sec, err := strconv.Atoi(s) - if err != nil || sec <= 0 { - return DefaultTriggerLockTTL - } - return time.Duration(sec)*time.Second + TriggerLockBuffer -} - -// getValidatedConfig loads a pipeline config and validates its retry/timeout -// fields. Returns nil (with a warning log) if validation fails, signalling the -// caller to skip processing for this pipeline. -func getValidatedConfig(ctx context.Context, d *Deps, pipelineID string) (*types.PipelineConfig, error) { - cfg, err := d.ConfigCache.Get(ctx, pipelineID) - if err != nil { - return nil, err - } - if cfg == nil { - return nil, nil - } - if errs := validation.ValidatePipelineConfig(cfg); len(errs) > 0 { - d.Logger.Warn("invalid pipeline config, skipping", - "pipelineId", pipelineID, - "errors", errs, - ) - return nil, nil - } - return cfg, nil -} - -// HandleStreamEvent processes a DynamoDB stream event, routing each record -// to the appropriate handler based on the SK prefix. Per-record errors are -// collected as BatchItemFailures so the Lambda runtime can use DynamoDB's -// ReportBatchItemFailures to retry only the failed records. +// Deprecated: Use stream.HandleStreamEvent instead. Retained for test compatibility. func HandleStreamEvent(ctx context.Context, d *Deps, event StreamEvent) (events.DynamoDBEventResponse, error) { var resp events.DynamoDBEventResponse for i := range event.Records { @@ -71,7 +30,7 @@ func HandleStreamEvent(ctx context.Context, d *Deps, event StreamEvent) (events. // handleRecord extracts PK/SK and routes to the appropriate handler. 
func handleRecord(ctx context.Context, d *Deps, record events.DynamoDBEventRecord) error { - pk, sk := extractKeys(record) + pk, sk := ExtractKeys(record) if pk == "" || sk == "" { return fmt.Errorf("record missing PK or SK") } @@ -138,7 +97,7 @@ func parseJobSK(sk string) (schedule, date string, err error) { // handleJobSuccess publishes a job-completed event to EventBridge. func handleJobSuccess(ctx context.Context, d *Deps, pipelineID, schedule, date string) error { - return publishEvent(ctx, d, string(types.EventJobCompleted), pipelineID, schedule, date, + return PublishEvent(ctx, d, string(types.EventJobCompleted), pipelineID, schedule, date, fmt.Sprintf("job completed for %s", pipelineID)) } @@ -150,7 +109,7 @@ func handleSensorEvent(ctx context.Context, d *Deps, pk, sk string, record event return fmt.Errorf("unexpected PK format: %q", pk) } - cfg, err := getValidatedConfig(ctx, d, pipelineID) + cfg, err := GetValidatedConfig(ctx, d, pipelineID) if err != nil { return fmt.Errorf("load config for %q: %w", pipelineID, err) } @@ -170,19 +129,19 @@ func handleSensorEvent(ctx context.Context, d *Deps, pk, sk string, record event sensorKey := strings.TrimPrefix(sk, "SENSOR#") if !strings.HasPrefix(sensorKey, trigger.Key) { // Trigger key doesn't match — check if this sensor matches a post-run rule. - if cfg.PostRun != nil && matchesPostRunRule(sensorKey, cfg.PostRun.Rules) { - sensorData := extractSensorData(record.Change.NewImage) + if cfg.PostRun != nil && MatchesPostRunRule(sensorKey, cfg.PostRun.Rules) { + sensorData := ExtractSensorData(record.Change.NewImage) return handlePostRunSensorEvent(ctx, d, cfg, pipelineID, sensorKey, sensorData) } return nil } // Extract sensor data from the stream record's NewImage. - sensorData := extractSensorData(record.Change.NewImage) + sensorData := ExtractSensorData(record.Change.NewImage) // Capture current time once for consistent use across rule evaluation, // calendar checks, and execution date resolution. 
- now := d.now() + now := d.Now() // Build a validation rule from the trigger condition and evaluate it. rule := types.ValidationRule{ @@ -202,14 +161,14 @@ func handleSensorEvent(ctx context.Context, d *Deps, pk, sk string, record event } // Check calendar exclusions (wall-clock date). - if isExcluded(cfg, now) { + if IsExcluded(cfg, now) { d.Logger.Info("pipeline excluded by calendar", "pipelineId", pipelineID, "date", now.Format("2006-01-02"), ) - scheduleIDForEvent := resolveScheduleID(cfg) + scheduleIDForEvent := ResolveScheduleID(cfg) dateForEvent := ResolveExecutionDate(sensorData, now) - if pubErr := publishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleIDForEvent, dateForEvent, + if pubErr := PublishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleIDForEvent, dateForEvent, fmt.Sprintf("sensor trigger suppressed for %s: wall-clock date excluded by calendar", pipelineID)); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) } @@ -217,7 +176,7 @@ func handleSensorEvent(ctx context.Context, d *Deps, pk, sk string, record event } // Resolve schedule ID and date. - scheduleID := resolveScheduleID(cfg) + scheduleID := ResolveScheduleID(cfg) date := ResolveExecutionDate(sensorData, now) // Dry-run mode: observe and record what would happen, but never start SFN. @@ -254,14 +213,14 @@ func handleSensorEvent(ctx context.Context, d *Deps, pk, sk string, record event } // Start Step Function execution. 
- if err := startSFN(ctx, d, cfg, pipelineID, scheduleID, date); err != nil { + if err := StartSFN(ctx, d, cfg, pipelineID, scheduleID, date); err != nil { if relErr := d.Store.ReleaseTriggerLock(ctx, pipelineID, scheduleID, date); relErr != nil { d.Logger.Warn("failed to release lock after SFN start failure", "error", relErr) } return fmt.Errorf("start SFN for %q: %w", pipelineID, err) } - if err := publishEvent(ctx, d, string(types.EventJobTriggered), pipelineID, scheduleID, date, + if err := PublishEvent(ctx, d, string(types.EventJobTriggered), pipelineID, scheduleID, date, fmt.Sprintf("stream trigger fired for %s", pipelineID)); err != nil { d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventJobTriggered, "error", err) } diff --git a/internal/lambda/terminal.go b/internal/lambda/terminal.go new file mode 100644 index 0000000..643bcd8 --- /dev/null +++ b/internal/lambda/terminal.go @@ -0,0 +1,29 @@ +package lambda + +import ( + "context" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// IsJobTerminal checks the joblog for a terminal event (success, fail, timeout). +// Returns true if the pipeline has finished processing for the given date. 
+func IsJobTerminal(ctx context.Context, d *Deps, pipelineID, scheduleID, date string) bool { + rec, err := d.Store.GetLatestJobEvent(ctx, pipelineID, scheduleID, date) + if err != nil { + d.Logger.WarnContext(ctx, "joblog lookup failed, not suppressing", + "pipeline", pipelineID, "error", err) + return false + } + if rec == nil { + return false + } + switch rec.Event { + case types.JobEventSuccess, types.JobEventFail, types.JobEventTimeout, + types.JobEventInfraTriggerExhausted, types.JobEventValidationExhausted, + types.JobEventJobPollExhausted: + return true + default: + return false + } +} diff --git a/internal/lambda/trigger_registry.go b/internal/lambda/trigger_registry.go new file mode 100644 index 0000000..9eb20bf --- /dev/null +++ b/internal/lambda/trigger_registry.go @@ -0,0 +1,38 @@ +package lambda + +import ( + "encoding/json" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// TriggerUnmarshalers maps each trigger type to a function that unmarshals +// raw JSON into the corresponding typed field on TriggerConfig. +// Exported so that sub-packages (e.g. orchestrator) can reuse the single +// canonical registry without duplicating it. 
+var TriggerUnmarshalers = map[types.TriggerType]func([]byte, *types.TriggerConfig) error{ + types.TriggerHTTP: UnmarshalTo(func(tc *types.TriggerConfig, c *types.HTTPTriggerConfig) { tc.HTTP = c }), + types.TriggerCommand: UnmarshalTo(func(tc *types.TriggerConfig, c *types.CommandTriggerConfig) { tc.Command = c }), + types.TriggerAirflow: UnmarshalTo(func(tc *types.TriggerConfig, c *types.AirflowTriggerConfig) { tc.Airflow = c }), + types.TriggerGlue: UnmarshalTo(func(tc *types.TriggerConfig, c *types.GlueTriggerConfig) { tc.Glue = c }), + types.TriggerEMR: UnmarshalTo(func(tc *types.TriggerConfig, c *types.EMRTriggerConfig) { tc.EMR = c }), + types.TriggerEMRServerless: UnmarshalTo(func(tc *types.TriggerConfig, c *types.EMRServerlessTriggerConfig) { tc.EMRServerless = c }), + types.TriggerStepFunction: UnmarshalTo(func(tc *types.TriggerConfig, c *types.StepFunctionTriggerConfig) { tc.StepFunction = c }), + types.TriggerDatabricks: UnmarshalTo(func(tc *types.TriggerConfig, c *types.DatabricksTriggerConfig) { tc.Databricks = c }), + types.TriggerLambda: UnmarshalTo(func(tc *types.TriggerConfig, c *types.LambdaTriggerConfig) { tc.Lambda = c }), +} + +// UnmarshalTo returns an unmarshaler that decodes JSON into a typed config +// struct and assigns it to the appropriate TriggerConfig field. +// Exported so that sub-packages can reference it if they need to extend the +// registry with additional trigger types. 
+func UnmarshalTo[T any](assign func(*types.TriggerConfig, *T)) func([]byte, *types.TriggerConfig) error { + return func(data []byte, tc *types.TriggerConfig) error { + var c T + if err := json.Unmarshal(data, &c); err != nil { + return err + } + assign(tc, &c) + return nil + } +} diff --git a/internal/lambda/watchdog.go b/internal/lambda/watchdog.go index 4d0658c..08c4b38 100644 --- a/internal/lambda/watchdog.go +++ b/internal/lambda/watchdog.go @@ -1,34 +1,46 @@ package lambda -import "context" +import ( + "context" + "errors" + "fmt" + "strings" -// HandleWatchdog runs periodic health checks. It detects stale trigger -// executions (Step Function timeouts) and missed cron schedules. Errors from -// each check are logged but do not prevent the other check from running. + "github.com/dwsmith1983/interlock/pkg/types" +) + +// Deprecated: Use watchdog.HandleWatchdog instead. Retained for test compatibility. func HandleWatchdog(ctx context.Context, d *Deps) error { - if err := detectStaleTriggers(ctx, d); err != nil { - d.Logger.Error("stale trigger detection failed", "error", err) - } - if err := detectMissedSchedules(ctx, d); err != nil { - d.Logger.Error("missed schedule detection failed", "error", err) - } - if err := detectMissedInclusionSchedules(ctx, d); err != nil { - d.Logger.Error("missed inclusion schedule detection failed", "error", err) - } - if err := reconcileSensorTriggers(ctx, d); err != nil { - d.Logger.Error("sensor trigger reconciliation failed", "error", err) + checks := []struct { + name string + fn func(context.Context, *Deps) error + }{ + {"stale-triggers", detectStaleTriggers}, + {"missed-schedules", detectMissedSchedules}, + {"missed-inclusion-schedules", detectMissedInclusionSchedules}, + {"sensor-trigger-reconciliation", reconcileSensorTriggers}, + {"sla-scheduling", scheduleSLAAlerts}, + {"trigger-deadlines", checkTriggerDeadlines}, + {"post-run-sensors", detectMissingPostRunSensors}, + {"relative-sla-breaches", 
detectRelativeSLABreaches}, } - if err := scheduleSLAAlerts(ctx, d); err != nil { - d.Logger.Error("proactive SLA scheduling failed", "error", err) - } - if err := checkTriggerDeadlines(ctx, d); err != nil { - d.Logger.Error("trigger deadline check failed", "error", err) - } - if err := detectMissingPostRunSensors(ctx, d); err != nil { - d.Logger.Error("post-run sensor absence detection failed", "error", err) + + var errs []error + var failed []string + for _, c := range checks { + if err := c.fn(ctx, d); err != nil { + d.Logger.Error(c.name+" failed", "error", err) + errs = append(errs, fmt.Errorf("%s: %w", c.name, err)) + failed = append(failed, c.name) + } } - if err := detectRelativeSLABreaches(ctx, d); err != nil { - d.Logger.Error("relative SLA breach detection failed", "error", err) + + if len(failed) > 0 { + if pubErr := PublishEvent(ctx, d, string(types.EventWatchdogDegraded), "", "", "", + fmt.Sprintf("watchdog checks failed: %s", strings.Join(failed, ", "))); pubErr != nil { + d.Logger.Error("failed to publish watchdog degraded event", "error", pubErr) + } + return errors.Join(errs...) } return nil } diff --git a/internal/lambda/watchdog/handler.go b/internal/lambda/watchdog/handler.go new file mode 100644 index 0000000..bde5472 --- /dev/null +++ b/internal/lambda/watchdog/handler.go @@ -0,0 +1,50 @@ +// Package watchdog implements the periodic health-check Lambda handler. +// It detects stale triggers, missed schedules, missing post-run sensors, +// and SLA breaches. +package watchdog + +import ( + "context" + "errors" + "fmt" + "strings" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// HandleWatchdog runs periodic health checks. 
+func HandleWatchdog(ctx context.Context, d *lambda.Deps) error { + checks := []struct { + name string + fn func(context.Context, *lambda.Deps) error + }{ + {"stale-triggers", detectStaleTriggers}, + {"missed-schedules", detectMissedSchedules}, + {"missed-inclusion-schedules", detectMissedInclusionSchedules}, + {"sensor-trigger-reconciliation", reconcileSensorTriggers}, + {"sla-scheduling", scheduleSLAAlerts}, + {"trigger-deadlines", checkTriggerDeadlines}, + {"post-run-sensors", detectMissingPostRunSensors}, + {"relative-sla-breaches", detectRelativeSLABreaches}, + } + + var errs []error + var failed []string + for _, c := range checks { + if err := c.fn(ctx, d); err != nil { + d.Logger.Error(c.name+" failed", "error", err) + errs = append(errs, fmt.Errorf("%s: %w", c.name, err)) + failed = append(failed, c.name) + } + } + + if len(failed) > 0 { + if pubErr := lambda.PublishEvent(ctx, d, string(types.EventWatchdogDegraded), "", "", "", + fmt.Sprintf("watchdog checks failed: %s", strings.Join(failed, ", "))); pubErr != nil { + d.Logger.Error("failed to publish watchdog degraded event", "error", pubErr) + } + return errors.Join(errs...) + } + return nil +} diff --git a/internal/lambda/watchdog/missed.go b/internal/lambda/watchdog/missed.go new file mode 100644 index 0000000..557eed6 --- /dev/null +++ b/internal/lambda/watchdog/missed.go @@ -0,0 +1,199 @@ +package watchdog + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// lastCronFire returns the most recent expected fire time for a cron expression. 
+func lastCronFire(cron string, now time.Time, loc *time.Location) time.Time { + fields := strings.Fields(cron) + if len(fields) < 5 { + return time.Time{} + } + minute, err := strconv.Atoi(fields[0]) + if err != nil { + return time.Time{} + } + localNow := now.In(loc) + + if fields[1] == "*" { + candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), + localNow.Hour(), minute, 0, 0, loc) + if candidate.After(localNow) { + candidate = candidate.Add(-time.Hour) + } + return candidate + } + + hour, err := strconv.Atoi(fields[1]) + if err != nil { + return time.Time{} + } + candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), + hour, minute, 0, 0, loc) + if candidate.After(localNow) { + candidate = candidate.Add(-24 * time.Hour) + } + return candidate +} + +// detectMissedSchedules checks all cron-scheduled pipelines. +func detectMissedSchedules(ctx context.Context, d *lambda.Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.Now() + today := now.Format("2006-01-02") + + for id, cfg := range configs { + if cfg.Schedule.Cron == "" { + continue + } + + if cfg.DryRun { + continue + } + + if lambda.IsExcluded(cfg, now) { + continue + } + + if !d.StartedAt.IsZero() { + loc := lambda.ResolveTimezone(cfg.Schedule.Timezone) + if lastFire := lastCronFire(cfg.Schedule.Cron, now, loc); !lastFire.IsZero() && lastFire.Before(d.StartedAt) { + continue + } + } + + scheduleID := lambda.ResolveScheduleID(cfg) + + found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, today) + if err != nil { + d.Logger.Error("failed to check trigger for missed schedule", + "pipelineId", id, "error", err) + continue + } + if found { + continue + } + + if cfg.Schedule.Time != "" { + loc := lambda.ResolveTimezone(cfg.Schedule.Timezone) + localNow := now.In(loc) + expectedStart, err := time.ParseInLocation("2006-01-02 15:04", today+" "+cfg.Schedule.Time, loc) + if err == nil 
&& localNow.Before(expectedStart) { + continue + } + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "cron": cfg.Schedule.Cron, + "actionHint": fmt.Sprintf("cron %s expected to fire — no trigger found", cfg.Schedule.Cron), + } + if cfg.Schedule.Time != "" { + alertDetail["expectedTime"] = cfg.Schedule.Time + } + if err := lambda.PublishEvent(ctx, d, string(types.EventScheduleMissed), id, scheduleID, today, + fmt.Sprintf("missed schedule for %s on %s", id, today), alertDetail); err != nil { + d.Logger.Warn("failed to publish missed schedule event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) + } + + d.Logger.Info("detected missed schedule", + "pipelineId", id, "schedule", scheduleID, "date", today) + } + return nil +} + +// detectMissedInclusionSchedules checks pipelines with inclusion calendar config. +func detectMissedInclusionSchedules(ctx context.Context, d *lambda.Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.Now() + + for id, cfg := range configs { + if cfg.Schedule.Include == nil || len(cfg.Schedule.Include.Dates) == 0 { + continue + } + + if cfg.DryRun { + continue + } + + if lambda.IsExcluded(cfg, now) { + continue + } + + pastDates := lambda.PastInclusionDates(cfg.Schedule.Include.Dates, now) + if len(pastDates) == 0 { + continue + } + + scheduleID := lambda.ResolveScheduleID(cfg) + + tzLoc := lambda.ResolveTimezone(cfg.Schedule.Timezone) + today := now.In(tzLoc).Format("2006-01-02") + + for _, date := range pastDates { + if cfg.Schedule.Time != "" && date == today { + localNow := now.In(tzLoc) + expectedStart, err := time.ParseInLocation("2006-01-02 15:04", date+" "+cfg.Schedule.Time, tzLoc) + if err == nil && localNow.Before(expectedStart) { + continue + } + } + + found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) + if err != nil { + d.Logger.Error("failed to check trigger for inclusion schedule", + 
"pipelineId", id, "date", date, "error", err) + continue + } + if found { + continue + } + + dedupKey := "irregular-missed-check#" + date + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed for inclusion schedule", + "pipelineId", id, "date", date, "error", err) + continue + } + if dedupData != nil { + continue + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "actionHint": fmt.Sprintf("inclusion date %s expected to have a trigger — none found", date), + } + if err := lambda.PublishEvent(ctx, d, string(types.EventIrregularScheduleMissed), id, scheduleID, date, + fmt.Sprintf("missed inclusion schedule for %s on %s", id, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish irregular schedule missed event", "error", err, "pipeline", id, "date", date) + } + + if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ + "alerted": "true", + }); err != nil { + d.Logger.Warn("failed to write inclusion dedup marker", "error", err, "pipeline", id, "date", date) + } + + d.Logger.Info("detected missed inclusion schedule", + "pipelineId", id, "schedule", scheduleID, "date", date) + } + } + return nil +} diff --git a/internal/lambda/watchdog/postrun.go b/internal/lambda/watchdog/postrun.go new file mode 100644 index 0000000..8c75460 --- /dev/null +++ b/internal/lambda/watchdog/postrun.go @@ -0,0 +1,190 @@ +package watchdog + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +const defaultSensorTimeout = 2 * time.Hour + +// detectMissingPostRunSensors checks pipelines with PostRun config for missing +// post-run sensor data. 
+func detectMissingPostRunSensors(ctx context.Context, d *lambda.Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.Now() + today := now.Format("2006-01-02") + + for id, cfg := range configs { + if cfg.PostRun == nil || len(cfg.PostRun.Rules) == 0 { + continue + } + + if cfg.DryRun { + continue + } + + scheduleID := lambda.ResolveScheduleID(cfg) + + tr, err := d.Store.GetTrigger(ctx, id, scheduleID, today) + if err != nil { + d.Logger.Error("trigger lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if tr == nil || tr.Status != types.TriggerStatusCompleted { + continue + } + + baselineKey := "postrun-baseline#" + today + baseline, err := d.Store.GetSensorData(ctx, id, baselineKey) + if err != nil { + d.Logger.Error("baseline lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if baseline == nil { + continue + } + + dedupKey := "postrun-check#" + today + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if dedupData != nil { + continue + } + + completionTime, err := resolveCompletionTime(ctx, d, id, scheduleID, today) + if err != nil { + d.Logger.Error("completion time resolution failed", + "pipelineId", id, "error", err) + continue + } + if completionTime.IsZero() { + continue + } + + timeout := parseSensorTimeout(cfg.PostRun.SensorTimeout) + + if now.Before(completionTime.Add(timeout)) { + continue + } + + sensors, err := d.Store.GetAllSensors(ctx, id) + if err != nil { + d.Logger.Error("sensor lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + + if hasPostRunSensorUpdate(cfg.PostRun.Rules, sensors, completionTime) { + continue + } + + ruleKeys := make([]string, 0, len(cfg.PostRun.Rules)) + for _, r := range 
cfg.PostRun.Rules { + ruleKeys = append(ruleKeys, r.Key) + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "sensorTimeout": cfg.PostRun.SensorTimeout, + "ruleKeys": strings.Join(ruleKeys, ", "), + "actionHint": "post-run sensor data has not arrived within the expected timeout", + } + if err := lambda.PublishEvent(ctx, d, string(types.EventPostRunSensorMissing), id, scheduleID, today, + fmt.Sprintf("post-run sensor missing for %s on %s", id, today), alertDetail); err != nil { + d.Logger.Warn("failed to publish post-run sensor missing event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) + } + + if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ + "alerted": "true", + }); err != nil { + d.Logger.Warn("failed to write post-run dedup marker", "error", err, "pipeline", id, "date", today) + } + + d.Logger.Info("detected missing post-run sensor", + "pipelineId", id, "schedule", scheduleID, "date", today) + } + return nil +} + +func resolveCompletionTime(ctx context.Context, d *lambda.Deps, pipelineID, scheduleID, date string) (time.Time, error) { + rec, err := d.Store.GetLatestJobEvent(ctx, pipelineID, scheduleID, date) + if err != nil { + return time.Time{}, fmt.Errorf("get latest job event: %w", err) + } + if rec == nil { + return time.Time{}, nil + } + if rec.Event != types.JobEventSuccess { + return time.Time{}, nil + } + + parts := strings.Split(rec.SK, "#") + if len(parts) < 4 { + return time.Time{}, fmt.Errorf("unexpected job SK format: %q", rec.SK) + } + tsMillis, err := strconv.ParseInt(parts[len(parts)-1], 10, 64) + if err != nil { + return time.Time{}, fmt.Errorf("parse job timestamp %q: %w", parts[len(parts)-1], err) + } + return time.UnixMilli(tsMillis), nil +} + +func parseSensorTimeout(s string) time.Duration { + if s == "" { + return defaultSensorTimeout + } + d, err := time.ParseDuration(s) + if err != nil { + return defaultSensorTimeout + } + return d +} + +func 
hasPostRunSensorUpdate(rules []types.ValidationRule, sensors map[string]map[string]interface{}, completionTime time.Time) bool { + completionMillis := completionTime.UnixMilli() + + for _, rule := range rules { + data, ok := sensors[rule.Key] + if !ok { + continue + } + + updatedAt, ok := data["updatedAt"] + if !ok { + continue + } + + var ts int64 + switch v := updatedAt.(type) { + case float64: + ts = int64(v) + case int64: + ts = v + case string: + ts, _ = strconv.ParseInt(v, 10, 64) + default: + continue + } + + if ts > completionMillis { + return true + } + } + return false +} diff --git a/internal/lambda/watchdog/relative_sla.go b/internal/lambda/watchdog/relative_sla.go new file mode 100644 index 0000000..6258db5 --- /dev/null +++ b/internal/lambda/watchdog/relative_sla.go @@ -0,0 +1,125 @@ +package watchdog + +import ( + "context" + "fmt" + "time" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// detectRelativeSLABreaches checks pipelines with MaxDuration SLA config. 
// detectRelativeSLABreaches scans pipelines that declare a relative SLA
// (SLA.MaxDuration) and, for today and yesterday, alerts when the pipeline
// has not reached a terminal state within MaxDuration of its first sensor
// arrival. Per-pipeline problems are logged and skipped; the scan itself
// only fails if configs cannot be loaded.
func detectRelativeSLABreaches(ctx context.Context, d *lambda.Deps) error {
	configs, err := d.ConfigCache.GetAll(ctx)
	if err != nil {
		return fmt.Errorf("load configs: %w", err)
	}

	now := d.Now()
	// Check both today and yesterday so breaches that straddle midnight are
	// still caught on the next watchdog pass.
	datesToCheck := []string{
		now.Format("2006-01-02"),
		now.AddDate(0, 0, -1).Format("2006-01-02"),
	}

	for id, cfg := range configs {
		// Only pipelines with a relative SLA configured participate.
		if cfg.SLA == nil || cfg.SLA.MaxDuration == "" {
			continue
		}

		if cfg.DryRun {
			continue
		}

		maxDur, err := time.ParseDuration(cfg.SLA.MaxDuration)
		if err != nil {
			d.Logger.Warn("invalid maxDuration in SLA config",
				"pipelineId", id, "maxDuration", cfg.SLA.MaxDuration, "error", err)
			continue
		}

		scheduleID := lambda.ResolveScheduleID(cfg)

		for _, checkDate := range datesToCheck {
			checkRelativeSLAForDate(ctx, d, id, cfg, scheduleID, checkDate, maxDur, now)
		}
	}
	return nil
}

// checkRelativeSLAForDate evaluates one pipeline+date against its relative
// SLA. It publishes a RELATIVE_SLA_BREACH alert only when ALL of the
// following hold: a first-sensor-arrival marker exists with a parseable
// arrivedAt, maxDur has elapsed since that arrival, the trigger is not
// already in a final state, the joblog shows no terminal event, and no
// dedup marker has been written yet. All lookup failures are logged and
// abort silently — this is a best-effort check rerun on every watchdog pass.
func checkRelativeSLAForDate(ctx context.Context, d *lambda.Deps, id string, cfg *types.PipelineConfig, scheduleID, checkDate string, maxDur time.Duration, now time.Time) {
	arrivalKey := "first-sensor-arrival#" + checkDate
	arrivalData, err := d.Store.GetSensorData(ctx, id, arrivalKey)
	if err != nil {
		d.Logger.Error("first-sensor-arrival lookup failed",
			"pipelineId", id, "date", checkDate, "error", err)
		return
	}
	if arrivalData == nil {
		// No sensor arrived for this date, so the relative clock never started.
		return
	}

	arrivedAtStr, ok := arrivalData["arrivedAt"].(string)
	if !ok || arrivedAtStr == "" {
		return
	}
	arrivedAt, err := time.Parse(time.RFC3339, arrivedAtStr)
	if err != nil {
		d.Logger.Warn("invalid arrivedAt in first-sensor-arrival",
			"pipelineId", id, "arrivedAt", arrivedAtStr, "error", err)
		return
	}

	// The SLA clock: a breach occurs maxDur after the first sensor arrival.
	breachAt := arrivedAt.Add(maxDur)
	if now.Before(breachAt) {
		return
	}

	// If the pipeline already finished (trigger in a final state, or joblog
	// shows a terminal event), the elapsed time is moot.
	tr, err := d.Store.GetTrigger(ctx, id, scheduleID, checkDate)
	if err != nil {
		d.Logger.Warn("trigger lookup failed in relative SLA check",
			"pipelineId", id, "date", checkDate, "error", err)
		return
	}
	if tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal) {
		return
	}
	if lambda.IsJobTerminal(ctx, d, id, scheduleID, checkDate) {
		return
	}

	// Alert at most once per pipeline+date.
	dedupKey := "relative-sla-breach-check#" + checkDate
	dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey)
	if err != nil {
		d.Logger.Error("dedup marker lookup failed for relative SLA breach",
			"pipelineId", id, "date", checkDate, "error", err)
		return
	}
	if dedupData != nil {
		return
	}

	alertDetail := map[string]interface{}{
		"source":          "watchdog",
		"maxDuration":     cfg.SLA.MaxDuration,
		"sensorArrivalAt": arrivedAtStr,
		"breachAt":        breachAt.UTC().Format(time.RFC3339),
		"actionHint":      "relative SLA breached — pipeline has exceeded maxDuration since first sensor arrival",
	}
	if err := lambda.PublishEvent(ctx, d, string(types.EventRelativeSLABreach), id, scheduleID, checkDate,
		fmt.Sprintf("relative SLA breach for %s on %s", id, checkDate), alertDetail); err != nil {
		d.Logger.Warn("failed to publish relative SLA breach event",
			"error", err, "pipeline", id, "date", checkDate)
	}

	// Best-effort dedup marker; failure only risks a duplicate alert later.
	if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{
		"alerted": "true",
	}); err != nil {
		d.Logger.Warn("failed to write relative SLA breach dedup marker",
			"error", err, "pipeline", id, "date", checkDate)
	}

	d.Logger.Info("detected relative SLA breach",
		"pipelineId", id, "schedule", scheduleID, "date", checkDate,
		"sensorArrivalAt", arrivedAtStr, "breachAt", breachAt.UTC().Format(time.RFC3339))
}
configs. +func scheduleSLAAlerts(ctx context.Context, d *lambda.Deps) error { + if d.Scheduler == nil { + return nil + } + + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.Now() + + for id, cfg := range configs { + if cfg.SLA == nil { + continue + } + + if cfg.DryRun { + continue + } + + if lambda.IsExcluded(cfg, now) { + continue + } + + scheduleID := lambda.ResolveScheduleID(cfg) + date := resolveWatchdogSLADate(cfg, now) + + slaDate := date + if cfg.Schedule.Cron == "" && !strings.HasPrefix(cfg.SLA.Deadline, ":") { + t, err := time.Parse("2006-01-02", date) + if err == nil { + slaDate = t.AddDate(0, 0, 1).Format("2006-01-02") + } + } + + tr, err := d.Store.GetTrigger(ctx, id, scheduleID, date) + switch { + case err != nil: + d.Logger.Warn("trigger lookup failed in SLA scheduling", "pipelineId", id, "error", err) + continue + case tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal): + continue + case lambda.IsJobTerminal(ctx, d, id, scheduleID, date): + continue + } + + calc, err := lambda.HandleSLACalculate(lambda.SLAMonitorInput{ + Mode: "calculate", + PipelineID: id, + ScheduleID: scheduleID, + Date: slaDate, + Deadline: cfg.SLA.Deadline, + ExpectedDuration: cfg.SLA.ExpectedDuration, + Timezone: cfg.SLA.Timezone, + }, now) + if err != nil { + d.Logger.Error("SLA calculate failed", "pipelineId", id, "error", err) + continue + } + + breachAt, _ := time.Parse(time.RFC3339, calc.BreachAt) + if breachAt.IsZero() || breachAt.After(now) { + scheduleErr := false + if err := lambda.CreateSLASchedules(ctx, d, id, scheduleID, date, calc, true); err != nil { + d.Logger.Error("create SLA schedule failed", + "pipelineId", id, "error", err) + scheduleErr = true + } + + if !scheduleErr { + d.Logger.Info("proactive SLA schedules ensured", + "pipelineId", id, "date", date, + "warningAt", calc.WarningAt, "breachAt", calc.BreachAt) + } + } + } + return 
nil +} + +// checkTriggerDeadlines evaluates trigger deadlines independently of SLA config. +func checkTriggerDeadlines(ctx context.Context, d *lambda.Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.Now() + + for id, cfg := range configs { + if cfg.Schedule.Trigger == nil || cfg.Schedule.Trigger.Deadline == "" { + continue + } + + if cfg.DryRun { + continue + } + + if lambda.IsExcluded(cfg, now) { + continue + } + + scheduleID := lambda.ResolveScheduleID(cfg) + triggerDate := resolveTriggerDeadlineDate(cfg, now) + + triggerRec, err := d.Store.GetTrigger(ctx, id, scheduleID, triggerDate) + if err != nil { + d.Logger.Warn("trigger lookup failed in deadline check", "pipelineId", id, "error", err) + continue + } + if triggerRec != nil { + continue + } + + if lambda.IsJobTerminal(ctx, d, id, scheduleID, triggerDate) { + continue + } + + closeSensorTriggerWindow(ctx, d, id, scheduleID, triggerDate, cfg, now) + } + return nil +} + +func resolveWatchdogSLADate(cfg *types.PipelineConfig, now time.Time) string { + if strings.HasPrefix(cfg.SLA.Deadline, ":") { + prev := now.Add(-time.Hour) + return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) + } + return now.Format("2006-01-02") +} + +func resolveTriggerDeadlineDate(cfg *types.PipelineConfig, now time.Time) string { + if strings.HasPrefix(cfg.Schedule.Trigger.Deadline, ":") { + prev := now.Add(-time.Hour) + return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) + } + return now.Format("2006-01-02") +} + +func resolveTriggerDeadlineTime(deadline, date, timezone string) time.Time { + loc := lambda.ResolveTimezone(timezone) + + if strings.HasPrefix(deadline, ":") { + minute, err := strconv.Atoi(strings.TrimPrefix(deadline, ":")) + if err != nil { + return time.Time{} + } + if len(date) < 13 || date[10] != 'T' { + return time.Time{} + } + t, err := time.ParseInLocation("2006-01-02T15", date, loc) + 
if err != nil { + return time.Time{} + } + return time.Date(t.Year(), t.Month(), t.Day(), t.Hour()+1, minute, 0, 0, loc) + } + + parts := strings.SplitN(deadline, ":", 2) + if len(parts) != 2 { + return time.Time{} + } + hour, err := strconv.Atoi(parts[0]) + if err != nil { + return time.Time{} + } + minute, err := strconv.Atoi(parts[1]) + if err != nil { + return time.Time{} + } + t, err := time.ParseInLocation("2006-01-02", date, loc) + if err != nil { + return time.Time{} + } + return time.Date(t.Year(), t.Month(), t.Day(), hour, minute, 0, 0, loc) +} + +func closeSensorTriggerWindow(ctx context.Context, d *lambda.Deps, pipelineID, scheduleID, date string, cfg *types.PipelineConfig, now time.Time) { + tz := cfg.Schedule.Timezone + if tz == "" && cfg.SLA != nil { + tz = cfg.SLA.Timezone + } + triggerDeadline := resolveTriggerDeadlineTime(cfg.Schedule.Trigger.Deadline, date, tz) + if triggerDeadline.IsZero() || triggerDeadline.After(now) { + return + } + + created, err := d.Store.CreateTriggerIfAbsent(ctx, pipelineID, scheduleID, date, types.TriggerStatusFailedFinal) + if err != nil { + d.Logger.Error("failed to write FAILED_FINAL for expired trigger deadline", + "pipelineId", pipelineID, "schedule", scheduleID, "date", date, "error", err) + return + } + if !created { + d.Logger.Info("trigger appeared during deadline check, skipping window close", + "pipelineId", pipelineID, "schedule", scheduleID, "date", date) + return + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "triggerDeadline": cfg.Schedule.Trigger.Deadline, + "actionHint": "auto-trigger window closed — use RERUN_REQUEST to restart", + } + if err := lambda.PublishEvent(ctx, d, string(types.EventSensorDeadlineExpired), pipelineID, scheduleID, date, + fmt.Sprintf("trigger deadline expired for %s/%s/%s", pipelineID, scheduleID, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish sensor deadline expired event", "error", err, "pipeline", pipelineID) + } + + 
d.Logger.Info("sensor trigger window closed", + "pipelineId", pipelineID, "schedule", scheduleID, "date", date, + "triggerDeadline", cfg.Schedule.Trigger.Deadline) +} diff --git a/internal/lambda/watchdog/stale.go b/internal/lambda/watchdog/stale.go new file mode 100644 index 0000000..16ca8dd --- /dev/null +++ b/internal/lambda/watchdog/stale.go @@ -0,0 +1,186 @@ +package watchdog + +import ( + "context" + "fmt" + "strings" + "time" + + lambda "github.com/dwsmith1983/interlock/internal/lambda" + "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" +) + +// detectStaleTriggers scans for TRIGGER# rows with status=RUNNING and +// publishes an SFN_TIMEOUT event for any that have exceeded their TTL. +func detectStaleTriggers(ctx context.Context, d *lambda.Deps) error { + triggers, err := d.Store.ScanRunningTriggers(ctx) + if err != nil { + return fmt.Errorf("scan running triggers: %w", err) + } + + now := d.Now() + for _, tr := range triggers { + if !isStaleTrigger(tr, now) { + continue + } + + pipelineID, schedule, date, err := parseTriggerRecord(tr) + if err != nil { + d.Logger.Warn("skipping unparseable trigger", "pk", tr.PK, "sk", tr.SK, "error", err) + continue + } + + if cfg, cfgErr := d.ConfigCache.Get(ctx, pipelineID); cfgErr == nil && cfg != nil && cfg.DryRun { + continue + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "actionHint": "step function exceeded TTL — check SFN execution history", + } + if tr.TTL > 0 { + alertDetail["ttlExpired"] = time.Unix(tr.TTL, 0).UTC().Format(time.RFC3339) + } + if err := lambda.PublishEvent(ctx, d, string(types.EventSFNTimeout), pipelineID, schedule, date, + fmt.Sprintf("step function timed out for %s/%s/%s", pipelineID, schedule, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish SFN timeout event", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) + } + + if err := d.Store.SetTriggerStatus(ctx, pipelineID, schedule, 
date, types.TriggerStatusFailedFinal); err != nil { + d.Logger.Error("failed to set trigger status to FAILED_FINAL", + "pipelineId", pipelineID, "schedule", schedule, "date", date, "error", err) + continue + } + + d.Logger.Info("detected stale trigger", + "pipelineId", pipelineID, "schedule", schedule, "date", date) + } + return nil +} + +func isStaleTrigger(tr types.ControlRecord, now time.Time) bool { + if tr.TTL > 0 { + return now.Unix() > tr.TTL + } + return false +} + +func parseTriggerRecord(tr types.ControlRecord) (pipelineID, schedule, date string, err error) { + const pkPrefix = "PIPELINE#" + if !strings.HasPrefix(tr.PK, pkPrefix) { + return "", "", "", fmt.Errorf("unexpected PK format: %q", tr.PK) + } + pipelineID = tr.PK[len(pkPrefix):] + + const skPrefix = "TRIGGER#" + trimmed := strings.TrimPrefix(tr.SK, skPrefix) + if trimmed == tr.SK { + return "", "", "", fmt.Errorf("unexpected SK format: %q", tr.SK) + } + parts := strings.SplitN(trimmed, "#", 2) + if len(parts) != 2 { + return "", "", "", fmt.Errorf("invalid TRIGGER SK format: %q", tr.SK) + } + return pipelineID, parts[0], parts[1], nil +} + +// reconcileSensorTriggers re-evaluates trigger conditions for sensor-triggered +// pipelines. Self-heals missed triggers caused by silent completion-write failures. 
+func reconcileSensorTriggers(ctx context.Context, d *lambda.Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.Now() + + for id, cfg := range configs { + trigger := cfg.Schedule.Trigger + if trigger == nil || cfg.Schedule.Cron != "" { + continue + } + + if cfg.DryRun { + continue + } + + if lambda.IsExcluded(cfg, now) { + continue + } + + sensors, err := d.Store.GetAllSensors(ctx, id) + if err != nil { + d.Logger.Error("failed to get sensors for reconciliation", + "pipelineId", id, "error", err) + continue + } + + scheduleID := lambda.ResolveScheduleID(cfg) + + for sensorKey, sensorData := range sensors { + if !strings.HasPrefix(sensorKey, trigger.Key) { + continue + } + + rule := types.ValidationRule{ + Key: trigger.Key, + Check: trigger.Check, + Field: trigger.Field, + Value: trigger.Value, + } + result := validation.EvaluateRule(rule, sensorData, now) + if !result.Passed { + continue + } + + date := lambda.ResolveExecutionDate(sensorData, now) + + found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) + if err != nil { + d.Logger.Error("trigger check failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + if found { + continue + } + + if lambda.IsJobTerminal(ctx, d, id, scheduleID, date) { + continue + } + + acquired, err := d.Store.AcquireTriggerLock(ctx, id, scheduleID, date, lambda.ResolveTriggerLockTTL()) + if err != nil { + d.Logger.Error("lock acquisition failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + if !acquired { + continue + } + + if err := lambda.StartSFN(ctx, d, cfg, id, scheduleID, date); err != nil { + if relErr := d.Store.ReleaseTriggerLock(ctx, id, scheduleID, date); relErr != nil { + d.Logger.Warn("failed to release lock after SFN start failure during reconciliation", "error", relErr) + } + d.Logger.Error("SFN start failed during reconciliation", + 
"pipelineId", id, "date", date, "error", err) + continue + } + + alertDetail := map[string]interface{}{ + "source": "reconciliation", + "actionHint": "watchdog recovered missed sensor trigger", + } + if err := lambda.PublishEvent(ctx, d, string(types.EventTriggerRecovered), id, scheduleID, date, + fmt.Sprintf("trigger recovered for %s/%s/%s", id, scheduleID, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish trigger recovered event", "error", err, "pipeline", id, "schedule", scheduleID, "date", date) + } + + d.Logger.Info("recovered missed trigger", + "pipelineId", id, "schedule", scheduleID, "date", date) + } + } + return nil +} diff --git a/internal/lambda/watchdog_missed.go b/internal/lambda/watchdog_missed.go index cd94130..95c708b 100644 --- a/internal/lambda/watchdog_missed.go +++ b/internal/lambda/watchdog_missed.go @@ -56,7 +56,7 @@ func detectMissedSchedules(ctx context.Context, d *Deps) error { return fmt.Errorf("load configs: %w", err) } - now := d.now() + now := d.Now() today := now.Format("2006-01-02") for id, cfg := range configs { @@ -71,21 +71,21 @@ func detectMissedSchedules(ctx context.Context, d *Deps) error { } // Skip calendar-excluded days. - if isExcluded(cfg, now) { + if IsExcluded(cfg, now) { continue } // Only alert for schedules that should have fired after this Lambda // started. Prevents retroactive alerts after fresh deploys. if !d.StartedAt.IsZero() { - loc := resolveTimezone(cfg.Schedule.Timezone) + loc := ResolveTimezone(cfg.Schedule.Timezone) if lastFire := lastCronFire(cfg.Schedule.Cron, now, loc); !lastFire.IsZero() && lastFire.Before(d.StartedAt) { continue } } // Resolve schedule ID for cron pipelines. - scheduleID := resolveScheduleID(cfg) + scheduleID := ResolveScheduleID(cfg) // Check if any TRIGGER# row exists for today (covers both daily // and per-hour trigger rows, e.g. "2026-03-04" and "2026-03-04T00"). 
@@ -102,7 +102,7 @@ func detectMissedSchedules(ctx context.Context, d *Deps) error { // Check if we are past the expected start time. If the pipeline // has a schedule time configured, only alert after that time. if cfg.Schedule.Time != "" { - loc := resolveTimezone(cfg.Schedule.Timezone) + loc := ResolveTimezone(cfg.Schedule.Timezone) localNow := now.In(loc) expectedStart, err := time.ParseInLocation("2006-01-02 15:04", today+" "+cfg.Schedule.Time, loc) if err == nil && localNow.Before(expectedStart) { @@ -118,7 +118,7 @@ func detectMissedSchedules(ctx context.Context, d *Deps) error { if cfg.Schedule.Time != "" { alertDetail["expectedTime"] = cfg.Schedule.Time } - if err := publishEvent(ctx, d, string(types.EventScheduleMissed), id, scheduleID, today, + if err := PublishEvent(ctx, d, string(types.EventScheduleMissed), id, scheduleID, today, fmt.Sprintf("missed schedule for %s on %s", id, today), alertDetail); err != nil { d.Logger.Warn("failed to publish missed schedule event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) } @@ -143,7 +143,7 @@ func detectMissedInclusionSchedules(ctx context.Context, d *Deps) error { return fmt.Errorf("load configs: %w", err) } - now := d.now() + now := d.Now() for id, cfg := range configs { if cfg.Schedule.Include == nil || len(cfg.Schedule.Include.Dates) == 0 { @@ -156,7 +156,7 @@ func detectMissedInclusionSchedules(ctx context.Context, d *Deps) error { } // Skip calendar-excluded days. - if isExcluded(cfg, now) { + if IsExcluded(cfg, now) { continue } @@ -165,11 +165,11 @@ func detectMissedInclusionSchedules(ctx context.Context, d *Deps) error { continue } - scheduleID := resolveScheduleID(cfg) + scheduleID := ResolveScheduleID(cfg) // Resolve today in the pipeline's timezone so the grace-period // guard fires correctly when UTC date != pipeline-local date. 
- tzLoc := resolveTimezone(cfg.Schedule.Timezone) + tzLoc := ResolveTimezone(cfg.Schedule.Timezone) today := now.In(tzLoc).Format("2006-01-02") for _, date := range pastDates { @@ -214,7 +214,7 @@ func detectMissedInclusionSchedules(ctx context.Context, d *Deps) error { "source": "watchdog", "actionHint": fmt.Sprintf("inclusion date %s expected to have a trigger — none found", date), } - if err := publishEvent(ctx, d, string(types.EventIrregularScheduleMissed), id, scheduleID, date, + if err := PublishEvent(ctx, d, string(types.EventIrregularScheduleMissed), id, scheduleID, date, fmt.Sprintf("missed inclusion schedule for %s on %s", id, date), alertDetail); err != nil { d.Logger.Warn("failed to publish irregular schedule missed event", "error", err, "pipeline", id, "date", date) } diff --git a/internal/lambda/watchdog_postrun.go b/internal/lambda/watchdog_postrun.go index 677ac10..2c460a7 100644 --- a/internal/lambda/watchdog_postrun.go +++ b/internal/lambda/watchdog_postrun.go @@ -26,7 +26,7 @@ func detectMissingPostRunSensors(ctx context.Context, d *Deps) error { return fmt.Errorf("load configs: %w", err) } - now := d.now() + now := d.Now() today := now.Format("2006-01-02") for id, cfg := range configs { @@ -39,7 +39,7 @@ func detectMissingPostRunSensors(ctx context.Context, d *Deps) error { continue } - scheduleID := resolveScheduleID(cfg) + scheduleID := ResolveScheduleID(cfg) // Only check pipelines with a COMPLETED trigger for today. 
tr, err := d.Store.GetTrigger(ctx, id, scheduleID, today) @@ -120,7 +120,7 @@ func detectMissingPostRunSensors(ctx context.Context, d *Deps) error { "ruleKeys": strings.Join(ruleKeys, ", "), "actionHint": "post-run sensor data has not arrived within the expected timeout", } - if err := publishEvent(ctx, d, string(types.EventPostRunSensorMissing), id, scheduleID, today, + if err := PublishEvent(ctx, d, string(types.EventPostRunSensorMissing), id, scheduleID, today, fmt.Sprintf("post-run sensor missing for %s on %s", id, today), alertDetail); err != nil { d.Logger.Warn("failed to publish post-run sensor missing event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) } @@ -231,7 +231,7 @@ func detectRelativeSLABreaches(ctx context.Context, d *Deps) error { return fmt.Errorf("load configs: %w", err) } - now := d.now() + now := d.Now() datesToCheck := []string{ now.Format("2006-01-02"), now.AddDate(0, 0, -1).Format("2006-01-02"), @@ -254,7 +254,7 @@ func detectRelativeSLABreaches(ctx context.Context, d *Deps) error { continue } - scheduleID := resolveScheduleID(cfg) + scheduleID := ResolveScheduleID(cfg) for _, checkDate := range datesToCheck { checkRelativeSLAForDate(ctx, d, id, cfg, scheduleID, checkDate, maxDur, now) @@ -305,7 +305,7 @@ func checkRelativeSLAForDate(ctx context.Context, d *Deps, id string, cfg *types if tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal) { return } - if isJobTerminal(ctx, d, id, scheduleID, checkDate) { + if IsJobTerminal(ctx, d, id, scheduleID, checkDate) { return } @@ -329,7 +329,7 @@ func checkRelativeSLAForDate(ctx context.Context, d *Deps, id string, cfg *types "breachAt": breachAt.UTC().Format(time.RFC3339), "actionHint": "relative SLA breached — pipeline has exceeded maxDuration since first sensor arrival", } - if err := publishEvent(ctx, d, string(types.EventRelativeSLABreach), id, scheduleID, checkDate, + if err := PublishEvent(ctx, d, 
string(types.EventRelativeSLABreach), id, scheduleID, checkDate, fmt.Sprintf("relative SLA breach for %s on %s", id, checkDate), alertDetail); err != nil { d.Logger.Warn("failed to publish relative SLA breach event", "error", err, "pipeline", id, "date", checkDate) diff --git a/internal/lambda/watchdog_sla.go b/internal/lambda/watchdog_sla.go index 7eab64e..c1394ab 100644 --- a/internal/lambda/watchdog_sla.go +++ b/internal/lambda/watchdog_sla.go @@ -24,7 +24,7 @@ func scheduleSLAAlerts(ctx context.Context, d *Deps) error { return fmt.Errorf("load configs: %w", err) } - now := d.now() + now := d.Now() for id, cfg := range configs { if cfg.SLA == nil { @@ -36,11 +36,11 @@ func scheduleSLAAlerts(ctx context.Context, d *Deps) error { continue } - if isExcluded(cfg, now) { + if IsExcluded(cfg, now) { continue } - scheduleID := resolveScheduleID(cfg) + scheduleID := ResolveScheduleID(cfg) date := resolveWatchdogSLADate(cfg, now) // Sensor-triggered daily pipelines run T+1: data for today completes @@ -64,7 +64,7 @@ func scheduleSLAAlerts(ctx context.Context, d *Deps) error { continue case tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal): continue - case isJobTerminal(ctx, d, id, scheduleID, date): + case IsJobTerminal(ctx, d, id, scheduleID, date): continue } @@ -115,7 +115,7 @@ func checkTriggerDeadlines(ctx context.Context, d *Deps) error { return fmt.Errorf("load configs: %w", err) } - now := d.now() + now := d.Now() for id, cfg := range configs { if cfg.Schedule.Trigger == nil || cfg.Schedule.Trigger.Deadline == "" { @@ -127,11 +127,11 @@ func checkTriggerDeadlines(ctx context.Context, d *Deps) error { continue } - if isExcluded(cfg, now) { + if IsExcluded(cfg, now) { continue } - scheduleID := resolveScheduleID(cfg) + scheduleID := ResolveScheduleID(cfg) triggerDate := resolveTriggerDeadlineDate(cfg, now) triggerRec, err := d.Store.GetTrigger(ctx, id, scheduleID, triggerDate) @@ -143,7 +143,7 @@ func 
checkTriggerDeadlines(ctx context.Context, d *Deps) error { continue } - if isJobTerminal(ctx, d, id, scheduleID, triggerDate) { + if IsJobTerminal(ctx, d, id, scheduleID, triggerDate) { continue } @@ -189,7 +189,7 @@ func resolveTriggerDeadlineDate(cfg *types.PipelineConfig, now time.Time) string // Unlike handleSLACalculate, this does NOT roll forward when the time is past. // Returns zero time on parse errors. func resolveTriggerDeadlineTime(deadline, date, timezone string) time.Time { - loc := resolveTimezone(timezone) + loc := ResolveTimezone(timezone) if strings.HasPrefix(deadline, ":") { // Relative (hourly): ":MM" — deadline is in the NEXT hour after the @@ -267,7 +267,7 @@ func closeSensorTriggerWindow(ctx context.Context, d *Deps, pipelineID, schedule "triggerDeadline": cfg.Schedule.Trigger.Deadline, "actionHint": "auto-trigger window closed — use RERUN_REQUEST to restart", } - if err := publishEvent(ctx, d, string(types.EventSensorDeadlineExpired), pipelineID, scheduleID, date, + if err := PublishEvent(ctx, d, string(types.EventSensorDeadlineExpired), pipelineID, scheduleID, date, fmt.Sprintf("trigger deadline expired for %s/%s/%s", pipelineID, scheduleID, date), alertDetail); err != nil { d.Logger.Warn("failed to publish sensor deadline expired event", "error", err, "pipeline", pipelineID) } diff --git a/internal/lambda/watchdog_stale.go b/internal/lambda/watchdog_stale.go index cebfb57..7ee3138 100644 --- a/internal/lambda/watchdog_stale.go +++ b/internal/lambda/watchdog_stale.go @@ -6,8 +6,8 @@ import ( "strings" "time" - "github.com/dwsmith1983/interlock/internal/validation" "github.com/dwsmith1983/interlock/pkg/types" + "github.com/dwsmith1983/interlock/pkg/validation" ) // detectStaleTriggers scans for TRIGGER# rows with status=RUNNING and @@ -19,7 +19,7 @@ func detectStaleTriggers(ctx context.Context, d *Deps) error { return fmt.Errorf("scan running triggers: %w", err) } - now := d.now() + now := d.Now() for _, tr := range triggers { if 
!isStaleTrigger(tr, now) { continue @@ -44,7 +44,7 @@ func detectStaleTriggers(ctx context.Context, d *Deps) error { if tr.TTL > 0 { alertDetail["ttlExpired"] = time.Unix(tr.TTL, 0).UTC().Format(time.RFC3339) } - if err := publishEvent(ctx, d, string(types.EventSFNTimeout), pipelineID, schedule, date, + if err := PublishEvent(ctx, d, string(types.EventSFNTimeout), pipelineID, schedule, date, fmt.Sprintf("step function timed out for %s/%s/%s", pipelineID, schedule, date), alertDetail); err != nil { d.Logger.Warn("failed to publish SFN timeout event", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) } @@ -109,7 +109,7 @@ func reconcileSensorTriggers(ctx context.Context, d *Deps) error { return fmt.Errorf("load configs: %w", err) } - now := d.now() + now := d.Now() for id, cfg := range configs { trigger := cfg.Schedule.Trigger @@ -122,7 +122,7 @@ func reconcileSensorTriggers(ctx context.Context, d *Deps) error { continue } - if isExcluded(cfg, now) { + if IsExcluded(cfg, now) { continue } @@ -133,7 +133,7 @@ func reconcileSensorTriggers(ctx context.Context, d *Deps) error { continue } - scheduleID := resolveScheduleID(cfg) + scheduleID := ResolveScheduleID(cfg) for sensorKey, sensorData := range sensors { if !strings.HasPrefix(sensorKey, trigger.Key) { @@ -166,7 +166,7 @@ func reconcileSensorTriggers(ctx context.Context, d *Deps) error { // Guard against re-triggering completed pipelines whose trigger // record was deleted by DynamoDB TTL. Check the joblog for a // terminal event before acquiring a new lock. 
- if isJobTerminal(ctx, d, id, scheduleID, date) { + if IsJobTerminal(ctx, d, id, scheduleID, date) { continue } @@ -180,7 +180,7 @@ func reconcileSensorTriggers(ctx context.Context, d *Deps) error { continue } - if err := startSFN(ctx, d, cfg, id, scheduleID, date); err != nil { + if err := StartSFN(ctx, d, cfg, id, scheduleID, date); err != nil { if relErr := d.Store.ReleaseTriggerLock(ctx, id, scheduleID, date); relErr != nil { d.Logger.Warn("failed to release lock after SFN start failure during reconciliation", "error", relErr) } @@ -193,7 +193,7 @@ func reconcileSensorTriggers(ctx context.Context, d *Deps) error { "source": "reconciliation", "actionHint": "watchdog recovered missed sensor trigger", } - if err := publishEvent(ctx, d, string(types.EventTriggerRecovered), id, scheduleID, date, + if err := PublishEvent(ctx, d, string(types.EventTriggerRecovered), id, scheduleID, date, fmt.Sprintf("trigger recovered for %s/%s/%s", id, scheduleID, date), alertDetail); err != nil { d.Logger.Warn("failed to publish trigger recovered event", "error", err, "pipeline", id, "schedule", scheduleID, "date", date) } diff --git a/internal/lambda/watchdog_test.go b/internal/lambda/watchdog_test.go index bc77b82..d687b13 100644 --- a/internal/lambda/watchdog_test.go +++ b/internal/lambda/watchdog_test.go @@ -3244,7 +3244,7 @@ func TestResolveTriggerDeadlineTime_UsesScheduleTimezone(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := lambda.ResolveTriggerDeadlineTime(tt.deadline, tt.date, tt.timezone) + got := lambda.ExportedResolveTriggerDeadlineTime(tt.deadline, tt.date, tt.timezone) require.False(t, got.IsZero(), "expected non-zero time") assert.Equal(t, tt.wantHour, got.Hour(), "hour mismatch") assert.Equal(t, tt.wantMin, got.Minute(), "minute mismatch") diff --git a/internal/store/configcache.go b/internal/store/configcache.go index 573bc39..10d228c 100644 --- a/internal/store/configcache.go +++ b/internal/store/configcache.go @@ -2,7 
+2,6 @@ package store import ( "context" - "encoding/json" "sync" "time" @@ -80,20 +79,7 @@ func (c *ConfigCache) refresh(ctx context.Context) (map[string]*types.PipelineCo func copyConfigs(src map[string]*types.PipelineConfig) map[string]*types.PipelineConfig { dst := make(map[string]*types.PipelineConfig, len(src)) for k, v := range src { - data, err := json.Marshal(v) - if err != nil { - // Marshal of a known struct should never fail; shallow-copy as fallback. - cp := *v - dst[k] = &cp - continue - } - var cp types.PipelineConfig - if err := json.Unmarshal(data, &cp); err != nil { - shallow := *v - dst[k] = &shallow - continue - } - dst[k] = &cp + dst[k] = v.DeepCopy() } return dst } diff --git a/pkg/sla/deadline.go b/pkg/sla/deadline.go new file mode 100644 index 0000000..346b246 --- /dev/null +++ b/pkg/sla/deadline.go @@ -0,0 +1,128 @@ +// Package sla provides pure time calculation functions for SLA deadline computation. +package sla + +import ( + "fmt" + "strconv" + "strings" + "time" +) + +// CalculateAbsoluteDeadline computes breach and warning times from a deadline string. +// +// Deadline formats: +// - "HH:MM" — absolute time of day (daily pipelines) +// - ":MM" — minutes past the processing hour (hourly pipelines) +// +// Date formats: +// - "2006-01-02" — daily +// - "2006-01-02T15" — hourly (hour encoded in date) +// +// The warning time is breachAt minus expectedDuration. +func CalculateAbsoluteDeadline(date, deadline, expectedDuration, timezone string, now time.Time) (breach, warning time.Time, err error) { + dur, err := time.ParseDuration(expectedDuration) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("parse expectedDuration %q: %w", expectedDuration, err) + } + + loc := time.UTC + if timezone != "" { + loc, err = time.LoadLocation(timezone) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("load timezone %q: %w", timezone, err) + } + } + + now = now.In(loc) + + // Parse the execution date. 
+ baseDate := now + baseHour := -1 + if date != "" { + datePart, hourPart := parseExecutionDate(date) + parsed, parseErr := time.Parse("2006-01-02", datePart) + if parseErr == nil { + if hourPart != "" { + h := 0 + if hVal, atoiErr := strconv.Atoi(hourPart); atoiErr == nil { + h = hVal + baseHour = h + } + baseDate = time.Date(parsed.Year(), parsed.Month(), parsed.Day(), + h, 0, 0, 0, loc) + } else { + baseDate = time.Date(parsed.Year(), parsed.Month(), parsed.Day(), + now.Hour(), now.Minute(), 0, 0, loc) + } + } + } + + // Parse deadline. + if strings.HasPrefix(deadline, ":") { + dl, parseErr := time.Parse("04", strings.TrimPrefix(deadline, ":")) + if parseErr != nil { + return time.Time{}, time.Time{}, fmt.Errorf("parse deadline %q: %w", deadline, parseErr) + } + hour := baseDate.Hour() + breach = time.Date(baseDate.Year(), baseDate.Month(), baseDate.Day(), + hour, dl.Minute(), 0, 0, loc) + if baseHour >= 0 { + breach = breach.Add(time.Hour) + } else if breach.Before(now) { + breach = breach.Add(time.Hour) + } + } else { + dl, parseErr := time.Parse("15:04", deadline) + if parseErr != nil { + return time.Time{}, time.Time{}, fmt.Errorf("parse deadline %q: %w", deadline, parseErr) + } + breach = time.Date(baseDate.Year(), baseDate.Month(), baseDate.Day(), + dl.Hour(), dl.Minute(), 0, 0, loc) + if breach.Before(now) { + breach = breach.Add(24 * time.Hour) + } + } + + warning = breach.Add(-dur) + return breach, warning, nil +} + +// CalculateRelativeDeadline computes breach and warning times from sensor arrival +// plus maxDuration. Warning offset uses expectedDuration if provided, otherwise +// defaults to 25% of maxDuration. 
+func CalculateRelativeDeadline(arrivalAt, maxDuration, expectedDuration string) (breach, warning time.Time, err error) { + maxDur, err := time.ParseDuration(maxDuration) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("parse maxDuration %q: %w", maxDuration, err) + } + + arrival, err := time.Parse(time.RFC3339, arrivalAt) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("parse arrivalAt %q: %w", arrivalAt, err) + } + + breach = arrival.Add(maxDur) + + var warningOffset time.Duration + if expectedDuration != "" { + warningOffset, err = time.ParseDuration(expectedDuration) + if err != nil { + return time.Time{}, time.Time{}, fmt.Errorf("parse expectedDuration %q: %w", expectedDuration, err) + } + } else { + warningOffset = maxDur / 4 + } + warning = breach.Add(-warningOffset) + + return breach, warning, nil +} + +// parseExecutionDate splits a composite date into date and hour parts. +// "2026-03-03T10" -> ("2026-03-03", "10") +// "2026-03-03" -> ("2026-03-03", "") +func parseExecutionDate(date string) (datePart, hourPart string) { + if idx := strings.Index(date, "T"); idx >= 0 { + return date[:idx], date[idx+1:] + } + return date, "" +} diff --git a/pkg/sla/deadline_test.go b/pkg/sla/deadline_test.go new file mode 100644 index 0000000..a7de3ad --- /dev/null +++ b/pkg/sla/deadline_test.go @@ -0,0 +1,177 @@ +package sla_test + +import ( + "testing" + "time" + + "github.com/dwsmith1983/interlock/pkg/sla" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCalculateAbsoluteDeadline(t *testing.T) { + tests := []struct { + name string + date string + deadline string + expectedDuration string + timezone string + now time.Time + wantBreach time.Time + wantWarning time.Time + wantErr bool + }{ + { + name: "daily pipeline UTC", + date: "2026-03-28", + deadline: "08:00", + expectedDuration: "30m", + timezone: "UTC", + now: time.Date(2026, 3, 28, 6, 0, 0, 0, time.UTC), + wantBreach: time.Date(2026, 3, 28, 
8, 0, 0, 0, time.UTC), + wantWarning: time.Date(2026, 3, 28, 7, 30, 0, 0, time.UTC), + }, + { + name: "hourly pipeline with offset deadline", + date: "2026-03-28T10", + deadline: ":30", + expectedDuration: "15m", + timezone: "UTC", + now: time.Date(2026, 3, 28, 10, 0, 0, 0, time.UTC), + wantBreach: time.Date(2026, 3, 28, 11, 30, 0, 0, time.UTC), + wantWarning: time.Date(2026, 3, 28, 11, 15, 0, 0, time.UTC), + }, + { + name: "daily pipeline America/New_York", + date: "2026-03-28", + deadline: "09:00", + expectedDuration: "1h", + timezone: "America/New_York", + now: time.Date(2026, 3, 28, 6, 0, 0, 0, time.UTC), + wantBreach: func() time.Time { + loc, _ := time.LoadLocation("America/New_York") + return time.Date(2026, 3, 28, 9, 0, 0, 0, loc) + }(), + wantWarning: func() time.Time { + loc, _ := time.LoadLocation("America/New_York") + return time.Date(2026, 3, 28, 8, 0, 0, 0, loc) + }(), + }, + { + name: "invalid deadline format", + date: "2026-03-28", + deadline: "8pm", + expectedDuration: "30m", + timezone: "UTC", + now: time.Date(2026, 3, 28, 6, 0, 0, 0, time.UTC), + wantErr: true, + }, + { + name: "invalid timezone", + date: "2026-03-28", + deadline: "08:00", + expectedDuration: "30m", + timezone: "Mars/Olympus_Mons", + now: time.Date(2026, 3, 28, 6, 0, 0, 0, time.UTC), + wantErr: true, + }, + { + name: "empty date uses now for base", + date: "", + deadline: "08:00", + expectedDuration: "30m", + timezone: "UTC", + now: time.Date(2026, 3, 28, 6, 0, 0, 0, time.UTC), + wantBreach: time.Date(2026, 3, 28, 8, 0, 0, 0, time.UTC), + wantWarning: time.Date(2026, 3, 28, 7, 30, 0, 0, time.UTC), + }, + { + name: "invalid expectedDuration", + date: "2026-03-28", + deadline: "08:00", + expectedDuration: "not-a-duration", + timezone: "UTC", + now: time.Date(2026, 3, 28, 6, 0, 0, 0, time.UTC), + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + breach, warning, err := sla.CalculateAbsoluteDeadline( + tt.date, tt.deadline, 
tt.expectedDuration, tt.timezone, tt.now, + ) + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + assert.True(t, tt.wantBreach.Equal(breach), + "breach: want %v, got %v", tt.wantBreach, breach) + assert.True(t, tt.wantWarning.Equal(warning), + "warning: want %v, got %v", tt.wantWarning, warning) + }) + } +} + +func TestCalculateRelativeDeadline(t *testing.T) { + baseArrival := time.Date(2026, 3, 28, 10, 0, 0, 0, time.UTC) + + tests := []struct { + name string + arrivalAt string + maxDuration string + expectedDuration string + wantBreach time.Time + wantWarning time.Time + wantErr bool + }{ + { + name: "with explicit expectedDuration", + arrivalAt: baseArrival.Format(time.RFC3339), + maxDuration: "2h", + expectedDuration: "30m", + wantBreach: baseArrival.Add(2 * time.Hour), // 12:00 + wantWarning: baseArrival.Add(2*time.Hour - 30*time.Minute), // 11:30 + }, + { + name: "without expectedDuration uses 25% of maxDuration", + arrivalAt: baseArrival.Format(time.RFC3339), + maxDuration: "2h", + expectedDuration: "", + wantBreach: baseArrival.Add(2 * time.Hour), // 12:00 + wantWarning: baseArrival.Add(2*time.Hour - 30*time.Minute), // 12:00 - 25% of 2h = 11:30 + }, + { + name: "invalid arrivalAt", + arrivalAt: "not-a-timestamp", + maxDuration: "2h", + expectedDuration: "", + wantErr: true, + }, + { + name: "invalid maxDuration", + arrivalAt: baseArrival.Format(time.RFC3339), + maxDuration: "bogus", + expectedDuration: "", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + breach, warning, err := sla.CalculateRelativeDeadline( + tt.arrivalAt, tt.maxDuration, tt.expectedDuration, + ) + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + assert.True(t, tt.wantBreach.Equal(breach), + "breach: want %v, got %v", tt.wantBreach, breach) + assert.True(t, tt.wantWarning.Equal(warning), + "warning: want %v, got %v", tt.wantWarning, warning) + }) + } +} diff --git 
a/pkg/types/events.go b/pkg/types/events.go index f712ea9..513e955 100644 --- a/pkg/types/events.go +++ b/pkg/types/events.go @@ -46,6 +46,7 @@ const ( EventDryRunRerunRejected EventDetailType = "DRY_RUN_RERUN_REJECTED" EventDryRunWouldRetry EventDetailType = "DRY_RUN_WOULD_RETRY" EventDryRunRetryExhausted EventDetailType = "DRY_RUN_RETRY_EXHAUSTED" + EventWatchdogDegraded EventDetailType = "WATCHDOG_DEGRADED" ) // EventSource is the EventBridge source for all interlock events. diff --git a/pkg/types/events_test.go b/pkg/types/events_test.go index 56cd418..ffb3bfa 100644 --- a/pkg/types/events_test.go +++ b/pkg/types/events_test.go @@ -29,6 +29,7 @@ func TestEventDetailTypeConstants(t *testing.T) { {types.EventInfraFailure, "INFRA_FAILURE"}, {types.EventLateDataArrival, "LATE_DATA_ARRIVAL"}, {types.EventRerunRejected, "RERUN_REJECTED"}, + {types.EventWatchdogDegraded, "WATCHDOG_DEGRADED"}, } for _, tt := range tests { diff --git a/pkg/types/pipeline.go b/pkg/types/pipeline.go index 97f01f4..29fbb60 100644 --- a/pkg/types/pipeline.go +++ b/pkg/types/pipeline.go @@ -1,6 +1,11 @@ // Package types defines the public domain types for the Interlock STAMP-based safety framework. package types +import ( + "encoding/json" + "slices" +) + // PipelineConfig is the full configuration for a pipeline, loaded from YAML. 
type PipelineConfig struct { Pipeline PipelineIdentity `yaml:"pipeline" json:"pipeline"` @@ -122,3 +127,93 @@ type PostRunConfig struct { DriftThreshold *float64 `yaml:"driftThreshold,omitempty" json:"driftThreshold,omitempty"` // minimum absolute delta to trigger drift; default 0 (any change) DriftField string `yaml:"driftField,omitempty" json:"driftField,omitempty"` // sensor field for drift comparison; default "sensor_count" } + +func shallowCopyMap(src map[string]interface{}) map[string]interface{} { + dst := make(map[string]interface{}, len(src)) + for k, v := range src { + dst[k] = v + } + return dst +} + +// DeepCopy returns a deep copy of the PipelineConfig. Struct-typed fields, +// pointer fields, slices, and maps are copied so mutations to the copy do not +// affect the original. The interface{}-typed Value fields on ValidationRule and +// TriggerCondition are shallow-copied; in practice these hold YAML-decoded +// primitives (string, float64, bool). +func (c *PipelineConfig) DeepCopy() *PipelineConfig { + cp := *c + + // Schedule pointer fields + if c.Schedule.Trigger != nil { + t := *c.Schedule.Trigger + cp.Schedule.Trigger = &t + } + if c.Schedule.Exclude != nil { + ex := *c.Schedule.Exclude + ex.Dates = slices.Clone(c.Schedule.Exclude.Dates) + cp.Schedule.Exclude = &ex + } + if c.Schedule.Include != nil { + inc := *c.Schedule.Include + inc.Dates = slices.Clone(c.Schedule.Include.Dates) + cp.Schedule.Include = &inc + } + + // SLA + if c.SLA != nil { + s := *c.SLA + cp.SLA = &s + } + + // Validation rules slice + cp.Validation.Rules = slices.Clone(c.Validation.Rules) + + // Job pointer fields + if c.Job.JobPollWindowSeconds != nil { + v := *c.Job.JobPollWindowSeconds + cp.Job.JobPollWindowSeconds = &v + } + if c.Job.MaxDriftReruns != nil { + v := *c.Job.MaxDriftReruns + cp.Job.MaxDriftReruns = &v + } + if c.Job.MaxManualReruns != nil { + v := *c.Job.MaxManualReruns + cp.Job.MaxManualReruns = &v + } + if c.Job.MaxCodeRetries != nil { + v := 
*c.Job.MaxCodeRetries + cp.Job.MaxCodeRetries = &v + } + + // Job.Config map — JSON roundtrip for map[string]interface{} with shallow-copy fallback. + if c.Job.Config != nil { + data, err := json.Marshal(c.Job.Config) + if err == nil { + cp.Job.Config = make(map[string]interface{}, len(c.Job.Config)) + if err := json.Unmarshal(data, &cp.Job.Config); err != nil { + cp.Job.Config = shallowCopyMap(c.Job.Config) + } + } else { + cp.Job.Config = shallowCopyMap(c.Job.Config) + } + } + + // PostRun + if c.PostRun != nil { + pr := *c.PostRun + pr.Rules = slices.Clone(c.PostRun.Rules) + if c.PostRun.Evaluation != nil { + ev := *c.PostRun.Evaluation + pr.Evaluation = &ev + } + if c.PostRun.DriftThreshold != nil { + v := *c.PostRun.DriftThreshold + pr.DriftThreshold = &v + } + cp.PostRun = &pr + } + + return &cp +} diff --git a/pkg/types/pipeline_test.go b/pkg/types/pipeline_test.go index e4b9c86..92ba240 100644 --- a/pkg/types/pipeline_test.go +++ b/pkg/types/pipeline_test.go @@ -260,3 +260,163 @@ func TestPipelineConfig_CalendarSchedule(t *testing.T) { assert.Nil(t, cfg.PostRun) } + +func TestPipelineConfig_DeepCopy(t *testing.T) { + t.Run("fully populated config", func(t *testing.T) { + var cfg types.PipelineConfig + require.NoError(t, yaml.Unmarshal([]byte(samplePipelineYAML), &cfg)) + + cp := cfg.DeepCopy() + + // The copy must equal the original in every field. 
+ assert.Equal(t, cfg.Pipeline, cp.Pipeline) + assert.Equal(t, cfg.Schedule.Cron, cp.Schedule.Cron) + assert.Equal(t, cfg.Schedule.Timezone, cp.Schedule.Timezone) + assert.Equal(t, cfg.Schedule.Calendar, cp.Schedule.Calendar) + assert.Equal(t, cfg.Schedule.Time, cp.Schedule.Time) + assert.Equal(t, cfg.Schedule.Evaluation, cp.Schedule.Evaluation) + + require.NotNil(t, cp.Schedule.Exclude) + assert.Equal(t, cfg.Schedule.Exclude.Weekends, cp.Schedule.Exclude.Weekends) + assert.Equal(t, cfg.Schedule.Exclude.Holidays, cp.Schedule.Exclude.Holidays) + assert.Equal(t, cfg.Schedule.Exclude.Dates, cp.Schedule.Exclude.Dates) + + require.NotNil(t, cp.SLA) + assert.Equal(t, cfg.SLA.Deadline, cp.SLA.Deadline) + assert.Equal(t, cfg.SLA.ExpectedDuration, cp.SLA.ExpectedDuration) + assert.Equal(t, cfg.SLA.Critical, cp.SLA.Critical) + + assert.Equal(t, cfg.Validation.Trigger, cp.Validation.Trigger) + require.Len(t, cp.Validation.Rules, len(cfg.Validation.Rules)) + for i := range cfg.Validation.Rules { + assert.Equal(t, cfg.Validation.Rules[i], cp.Validation.Rules[i]) + } + + assert.Equal(t, cfg.Job.Type, cp.Job.Type) + assert.Equal(t, cfg.Job.MaxRetries, cp.Job.MaxRetries) + assert.Equal(t, cfg.Job.Config["jobName"], cp.Job.Config["jobName"]) + + require.NotNil(t, cp.PostRun) + require.Len(t, cp.PostRun.Rules, len(cfg.PostRun.Rules)) + assert.Equal(t, cfg.PostRun.Rules[0], cp.PostRun.Rules[0]) + + assert.Equal(t, cfg.DryRun, cp.DryRun) + + // --- Mutation: changing the copy must NOT affect the original --- + + // Mutate a validation rule in the copy. + cp.Validation.Rules[0].Key = "SENSOR#mutated" + assert.Equal(t, "SENSOR#upstream-complete", cfg.Validation.Rules[0].Key, + "mutating copy's validation rule must not affect original") + + // Mutate Job.Config map in the copy. + cp.Job.Config["jobName"] = "mutated-job" + assert.Equal(t, "gold-revenue-etl", cfg.Job.Config["jobName"], + "mutating copy's job config must not affect original") + + // Mutate PostRun rules in the copy. 
+ cp.PostRun.Rules[0].Key = "SENSOR#mutated-postrun" + assert.Equal(t, "SENSOR#output-count", cfg.PostRun.Rules[0].Key, + "mutating copy's postRun rule must not affect original") + + // Mutate SLA in the copy. + cp.SLA.Deadline = "23:59" + assert.Equal(t, "08:00", cfg.SLA.Deadline, + "mutating copy's SLA must not affect original") + }) + + t.Run("stream-triggered with trigger pointer", func(t *testing.T) { + var cfg types.PipelineConfig + require.NoError(t, yaml.Unmarshal([]byte(streamTriggeredYAML), &cfg)) + + cp := cfg.DeepCopy() + + require.NotNil(t, cp.Schedule.Trigger) + assert.Equal(t, cfg.Schedule.Trigger.Key, cp.Schedule.Trigger.Key) + + // Mutate the trigger key in the copy. + cp.Schedule.Trigger.Key = "SENSOR#mutated-trigger" + assert.Equal(t, "SENSOR#raw-orders-landed", cfg.Schedule.Trigger.Key, + "mutating copy's trigger must not affect original") + }) + + t.Run("calendar with exclusion dates", func(t *testing.T) { + var cfg types.PipelineConfig + require.NoError(t, yaml.Unmarshal([]byte(calendarScheduleYAML), &cfg)) + + cp := cfg.DeepCopy() + + require.NotNil(t, cp.Schedule.Exclude) + require.Len(t, cp.Schedule.Exclude.Dates, 2) + + // Mutate exclusion dates in the copy. 
+ cp.Schedule.Exclude.Dates[0] = "2099-12-31" + assert.Equal(t, "2026-01-01", cfg.Schedule.Exclude.Dates[0], + "mutating copy's exclusion dates must not affect original") + }) + + t.Run("pointer field isolation (*int and *float64)", func(t *testing.T) { + driftReruns := 3 + manualReruns := 2 + codeRetries := 1 + pollWindow := 300 + driftThreshold := 0.05 + + cfg := types.PipelineConfig{ + Pipeline: types.PipelineIdentity{ID: "ptr-test"}, + Schedule: types.ScheduleConfig{ + Evaluation: types.EvaluationWindow{Window: "1h", Interval: "5m"}, + }, + Validation: types.ValidationConfig{Trigger: "ALL"}, + Job: types.JobConfig{ + Type: "glue", + MaxDriftReruns: &driftReruns, + MaxManualReruns: &manualReruns, + MaxCodeRetries: &codeRetries, + JobPollWindowSeconds: &pollWindow, + }, + PostRun: &types.PostRunConfig{ + DriftThreshold: &driftThreshold, + Rules: []types.ValidationRule{{Key: "s1", Check: "exists"}}, + }, + } + + cp := cfg.DeepCopy() + + // Mutate pointer values via the copy. + *cp.Job.MaxDriftReruns = 99 + *cp.Job.MaxManualReruns = 99 + *cp.Job.MaxCodeRetries = 99 + *cp.Job.JobPollWindowSeconds = 99 + *cp.PostRun.DriftThreshold = 99.9 + + // Originals must be unaffected. + assert.Equal(t, 3, *cfg.Job.MaxDriftReruns, "MaxDriftReruns shared") + assert.Equal(t, 2, *cfg.Job.MaxManualReruns, "MaxManualReruns shared") + assert.Equal(t, 1, *cfg.Job.MaxCodeRetries, "MaxCodeRetries shared") + assert.Equal(t, 300, *cfg.Job.JobPollWindowSeconds, "JobPollWindowSeconds shared") + assert.Equal(t, 0.05, *cfg.PostRun.DriftThreshold, "DriftThreshold shared") + }) + + t.Run("nil optional fields", func(t *testing.T) { + var cfg types.PipelineConfig + require.NoError(t, yaml.Unmarshal([]byte(minimalPipelineYAML), &cfg)) + + // Precondition: these fields are nil on the minimal config. 
+ require.Nil(t, cfg.SLA) + require.Nil(t, cfg.PostRun) + require.Nil(t, cfg.Schedule.Trigger) + require.Nil(t, cfg.Schedule.Exclude) + + cp := cfg.DeepCopy() + + assert.Nil(t, cp.SLA) + assert.Nil(t, cp.PostRun) + assert.Nil(t, cp.Schedule.Trigger) + assert.Nil(t, cp.Schedule.Exclude) + + // Verify non-nil fields still copied correctly. + assert.Equal(t, "bronze-ingest", cp.Pipeline.ID) + assert.Len(t, cp.Validation.Rules, 1) + }) +} diff --git a/internal/validation/config.go b/pkg/validation/config.go similarity index 100% rename from internal/validation/config.go rename to pkg/validation/config.go diff --git a/internal/validation/config_test.go b/pkg/validation/config_test.go similarity index 100% rename from internal/validation/config_test.go rename to pkg/validation/config_test.go diff --git a/internal/validation/engine.go b/pkg/validation/engine.go similarity index 100% rename from internal/validation/engine.go rename to pkg/validation/engine.go diff --git a/internal/validation/engine_test.go b/pkg/validation/engine_test.go similarity index 100% rename from internal/validation/engine_test.go rename to pkg/validation/engine_test.go