diff --git a/CHANGELOG.md b/CHANGELOG.md index 26008b7..f796f43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,40 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.9.3] - 2026-03-13 + +### Changed + +- **Shared HTTP client construction (DRY-2)** — Extracted `resolveHTTPClient()` replacing identical 7-line blocks in `ExecuteHTTP` and `ExecuteAirflow`. +- **Shared SLA schedule creation loop (DRY-3)** — Extracted `createSLASchedules()` replacing duplicated warning/breach schedule loops in watchdog and sla-monitor. +- **Split watchdog.go into focused files** — 1079-line monolith split into 5 files by domain: stale triggers, missed schedules, SLA alerting, and post-run monitoring (~200 lines each). + +### Security + +- **Command trigger shell injection eliminated (SEC-3)** — Replaced `sh -c` with direct `exec.CommandContext` + `strings.Fields` argument splitting. No shell interpretation of pipes, redirects, or variable expansion. + +## [0.9.2] - 2026-03-13 + +### Fixed + +- **Drift detection silently skipped zero values (BUG-1)** — `ExtractFloat` returned 0 for both missing keys and actual zero values, causing the `prevCount > 0` guard to silently skip legitimate transitions like 5000→0 or 0→5000. New `ExtractFloatOk` distinguishes absent from zero. Shared `DetectDrift` function consolidates 3 duplicated drift comparison sites. +- **RemapPerPeriodSensors map mutation during range (BUG-2)** — Adding keys during `range` iteration over a Go map is nondeterministic per the spec. Staging map now collects additions, merged after iteration. +- **Orphaned rerun burns retry budget (BUG-3)** — `handleRerunRequest` wrote the rerun record before acquiring the trigger lock. If lock acquisition failed, the rerun record was left orphaned and permanently consumed retry budget. 
The fix reorders the flow to lock-first, then write. +- **Stream router discarded partial batch failures (BUG-4)** — `HandleStreamEvent` returned a single error, causing Lambda to retry the entire batch. Now returns `DynamoDBEventResponse` with per-record `BatchItemFailures` for partial retry via `ReportBatchItemFailures`. +- **SLA_MET published when pipeline never ran (BUG-5)** — `handleSLACancel` published SLA_MET regardless of whether a trigger existed. Now checks for trigger existence first. +- **Trigger deadline used SLA timezone instead of schedule timezone (BUG-6)** — `closeSensorTriggerWindow` read timezone from `cfg.SLA.Timezone` instead of `cfg.Schedule.Timezone`. Falls back to SLA timezone if schedule timezone is not set. +- **Validation mode case-sensitive (BUG-8)** — `EvaluateRules` matched mode with `switch mode` so "any" fell through to the default ALL branch. Now uses `strings.ToUpper(mode)`. +- **Epoch timestamp unit mismatch in rerun freshness (BUG-9)** — `checkSensorFreshness` compared raw epoch values without normalizing units. Timestamps below 1e12 (seconds) are now converted to milliseconds. +- **Post-run baseline field collision (BUG-10)** — Baseline was stored as a flat map, so two rules with the same field name overwrote each other. Now namespaced by rule key. Clean break: existing flat baselines self-heal on next pipeline completion. +- **publishEvent errors silently discarded in SLA reconcile (CQ-5)** — Replaced `_ = publishEvent(...)` with error-logged calls. + +### Security + +- **lambda_trigger_arns default changed to [] with precondition (SEC-1)** — Wildcard default removed; explicit ARN list required when triggers are enabled. +- **Slack plaintext token deprecation warning (SEC-2)** — Terraform `check` block warns at plan time when plaintext token is used without Secrets Manager. 
+- **Trigger IAM policy scoping (SEC-4)** — New variables `glue_job_arns`, `emr_cluster_arns`, `emr_serverless_app_arns`, `sfn_trigger_arns` (all default `[]`) with preconditions requiring non-empty values when the corresponding trigger is enabled. +- **EventBridge bus resource policy (SEC-5)** — Restricts PutEvents to Lambda execution roles only. + ## [0.9.1] - 2026-03-13 ### Added diff --git a/cmd/lambda/stream-router/main.go b/cmd/lambda/stream-router/main.go index 1354881..60c2430 100644 --- a/cmd/lambda/stream-router/main.go +++ b/cmd/lambda/stream-router/main.go @@ -10,6 +10,7 @@ import ( "os" "time" + "github.com/aws/aws-lambda-go/events" "github.com/aws/aws-lambda-go/lambda" awsconfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/service/dynamodb" @@ -53,7 +54,7 @@ func main() { Logger: logger, } - lambda.Start(func(ctx context.Context, event ilambda.StreamEvent) error { + lambda.Start(func(ctx context.Context, event ilambda.StreamEvent) (events.DynamoDBEventResponse, error) { return ilambda.HandleStreamEvent(ctx, deps, event) }) } diff --git a/deploy/terraform/eventbridge.tf b/deploy/terraform/eventbridge.tf index c55a45a..5fb8e79 100644 --- a/deploy/terraform/eventbridge.tf +++ b/deploy/terraform/eventbridge.tf @@ -3,6 +3,25 @@ resource "aws_cloudwatch_event_bus" "interlock" { tags = var.tags } +resource "aws_cloudwatch_event_bus_policy" "interlock_bus" { + event_bus_name = aws_cloudwatch_event_bus.interlock.name + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "AllowInterlockLambdas" + Effect = "Allow" + Principal = { + AWS = [for name in local.lambda_names : aws_iam_role.lambda[name].arn] + } + Action = "events:PutEvents" + Resource = aws_cloudwatch_event_bus.interlock.arn + } + ] + }) +} + # Watchdog schedule resource "aws_cloudwatch_event_rule" "watchdog" { name = "${var.environment}-interlock-watchdog" @@ -100,6 +119,10 @@ resource "aws_cloudwatch_event_rule" "alert_events" { 
"DRY_RUN_SLA_PROJECTION", "DRY_RUN_DRIFT", "DRY_RUN_COMPLETED", + "DRY_RUN_WOULD_RERUN", + "DRY_RUN_RERUN_REJECTED", + "DRY_RUN_WOULD_RETRY", + "DRY_RUN_RETRY_EXHAUSTED", ] }) } diff --git a/deploy/terraform/lambda.tf b/deploy/terraform/lambda.tf index 0d636c1..d3cf012 100644 --- a/deploy/terraform/lambda.tf +++ b/deploy/terraform/lambda.tf @@ -541,6 +541,17 @@ resource "aws_lambda_event_source_mapping" "joblog_stream" { } } +# ============================================================================= +# Security checks +# ============================================================================= + +check "slack_token_deprecation" { + assert { + condition = var.slack_bot_token == "" || var.slack_secret_arn != "" + error_message = "DEPRECATED: Passing a plaintext Slack bot token is deprecated. Use var.slack_secret_arn with an AWS Secrets Manager ARN instead. Plaintext path still works but will be removed in a future version." + } +} + # ============================================================================= # Conditional trigger permissions for orchestrator (opt-in per trigger type) # ============================================================================= @@ -552,13 +563,20 @@ resource "aws_iam_role_policy" "glue_trigger" { name = "glue-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.glue_job_arns) > 0 + error_message = "glue_job_arns must be non-empty when enable_glue_trigger is true." 
+ } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [ { Effect = "Allow" Action = ["glue:StartJobRun", "glue:GetJobRun"] - Resource = "*" + Resource = var.glue_job_arns }, { Sid = "GlueLogVerification" @@ -580,12 +598,19 @@ resource "aws_iam_role_policy" "emr_trigger" { name = "emr-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.emr_cluster_arns) > 0 + error_message = "emr_cluster_arns must be non-empty when enable_emr_trigger is true." + } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ Effect = "Allow" Action = ["elasticmapreduce:AddJobFlowSteps", "elasticmapreduce:DescribeStep"] - Resource = "*" + Resource = var.emr_cluster_arns }] }) } @@ -597,12 +622,19 @@ resource "aws_iam_role_policy" "emr_serverless_trigger" { name = "emr-serverless-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.emr_serverless_app_arns) > 0 + error_message = "emr_serverless_app_arns must be non-empty when enable_emr_serverless_trigger is true." + } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ Effect = "Allow" Action = ["emr-serverless:StartJobRun", "emr-serverless:GetJobRun"] - Resource = "*" + Resource = var.emr_serverless_app_arns }] }) } @@ -614,12 +646,19 @@ resource "aws_iam_role_policy" "sfn_trigger" { name = "sfn-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.sfn_trigger_arns) > 0 + error_message = "sfn_trigger_arns must be non-empty when enable_sfn_trigger is true." 
+ } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ Effect = "Allow" Action = ["states:StartExecution", "states:DescribeExecution"] - Resource = "*" + Resource = var.sfn_trigger_arns }] }) } @@ -631,6 +670,13 @@ resource "aws_iam_role_policy" "lambda_trigger" { name = "lambda-trigger" role = aws_iam_role.lambda["orchestrator"].id + lifecycle { + precondition { + condition = length(var.lambda_trigger_arns) > 0 + error_message = "lambda_trigger_arns must be non-empty when enable_lambda_trigger is true." + } + } + policy = jsonencode({ Version = "2012-10-17" Statement = [{ diff --git a/deploy/terraform/variables.tf b/deploy/terraform/variables.tf index 8853cac..eb7615a 100644 --- a/deploy/terraform/variables.tf +++ b/deploy/terraform/variables.tf @@ -139,5 +139,29 @@ variable "enable_lambda_trigger" { variable "lambda_trigger_arns" { description = "ARNs of Lambda functions the orchestrator may invoke as pipeline triggers" type = list(string) - default = ["*"] + default = [] +} + +variable "glue_job_arns" { + description = "ARNs of Glue jobs that the orchestrator Lambda can start. Required when enable_glue_trigger is true." + type = list(string) + default = [] +} + +variable "emr_cluster_arns" { + description = "ARNs of EMR clusters the orchestrator can submit steps to. Required when enable_emr_trigger is true." + type = list(string) + default = [] +} + +variable "emr_serverless_app_arns" { + description = "ARNs of EMR Serverless applications. Required when enable_emr_serverless_trigger is true." + type = list(string) + default = [] +} + +variable "sfn_trigger_arns" { + description = "ARNs of Step Functions the orchestrator can start. Required when enable_sfn_trigger is true." 
+ type = list(string) + default = [] } diff --git a/internal/lambda/drift.go b/internal/lambda/drift.go new file mode 100644 index 0000000..7088aa6 --- /dev/null +++ b/internal/lambda/drift.go @@ -0,0 +1,63 @@ +package lambda + +import ( + "math" + "strconv" +) + +// ExtractFloatOk retrieves a numeric value from a sensor data map. +// Returns (value, true) if the key exists and is numeric, (0, false) otherwise. +// Unlike ExtractFloat, this distinguishes zero values from missing keys. +func ExtractFloatOk(data map[string]interface{}, key string) (float64, bool) { + if data == nil { + return 0, false + } + v, ok := data[key] + if !ok { + return 0, false + } + switch n := v.(type) { + case float64: + return n, true + case string: + f, err := strconv.ParseFloat(n, 64) + if err != nil { + return 0, false + } + return f, true + default: + return 0, false + } +} + +// DriftResult holds the outcome of a drift comparison. +type DriftResult struct { + Drifted bool + Previous float64 + Current float64 + Delta float64 + PrevFound bool + CurrFound bool +} + +// DetectDrift compares baseline and current sensor data for a drift field. +// Both values must be present for drift to be detected. Returns whether +// the absolute delta exceeds the threshold. 
+func DetectDrift(baseline, current map[string]interface{}, driftField string, threshold float64) DriftResult { + prev, prevOk := ExtractFloatOk(baseline, driftField) + curr, currOk := ExtractFloatOk(current, driftField) + + result := DriftResult{ + Previous: prev, + Current: curr, + PrevFound: prevOk, + CurrFound: currOk, + } + + if prevOk && currOk { + result.Delta = curr - prev + result.Drifted = math.Abs(result.Delta) > threshold + } + + return result +} diff --git a/internal/lambda/drift_test.go b/internal/lambda/drift_test.go new file mode 100644 index 0000000..936234e --- /dev/null +++ b/internal/lambda/drift_test.go @@ -0,0 +1,62 @@ +package lambda + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestExtractFloatOk(t *testing.T) { + tests := []struct { + name string + data map[string]interface{} + key string + wantVal float64 + wantOk bool + }{ + {"present float", map[string]interface{}{"count": float64(42)}, "count", 42, true}, + {"present zero", map[string]interface{}{"count": float64(0)}, "count", 0, true}, + {"present string", map[string]interface{}{"count": "123.5"}, "count", 123.5, true}, + {"missing key", map[string]interface{}{}, "count", 0, false}, + {"nil map", nil, "count", 0, false}, + {"wrong type", map[string]interface{}{"count": true}, "count", 0, false}, + {"invalid string", map[string]interface{}{"count": "abc"}, "count", 0, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + val, ok := ExtractFloatOk(tt.data, tt.key) + assert.Equal(t, tt.wantOk, ok) + assert.InDelta(t, tt.wantVal, val, 0.001) + }) + } +} + +func TestDetectDrift(t *testing.T) { + m := func(k string, v float64) map[string]interface{} { + return map[string]interface{}{k: v} + } + tests := []struct { + name string + baseline map[string]interface{} + current map[string]interface{} + field string + threshold float64 + wantDrift bool + }{ + {"5000→0 drifts", m("count", 5000.0), m("count", 0.0), "count", 0, true}, + {"0→5000 
drifts", m("count", 0.0), m("count", 5000.0), "count", 0, true}, + {"same value no drift", m("count", 100.0), m("count", 100.0), "count", 0, false}, + {"within threshold", m("count", 100.0), m("count", 150.0), "count", 100, false}, + {"exceeds threshold", m("count", 100.0), m("count", 250.0), "count", 100, true}, + {"prev missing no drift", map[string]interface{}{}, m("count", 100.0), "count", 0, false}, + {"curr missing no drift", m("count", 100.0), map[string]interface{}{}, "count", 0, false}, + {"both missing no drift", map[string]interface{}{}, map[string]interface{}{}, "count", 0, false}, + {"negative drift", m("count", 100.0), m("count", 50.0), "count", 0, true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := DetectDrift(tt.baseline, tt.current, tt.field, tt.threshold) + assert.Equal(t, tt.wantDrift, result.Drifted) + }) + } +} diff --git a/internal/lambda/dryrun.go b/internal/lambda/dryrun.go index d9a716b..eff6514 100644 --- a/internal/lambda/dryrun.go +++ b/internal/lambda/dryrun.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "math" + "strings" "time" "github.com/dwsmith1983/interlock/internal/validation" @@ -227,22 +228,34 @@ func handleDryRunPostRunSensor(ctx context.Context, d *Deps, cfg *types.Pipeline return nil } + // Find matching post-run rule for this sensor key. + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break + } + } + if ruleBaseline == nil { + return nil // No baseline for this rule (stale or first run). + } + // Compare drift. 
driftField := resolveDriftField(cfg.PostRun) - prevCount := ExtractFloat(baseline, driftField) - currCount := ExtractFloat(sensorData, driftField) threshold := 0.0 if cfg.PostRun.DriftThreshold != nil { threshold = *cfg.PostRun.DriftThreshold } - - if prevCount > 0 && currCount > 0 && math.Abs(currCount-prevCount) > threshold { + dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { if pubErr := publishEvent(ctx, d, string(types.EventDryRunDrift), pipelineID, scheduleID, date, - fmt.Sprintf("dry-run: drift detected for %s: %.0f → %.0f — would re-run", pipelineID, prevCount, currCount), + fmt.Sprintf("dry-run: drift detected for %s: %.0f → %.0f — would re-run", pipelineID, dr.Previous, dr.Current), map[string]interface{}{ - "previousCount": prevCount, - "currentCount": currCount, - "delta": currCount - prevCount, + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, "driftThreshold": threshold, "driftField": driftField, "sensorKey": sensorKey, diff --git a/internal/lambda/dynstream.go b/internal/lambda/dynstream.go index 54ce702..5d45ae1 100644 --- a/internal/lambda/dynstream.go +++ b/internal/lambda/dynstream.go @@ -161,7 +161,7 @@ func publishEvent(ctx context.Context, d *Deps, eventType, pipelineID, schedule, source := types.EventSource detailStr := string(detailJSON) - _, err = d.EventBridge.PutEvents(ctx, &eventbridge.PutEventsInput{ + out, err := d.EventBridge.PutEvents(ctx, &eventbridge.PutEventsInput{ Entries: []ebTypes.PutEventsRequestEntry{ { Source: &source, @@ -174,6 +174,16 @@ func publishEvent(ctx context.Context, d *Deps, eventType, pipelineID, schedule, if err != nil { return fmt.Errorf("publish %s event: %w", eventType, err) } + if out.FailedEntryCount > 0 { + code, msg := "", "" + if len(out.Entries) > 0 && out.Entries[0].ErrorCode != nil { + code = *out.Entries[0].ErrorCode + if out.Entries[0].ErrorMessage != nil { + msg = *out.Entries[0].ErrorMessage + } + } + return fmt.Errorf("publish 
%s event: partial failure (code=%s, message=%s)", eventType, code, msg) + } return nil } diff --git a/internal/lambda/dynstream_test.go b/internal/lambda/dynstream_test.go new file mode 100644 index 0000000..73ebeaa --- /dev/null +++ b/internal/lambda/dynstream_test.go @@ -0,0 +1,49 @@ +package lambda + +import ( + "context" + "strings" + "testing" + "time" + + "github.com/aws/aws-sdk-go-v2/service/eventbridge" + ebTypes "github.com/aws/aws-sdk-go-v2/service/eventbridge/types" +) + +// testEventBridge is a local EventBridgeAPI implementation for white-box tests. +type testEventBridge struct { + failedEntryCount int32 +} + +func (t *testEventBridge) PutEvents(_ context.Context, _ *eventbridge.PutEventsInput, _ ...func(*eventbridge.Options)) (*eventbridge.PutEventsOutput, error) { + if t.failedEntryCount > 0 { + errCode := "InternalError" + errMsg := "simulated partial failure" + return &eventbridge.PutEventsOutput{ + FailedEntryCount: t.failedEntryCount, + Entries: []ebTypes.PutEventsResultEntry{ + {ErrorCode: &errCode, ErrorMessage: &errMsg}, + }, + }, nil + } + return &eventbridge.PutEventsOutput{}, nil +} + +func TestPublishEvent_PartialFailure(t *testing.T) { + d := &Deps{ + EventBridge: &testEventBridge{failedEntryCount: 1}, + EventBusName: "test-bus", + NowFunc: func() time.Time { return time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) }, + } + + err := publishEvent(context.Background(), d, "test.event", "pipeline-1", "cron", "2025-01-01", "test message") + if err == nil { + t.Fatal("expected error for partial failure, got nil") + } + if !strings.Contains(err.Error(), "partial failure") { + t.Errorf("expected error to contain 'partial failure', got: %s", err.Error()) + } + if !strings.Contains(err.Error(), "InternalError") { + t.Errorf("expected error to contain 'InternalError', got: %s", err.Error()) + } +} diff --git a/internal/lambda/e2e_test.go b/internal/lambda/e2e_test.go index 7f66514..b32008d 100644 --- a/internal/lambda/e2e_test.go +++ 
b/internal/lambda/e2e_test.go @@ -276,7 +276,7 @@ func runSFN(t *testing.T, ctx context.Context, d *lambda.Deps, mock *mockDDB, eb // Simulate stream event for each sensor update. sensorRecord := makeSensorRecord(pid, key, toStreamAttributes(data)) streamEvt := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{sensorRecord}} - _ = lambda.HandleStreamEvent(ctx, d, streamEvt) + _, _ = lambda.HandleStreamEvent(ctx, d, streamEvt) } } } @@ -1021,7 +1021,7 @@ func TestE2E_AutoRetries(t *testing.T) { require.NotEmpty(t, jobSK, "should have a joblog entry") sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d1", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d1", jobSK, "fail")) require.NoError(t, err) // Verify: new SFN execution started (auto-retry under maxRetries limit) @@ -1055,7 +1055,7 @@ func TestE2E_AutoRetries(t *testing.T) { eb.events = nil eb.mu.Unlock() - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d2", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-d2", jobSK, "fail")) require.NoError(t, err) // Verify: no new SFN, RETRY_EXHAUSTED published, status=FAILED_FINAL @@ -1104,7 +1104,7 @@ func TestE2E_FailureClassification(t *testing.T) { eb.events = nil eb.mu.Unlock() - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc1", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc1", jobSK, "fail")) require.NoError(t, err) // Verify: no retry (MaxCodeRetries=0), RETRY_EXHAUSTED event @@ -1140,7 +1140,7 @@ func TestE2E_FailureClassification(t *testing.T) { require.NotEmpty(t, jobSK) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc2", jobSK, "fail")) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-fc2", jobSK, "fail")) require.NoError(t, err) sfnM.mu.Lock() @@ -1184,7 +1184,7 @@ 
func TestE2E_RerunReplay(t *testing.T) { // Process RERUN_REQUEST stream event sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e1")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e1")) require.NoError(t, err) // Verify: new SFN started, rerun-accepted joblog written @@ -1221,7 +1221,7 @@ func TestE2E_RerunReplay(t *testing.T) { })) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e2")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-e2")) require.NoError(t, err) // Verify: no SFN, RERUN_REJECTED published @@ -1257,7 +1257,7 @@ func TestE2E_RerunReplay(t *testing.T) { "status": events.NewStringAttribute("ready"), "date": events.NewStringAttribute("2026-03-07"), }) - err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + _, err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) require.NoError(t, err) // Lock already held → late data path @@ -1306,7 +1306,7 @@ func TestE2E_DriftRetrigger(t *testing.T) { // Phase 2: Stream-router processes RERUN_REQUEST sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f1")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f1")) require.NoError(t, err) // Verify: new SFN started for re-trigger @@ -1343,7 +1343,7 @@ func TestE2E_DriftRetrigger(t *testing.T) { assert.Contains(t, r.events, "POST_RUN_DRIFT") sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f2")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f2")) require.NoError(t, err) sfnM.mu.Lock() @@ -1380,7 +1380,7 @@ func TestE2E_DriftRetrigger(t *testing.T) { // Phase 2: verify the 
RERUN_REQUEST was written, allowing re-trigger sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f3")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-f3")) require.NoError(t, err) sfnM.mu.Lock() @@ -1432,7 +1432,7 @@ func TestE2E_RerunLimits(t *testing.T) { // Send a data-drift RERUN_REQUEST — should be rejected sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl1", "data-drift")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl1", "data-drift")) require.NoError(t, err) // Verify: no SFN started, RERUN_REJECTED event + joblog entry @@ -1480,7 +1480,7 @@ func TestE2E_RerunLimits(t *testing.T) { // Send a late-data RERUN_REQUEST — should be rejected because // late-data shares the drift budget (count 1 >= budget 1) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl2", "late-data")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-rl2", "late-data")) require.NoError(t, err) // Verify: no SFN started, RERUN_REJECTED event + joblog entry @@ -1791,7 +1791,7 @@ func TestE2E_StreamRouterEntryPoints(t *testing.T) { }) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + _, err := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) require.NoError(t, err) // Verify: SFN started, trigger lock acquired, JOB_TRIGGERED event published. 
@@ -1824,7 +1824,7 @@ func TestE2E_StreamRouterEntryPoints(t *testing.T) { }) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-i2", jobSK, types.JobEventTimeout)) + _, err := lambda.HandleStreamEvent(ctx, d, makeJobStreamEvent("pipe-i2", jobSK, types.JobEventTimeout)) require.NoError(t, err) // Verify: auto-retry started (timeout is retryable just like fail). @@ -1861,7 +1861,7 @@ func TestE2E_StreamRouterEntryPoints(t *testing.T) { })) sfnCountBefore := len(sfnM.executions) - err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-i3")) + _, err := lambda.HandleStreamEvent(ctx, d, makeRerunRequestStreamEvent("pipe-i3")) require.NoError(t, err) // Verify: rerun accepted despite old sensor data (failure skips freshness check). @@ -2067,32 +2067,37 @@ func TestE2E_RerunBudgetSeparation(t *testing.T) { // Phase 1: First drift rerun — accepted (0 < budget 1). sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift"))) + _, handleErr := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "first drift rerun should start SFN") // Phase 2: Second drift rerun — rejected (1 >= budget 1). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "data-drift")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "second drift rerun should NOT start SFN") assert.Contains(t, collectEventTypes(eb), "RERUN_REJECTED") // Phase 3: First manual rerun — accepted despite drift budget exhausted. 
resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "first manual rerun should succeed") // Phase 4: Second manual rerun — accepted (1 < budget 2). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "second manual rerun should succeed") // Phase 5: Third manual rerun — rejected (2 >= budget 2). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-bs1", "manual")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "third manual rerun should NOT start SFN") assert.Contains(t, collectEventTypes(eb), "RERUN_REJECTED") assertAlertFormats(t, eb) @@ -2119,16 +2124,17 @@ func TestE2E_PostRunInflight(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "pipe-inf1", "2026-03-07", types.TriggerStatusRunning) - // Baseline from a previous run. + // Baseline from a previous run (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-inf1", "postrun-baseline#2026-03-07", - map[string]interface{}{"sensor_count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"sensor_count": float64(100)}})) // Sensor arrives with different count while job is running. 
record := makeSensorRecord("pipe-inf1", "audit-result", toStreamAttributes(map[string]interface{}{ "sensor_count": float64(200), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Contains(t, collectEventTypes(eb), "POST_RUN_DRIFT_INFLIGHT") assert.False(t, hasRerunRequest(mock, "pipe-inf1"), "should NOT write rerun request while running") @@ -2150,16 +2156,17 @@ func TestE2E_PostRunInflight(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "pipe-inf-cf", "2026-03-07", types.TriggerStatusRunning) - // Baseline uses custom field "count". + // Baseline uses custom field "count" (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-inf-cf", "postrun-baseline#2026-03-07", - map[string]interface{}{"count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"count": float64(100)}})) // Sensor arrives with different count while job is running. record := makeSensorRecord("pipe-inf-cf", "audit-result", toStreamAttributes(map[string]interface{}{ "count": float64(200), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Contains(t, collectEventTypes(eb), "POST_RUN_DRIFT_INFLIGHT") assert.False(t, hasRerunRequest(mock, "pipe-inf-cf"), "should NOT write rerun request while running") @@ -2180,15 +2187,16 @@ func TestE2E_PostRunInflight(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "pipe-inf2", "2026-03-07", types.TriggerStatusRunning) - // Baseline matches incoming sensor — no drift. 
+ // Baseline matches incoming sensor — no drift (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-inf2", "postrun-baseline#2026-03-07", - map[string]interface{}{"sensor_count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"sensor_count": float64(100)}})) record := makeSensorRecord("pipe-inf2", "audit-result", toStreamAttributes(map[string]interface{}{ "sensor_count": float64(100), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Empty(t, collectEventTypes(eb)) assert.Equal(t, 0, countSFNExecutions(sfnM)) @@ -2217,7 +2225,8 @@ func TestE2E_CalendarExclusionFullSkip(t *testing.T) { record := makeSensorRecord("pipe-cal1", "upstream-complete", map[string]events.DynamoDBAttributeValue{"status": events.NewStringAttribute("ready")}) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Equal(t, 0, countSFNExecutions(sfnM)) assertNoTriggerLock(t, mock, "pipe-cal1", "stream", today) @@ -2266,7 +2275,8 @@ func TestE2E_HourBoundaryRollover(t *testing.T) { "date": "20260307", "hour": "23", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record23}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record23}}) + require.NoError(t, handleErr) // Hour 00 (next day) sensor arrives. 
record00 := makeSensorRecord("pipe-hr1", "hourly-status#20260308T00", toStreamAttributes(map[string]interface{}{ @@ -2274,7 +2284,8 @@ func TestE2E_HourBoundaryRollover(t *testing.T) { "date": "20260308", "hour": "00", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record00}})) + _, handleErr = lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record00}}) + require.NoError(t, handleErr) // Two independent SFN executions. sfnM.mu.Lock() @@ -2317,28 +2328,31 @@ func TestE2E_ConcurrentDriftDedup(t *testing.T) { seedConfig(mock, cfg) seedCompletedPipelineE2E(t, ctx, d, mock, "pipe-cd1", "2026-03-07") - // Baseline captured at completion. + // Baseline captured at completion (namespaced by rule key). require.NoError(t, d.Store.WriteSensor(ctx, "pipe-cd1", "postrun-baseline#2026-03-07", - map[string]interface{}{"sensor_count": float64(100)})) + map[string]interface{}{"audit-result": map[string]interface{}{"sensor_count": float64(100)}})) // First drift sensor arrives. record1 := makeSensorRecord("pipe-cd1", "audit-result", toStreamAttributes(map[string]interface{}{ "sensor_count": float64(200), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record1}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record1}}) + require.NoError(t, handleErr) assert.Contains(t, collectEventTypes(eb), "POST_RUN_DRIFT") // Process first rerun request — accepted. 
resetEventBridge(eb) sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "first drift rerun accepted") // Process second rerun request — rejected (budget exhausted). resetEventBridge(eb) sfnBefore = countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift"))) + _, handleErr = lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-cd1", "data-drift")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "second drift rerun rejected") assert.Contains(t, collectEventTypes(eb), "RERUN_REJECTED") assertAlertFormats(t, eb) @@ -2370,7 +2384,8 @@ func TestE2E_PostRunBeforeBaseline(t *testing.T) { "sensor_count": float64(500), "date": "2026-03-07", })) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}})) + _, handleErr := lambda.HandleStreamEvent(ctx, d, lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}}) + require.NoError(t, handleErr) assert.Empty(t, collectEventTypes(eb), "should not publish any event when baseline is missing") assert.False(t, hasRerunRequest(mock, "pipe-nb1")) @@ -2412,7 +2427,8 @@ func TestE2E_RerunAfterTriggerTTLExpiry(t *testing.T) { })) sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl1", "manual"))) + _, handleErr := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl1", "manual")) + require.NoError(t, handleErr) assert.Equal(t, sfnBefore, countSFNExecutions(sfnM), "no SFN when trigger lock row was deleted by TTL") // Should have published an INFRA_FAILURE 
event. @@ -2457,7 +2473,8 @@ func TestE2E_RerunAfterTriggerTTLExpiry(t *testing.T) { }) sfnBefore := countSFNExecutions(sfnM) - require.NoError(t, lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl2", "manual"))) + _, handleErr := lambda.HandleStreamEvent(ctx, d, makeRerunRequestWithReasonE2E("pipe-ttl2", "manual")) + require.NoError(t, handleErr) assert.Greater(t, countSFNExecutions(sfnM), sfnBefore, "rerun should start SFN when trigger lock exists") assertAlertFormats(t, eb) }) diff --git a/internal/lambda/export_test.go b/internal/lambda/export_test.go index 738c420..3c5f829 100644 --- a/internal/lambda/export_test.go +++ b/internal/lambda/export_test.go @@ -3,8 +3,16 @@ // even to files in the non-_test package when placed here). package lambda -import "github.com/dwsmith1983/interlock/pkg/types" +import ( + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) // IsExcludedDate re-exports isExcludedDate for white-box unit testing from // the external test package (package lambda_test). var IsExcludedDate func(cfg *types.PipelineConfig, dateStr string) bool = isExcludedDate + +// ResolveTriggerDeadlineTime re-exports resolveTriggerDeadlineTime for +// white-box unit testing from the external test package (package lambda_test). 
+var ResolveTriggerDeadlineTime func(deadline, date, timezone string) time.Time = resolveTriggerDeadlineTime diff --git a/internal/lambda/mock_test.go b/internal/lambda/mock_test.go index 29f4c5a..12e2daa 100644 --- a/internal/lambda/mock_test.go +++ b/internal/lambda/mock_test.go @@ -9,6 +9,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/dynamodb" ddbtypes "github.com/aws/aws-sdk-go-v2/service/dynamodb/types" "github.com/aws/aws-sdk-go-v2/service/eventbridge" + ebTypes "github.com/aws/aws-sdk-go-v2/service/eventbridge/types" "github.com/aws/aws-sdk-go-v2/service/scheduler" "github.com/aws/aws-sdk-go-v2/service/sfn" @@ -40,9 +41,10 @@ func (m *mockSFN) StartExecution(_ context.Context, input *sfn.StartExecutionInp // --------------------------------------------------------------------------- type mockEventBridge struct { - mu sync.Mutex - events []*eventbridge.PutEventsInput - err error + mu sync.Mutex + events []*eventbridge.PutEventsInput + err error + failedEntryCount int32 } func (m *mockEventBridge) PutEvents(_ context.Context, input *eventbridge.PutEventsInput, _ ...func(*eventbridge.Options)) (*eventbridge.PutEventsOutput, error) { @@ -52,6 +54,16 @@ func (m *mockEventBridge) PutEvents(_ context.Context, input *eventbridge.PutEve return nil, m.err } m.events = append(m.events, input) + if m.failedEntryCount > 0 { + errCode := "InternalError" + errMsg := "simulated partial failure" + return &eventbridge.PutEventsOutput{ + FailedEntryCount: m.failedEntryCount, + Entries: []ebTypes.PutEventsResultEntry{ + {ErrorCode: &errCode, ErrorMessage: &errMsg}, + }, + }, nil + } return &eventbridge.PutEventsOutput{}, nil } diff --git a/internal/lambda/orchestrator.go b/internal/lambda/orchestrator.go index 5ba53ab..3035d90 100644 --- a/internal/lambda/orchestrator.go +++ b/internal/lambda/orchestrator.go @@ -263,15 +263,19 @@ func RemapPerPeriodSensors(sensors map[string]map[string]interface{}, date strin if compact != date { suffixes = append(suffixes, "#"+compact) } 
+ additions := make(map[string]map[string]interface{}) for key, data := range sensors { for _, suffix := range suffixes { if strings.HasSuffix(key, suffix) { base := strings.TrimSuffix(key, suffix) - sensors[base] = data + additions[base] = data break } } } + for k, v := range additions { + sensors[k] = v + } } // handleTriggerExhausted publishes RETRY_EXHAUSTED when trigger retries are @@ -357,13 +361,12 @@ func capturePostRunBaseline(ctx context.Context, d *Deps, pipelineID, scheduleID RemapPerPeriodSensors(sensors, date) - // Build baseline from post-run rule keys. + // Build baseline from post-run rule keys, namespaced by rule key + // to prevent field name collisions between different sensors. baseline := make(map[string]interface{}) for _, rule := range cfg.PostRun.Rules { if data, ok := sensors[rule.Key]; ok { - for k, v := range data { - baseline[k] = v - } + baseline[rule.Key] = data } } @@ -498,7 +501,7 @@ func InjectDateArgs(tc *types.TriggerConfig, date string) { if hourPart != "" { payload["par_hour"] = hourPart } - b, _ := json.Marshal(payload) + b, _ := json.Marshal(payload) // json.Marshal is infallible for map[string]string (no channels, funcs, or complex types) tc.HTTP.Body = string(b) } } diff --git a/internal/lambda/orchestrator_unit_test.go b/internal/lambda/orchestrator_unit_test.go index d6ce194..cd5d343 100644 --- a/internal/lambda/orchestrator_unit_test.go +++ b/internal/lambda/orchestrator_unit_test.go @@ -93,6 +93,49 @@ func TestInjectDateArgs(t *testing.T) { }) } +// --------------------------------------------------------------------------- +// BUG-2 characterization: RemapPerPeriodSensors map mutation during range +// --------------------------------------------------------------------------- + +func TestRemapPerPeriodSensors_MultipleSuffixes_MapMutation(t *testing.T) { + // BUG-2 characterization: adding keys during range iteration. + // With Go's map iteration, newly inserted keys may or may not be visited. 
+ // This test documents the nondeterministic behavior. + sensors := map[string]map[string]interface{}{ + "hourly-status#2026-03-13": {"count": float64(10)}, + "daily-check#2026-03-13": {"count": float64(20)}, + "weekly-scan#20260313": {"count": float64(30)}, + } + lambda.RemapPerPeriodSensors(sensors, "2026-03-13") + // All base keys should be present + assert.NotNil(t, sensors["hourly-status"], "hourly-status base key should exist") + assert.NotNil(t, sensors["daily-check"], "daily-check base key should exist") + assert.NotNil(t, sensors["weekly-scan"], "weekly-scan base key should exist") +} + +func TestRemapPerPeriodSensors_StagedMerge_NoCrossContamination(t *testing.T) { + // Verify staged merge doesn't allow newly-added base keys to match + // as suffixed keys in the same iteration. + sensors := map[string]map[string]interface{}{ + "hourly-status#2026-03-13": {"count": float64(10)}, + } + lambda.RemapPerPeriodSensors(sensors, "2026-03-13") + assert.NotNil(t, sensors["hourly-status"]) + assert.Equal(t, float64(10), sensors["hourly-status"]["count"]) + // Original suffixed key should still exist + assert.NotNil(t, sensors["hourly-status#2026-03-13"]) +} + +// --------------------------------------------------------------------------- +// BUG-10 characterization: baseline flattening collision +// --------------------------------------------------------------------------- + +func TestExtractFloat_ZeroValueIndistinguishableFromMissing(t *testing.T) { + // BUG-1 characterization: ExtractFloat returns 0 for both zero and missing. 
+ assert.Equal(t, float64(0), lambda.ExtractFloat(map[string]interface{}{"count": float64(0)}, "count")) + assert.Equal(t, float64(0), lambda.ExtractFloat(map[string]interface{}{}, "count")) +} + // --------------------------------------------------------------------------- // RemapPerPeriodSensors — table-driven // --------------------------------------------------------------------------- diff --git a/internal/lambda/postrun.go b/internal/lambda/postrun.go index 7b8beae..727badd 100644 --- a/internal/lambda/postrun.go +++ b/internal/lambda/postrun.go @@ -3,7 +3,6 @@ package lambda import ( "context" "fmt" - "math" "strings" "github.com/dwsmith1983/interlock/internal/validation" @@ -61,7 +60,7 @@ func handlePostRunSensorEvent(ctx context.Context, d *Deps, cfg *types.PipelineC case types.TriggerStatusCompleted: // Job completed — full post-run evaluation with baseline comparison. - return handlePostRunCompleted(ctx, d, cfg, pipelineID, scheduleID, date, sensorData) + return handlePostRunCompleted(ctx, d, cfg, pipelineID, scheduleID, date, sensorKey, sensorData) default: // FAILED_FINAL or unknown — skip. @@ -82,20 +81,33 @@ func handlePostRunInflight(ctx context.Context, d *Deps, cfg *types.PipelineConf return nil // No baseline yet — job hasn't completed once. } + // Find matching post-run rule for this sensor key. + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break + } + } + if ruleBaseline == nil { + return nil // No baseline for this rule (stale or first run). 
+ } + driftField := resolveDriftField(cfg.PostRun) - prevCount := ExtractFloat(baseline, driftField) - currCount := ExtractFloat(sensorData, driftField) threshold := 0.0 if cfg.PostRun.DriftThreshold != nil { threshold = *cfg.PostRun.DriftThreshold } - if prevCount > 0 && currCount > 0 && math.Abs(currCount-prevCount) > threshold { + dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { if err := publishEvent(ctx, d, string(types.EventPostRunDriftInflight), pipelineID, scheduleID, date, - fmt.Sprintf("inflight drift detected for %s: %.0f → %.0f (informational)", pipelineID, prevCount, currCount), + fmt.Sprintf("inflight drift detected for %s: %.0f → %.0f (informational)", pipelineID, dr.Previous, dr.Current), map[string]interface{}{ - "previousCount": prevCount, - "currentCount": currCount, - "delta": currCount - prevCount, + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, "driftThreshold": threshold, "driftField": driftField, "sensorKey": sensorKey, @@ -110,7 +122,7 @@ func handlePostRunInflight(ctx context.Context, d *Deps, cfg *types.PipelineConf // handlePostRunCompleted evaluates post-run rules after the job has completed. // Compares sensor values against the date-scoped baseline and triggers a rerun // if drift is detected. -func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date string, sensorData map[string]interface{}) error { +func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, scheduleID, date, sensorKey string, sensorData map[string]interface{}) error { // Read baseline captured at trigger completion. baselineKey := "postrun-baseline#" + date baseline, err := d.Store.GetSensorData(ctx, pipelineID, baselineKey) @@ -120,44 +132,56 @@ func handlePostRunCompleted(ctx context.Context, d *Deps, cfg *types.PipelineCon // Check for data drift if baseline exists. 
if baseline != nil { - driftField := resolveDriftField(cfg.PostRun) - prevCount := ExtractFloat(baseline, driftField) - currCount := ExtractFloat(sensorData, driftField) - threshold := 0.0 - if cfg.PostRun.DriftThreshold != nil { - threshold = *cfg.PostRun.DriftThreshold - } - if prevCount > 0 && currCount > 0 && math.Abs(currCount-prevCount) > threshold { - delta := currCount - prevCount - if err := publishEvent(ctx, d, string(types.EventPostRunDrift), pipelineID, scheduleID, date, - fmt.Sprintf("post-run drift detected for %s: %.0f → %.0f records", pipelineID, prevCount, currCount), - map[string]interface{}{ - "previousCount": prevCount, - "currentCount": currCount, - "delta": delta, - "driftThreshold": threshold, - "driftField": driftField, - "source": "post-run-stream", - }); err != nil { - d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunDrift, "error", err) + // Find matching post-run rule for this sensor key. + var ruleBaseline map[string]interface{} + for _, rule := range cfg.PostRun.Rules { + if strings.HasPrefix(sensorKey, rule.Key) { + if nested, ok := baseline[rule.Key].(map[string]interface{}); ok { + ruleBaseline = nested + } + break } + } - // Trigger rerun via the existing circuit breaker path only if the - // execution date is not excluded by the pipeline's calendar config. 
- if isExcludedDate(cfg, date) { - if pubErr := publishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleID, date, - fmt.Sprintf("post-run drift rerun skipped for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { - d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) + if ruleBaseline != nil { + driftField := resolveDriftField(cfg.PostRun) + threshold := 0.0 + if cfg.PostRun.DriftThreshold != nil { + threshold = *cfg.PostRun.DriftThreshold + } + dr := DetectDrift(ruleBaseline, sensorData, driftField, threshold) + if dr.Drifted { + if err := publishEvent(ctx, d, string(types.EventPostRunDrift), pipelineID, scheduleID, date, + fmt.Sprintf("post-run drift detected for %s: %.0f → %.0f records", pipelineID, dr.Previous, dr.Current), + map[string]interface{}{ + "previousCount": dr.Previous, + "currentCount": dr.Current, + "delta": dr.Delta, + "driftThreshold": threshold, + "driftField": driftField, + "sensorKey": sensorKey, + "source": "post-run-stream", + }); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPostRunDrift, "error", err) } - d.Logger.InfoContext(ctx, "post-run drift rerun skipped: execution date excluded by calendar", - "pipelineId", pipelineID, "date", date) - } else { - if writeErr := d.Store.WriteRerunRequest(ctx, pipelineID, scheduleID, date, "data-drift"); writeErr != nil { - d.Logger.WarnContext(ctx, "failed to write rerun request on post-run drift", - "pipelineId", pipelineID, "error", writeErr) + + // Trigger rerun via the existing circuit breaker path only if the + // execution date is not excluded by the pipeline's calendar config. 
+ if isExcludedDate(cfg, date) { + if pubErr := publishEvent(ctx, d, string(types.EventPipelineExcluded), pipelineID, scheduleID, date, + fmt.Sprintf("post-run drift rerun skipped for %s: execution date %s excluded by calendar", pipelineID, date)); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventPipelineExcluded, "error", pubErr) + } + d.Logger.InfoContext(ctx, "post-run drift rerun skipped: execution date excluded by calendar", + "pipelineId", pipelineID, "date", date) + } else { + if writeErr := d.Store.WriteRerunRequest(ctx, pipelineID, scheduleID, date, "data-drift"); writeErr != nil { + d.Logger.WarnContext(ctx, "failed to write rerun request on post-run drift", + "pipelineId", pipelineID, "error", writeErr) + } } + return nil } - return nil } } diff --git a/internal/lambda/rerun.go b/internal/lambda/rerun.go index 704b39f..cb94e76 100644 --- a/internal/lambda/rerun.go +++ b/internal/lambda/rerun.go @@ -34,11 +34,10 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even return nil } - // Dry-run pipelines never start real executions. + // Dry-run pipelines evaluate all checks but publish observation events + // instead of executing side effects. if cfg.DryRun { - d.Logger.Info("dry-run: skipping rerun request", - "pipelineId", pipelineID, "schedule", schedule, "date", date) - return nil + return handleDryRunRerunRequest(ctx, d, cfg, pipelineID, schedule, date, record) } // --- Calendar exclusion check (execution date) --- @@ -132,34 +131,38 @@ func handleRerunRequest(ctx context.Context, d *Deps, pk, sk string, record even return nil } - // --- Acceptance: write rerun record FIRST (before lock reset) --- - if _, err := d.Store.WriteRerun(ctx, pipelineID, schedule, date, reason, ""); err != nil { - return fmt.Errorf("write rerun for %q: %w", pipelineID, err) - } - - // Delete date-scoped postrun-baseline so re-run captures fresh baseline. 
- if cfg.PostRun != nil { - if err := d.Store.DeleteSensor(ctx, pipelineID, "postrun-baseline#"+date); err != nil { - d.Logger.Warn("failed to delete postrun-baseline sensor", "error", err, "pipeline", pipelineID, "date", date) - } - } - - // Atomically reset the trigger lock for the new execution. + // --- Acceptance: acquire lock FIRST (before writing rerun) --- acquired, err := d.Store.ResetTriggerLock(ctx, pipelineID, schedule, date, ResolveTriggerLockTTL()) if err != nil { return fmt.Errorf("reset trigger lock for %q: %w", pipelineID, err) } if !acquired { if pubErr := publishEvent(ctx, d, string(types.EventInfraFailure), pipelineID, schedule, date, - fmt.Sprintf("lock reset failed for rerun of %s, orphaned rerun record", pipelineID)); pubErr != nil { + fmt.Sprintf("lock reset failed for rerun of %s", pipelineID)); pubErr != nil { d.Logger.WarnContext(ctx, "failed to publish event", "error", pubErr) } - d.Logger.Warn("failed to reset trigger lock, orphaned rerun record", + d.Logger.Warn("failed to reset trigger lock for rerun", "pipelineId", pipelineID, "schedule", schedule, "date", date) return nil } - // Publish acceptance event only after lock atomicity is confirmed. + // Delete date-scoped postrun-baseline so re-run captures fresh baseline. + if cfg.PostRun != nil { + if err := d.Store.DeleteSensor(ctx, pipelineID, "postrun-baseline#"+date); err != nil { + d.Logger.Warn("failed to delete postrun-baseline sensor", "error", err, "pipeline", pipelineID, "date", date) + } + } + + // Write rerun record AFTER lock is confirmed. + if _, err := d.Store.WriteRerun(ctx, pipelineID, schedule, date, reason, ""); err != nil { + // Lock acquired but write failed — release lock to avoid deadlock. 
+ if relErr := d.Store.ReleaseTriggerLock(ctx, pipelineID, schedule, date); relErr != nil { + d.Logger.Warn("failed to release lock after rerun write failure", "error", relErr) + } + return fmt.Errorf("write rerun for %q: %w", pipelineID, err) + } + + // Publish acceptance event only after lock and rerun record confirmed. if err := d.Store.WriteJobEvent(ctx, pipelineID, schedule, date, types.JobEventRerunAccepted, "", 0, ""); err != nil { d.Logger.Warn("failed to write rerun-accepted joblog", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) @@ -206,11 +209,10 @@ func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, return nil } - // Dry-run pipelines never start real executions. + // Dry-run pipelines evaluate retry logic but publish observation events + // instead of executing side effects. if cfg.DryRun { - d.Logger.Info("dry-run: skipping job failure rerun", - "pipelineId", pipelineID, "schedule", schedule, "date", date) - return nil + return handleDryRunJobFailure(ctx, d, cfg, pipelineID, schedule, date) } maxRetries := cfg.Job.MaxRetries @@ -301,6 +303,171 @@ func handleJobFailure(ctx context.Context, d *Deps, pipelineID, schedule, date, return nil } +// handleDryRunRerunRequest evaluates all rerun checks (calendar, limit, +// circuit breaker) and publishes observation events instead of executing +// side effects. Mirrors the production handleRerunRequest logic. +func handleDryRunRerunRequest(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, schedule, date string, record events.DynamoDBEventRecord) error { + // Calendar exclusion check. 
+ if isExcludedDate(cfg, date) { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: execution date %s excluded by calendar", pipelineID, date), + map[string]interface{}{ + "reason": "excluded by calendar", + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + + // Extract reason from stream record NewImage. Default to "manual". + reason := "manual" + if img := record.Change.NewImage; img != nil { + if r, ok := img["reason"]; ok && r.DataType() == events.DataTypeString { + if v := r.String(); v != "" { + reason = v + } + } + } + + // Rerun limit check. + var budget int + var sources []string + switch reason { + case "data-drift", "late-data": + budget = types.IntOrDefault(cfg.Job.MaxDriftReruns, 1) + sources = []string{"data-drift", "late-data"} + default: + budget = types.IntOrDefault(cfg.Job.MaxManualReruns, 1) + sources = []string{reason} + } + + count, err := d.Store.CountRerunsBySource(ctx, pipelineID, schedule, date, sources) + if err != nil { + return fmt.Errorf("dry-run: count reruns by source for %q: %w", pipelineID, err) + } + + if count >= budget { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: limit exceeded (%d/%d)", pipelineID, count, budget), + map[string]interface{}{ + "reason": "limit exceeded", + "rerunCount": count, + "budget": budget, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + + // Circuit breaker (sensor freshness). 
+ cbStatus := "passed" + job, err := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if err != nil { + return fmt.Errorf("dry-run: get latest job event for %q/%s/%s: %w", pipelineID, schedule, date, err) + } + + if job == nil { + cbStatus = "skipped (no job history)" + } else if job.Event == types.JobEventSuccess { + fresh, freshErr := checkSensorFreshness(ctx, d, pipelineID, job.SK) + if freshErr != nil { + return fmt.Errorf("dry-run: check sensor freshness for %q: %w", pipelineID, freshErr) + } + if !fresh { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRerunRejected), pipelineID, schedule, date, + fmt.Sprintf("dry-run: rerun rejected for %s: previous run succeeded and no sensor data has changed", pipelineID), + map[string]interface{}{ + "reason": "circuit breaker", + "circuitBreaker": "rejected", + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRerunRejected, "error", pubErr) + } + return nil + } + } + + // All checks pass — publish would-rerun event. + if pubErr := publishEvent(ctx, d, string(types.EventDryRunWouldRerun), pipelineID, schedule, date, + fmt.Sprintf("dry-run: would rerun %s (reason: %s)", pipelineID, reason), + map[string]interface{}{ + "reason": reason, + "circuitBreaker": cbStatus, + "rerunCount": count, + "budget": budget, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunWouldRerun, "error", pubErr) + } + + d.Logger.Info("dry-run: would rerun", + "pipelineId", pipelineID, "schedule", schedule, "date", date, "reason", reason) + return nil +} + +// handleDryRunJobFailure evaluates retry logic for a dry-run pipeline and +// publishes observation events instead of executing side effects. +func handleDryRunJobFailure(ctx context.Context, d *Deps, cfg *types.PipelineConfig, pipelineID, schedule, date string) error { + maxRetries := cfg.Job.MaxRetries + + // Read latest job event for failure category (read-only). 
+ latestJob, jobErr := d.Store.GetLatestJobEvent(ctx, pipelineID, schedule, date) + if jobErr != nil { + d.Logger.WarnContext(ctx, "dry-run: could not read latest job event for failure category", + "pipelineId", pipelineID, "error", jobErr) + } + if latestJob != nil { + if types.FailureCategory(latestJob.Category) == types.FailurePermanent { + maxRetries = types.IntOrDefault(cfg.Job.MaxCodeRetries, 1) + } + // TRANSIENT, TIMEOUT, or empty → use cfg.Job.MaxRetries (already set). + } + + rerunCount, err := d.Store.CountRerunsBySource(ctx, pipelineID, schedule, date, []string{"job-fail-retry"}) + if err != nil { + return fmt.Errorf("dry-run: count reruns for %q/%s/%s: %w", pipelineID, schedule, date, err) + } + + if rerunCount >= maxRetries { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, + fmt.Sprintf("dry-run: retry limit reached (%d/%d) for %s", rerunCount, maxRetries, pipelineID), + map[string]interface{}{ + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRetryExhausted, "error", pubErr) + } + return nil + } + + // Calendar exclusion check. + if isExcludedDate(cfg, date) { + if pubErr := publishEvent(ctx, d, string(types.EventDryRunRetryExhausted), pipelineID, schedule, date, + fmt.Sprintf("dry-run: retry skipped for %s: execution date %s excluded by calendar", pipelineID, date), + map[string]interface{}{ + "reason": "excluded by calendar", + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunRetryExhausted, "error", pubErr) + } + return nil + } + + // Under budget — publish would-retry event. 
+ if pubErr := publishEvent(ctx, d, string(types.EventDryRunWouldRetry), pipelineID, schedule, date, + fmt.Sprintf("dry-run: would retry %s (%d/%d)", pipelineID, rerunCount, maxRetries), + map[string]interface{}{ + "retries": rerunCount, + "maxRetries": maxRetries, + }); pubErr != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", types.EventDryRunWouldRetry, "error", pubErr) + } + + d.Logger.Info("dry-run: would retry", + "pipelineId", pipelineID, "schedule", schedule, "date", date, + "retries", rerunCount, "maxRetries", maxRetries) + return nil +} + // checkSensorFreshness determines whether any sensor data has been updated // after the given job completed. The job timestamp is extracted from the job // SK (format: JOB#schedule#date#). Returns true if data has @@ -350,6 +517,13 @@ func checkSensorFreshness(ctx context.Context, d *Deps, pipelineID, jobSK string continue } + // Normalize epoch: if ts looks like seconds (< 1e12), convert to millis. + // Epoch millis won't be < 1e12 until ~2001, and epoch seconds won't + // exceed 1e12 until ~33658 CE. + if ts > 0 && ts < 1e12 { + ts *= 1000 + } + if ts > jobTimestamp { return true, nil // Data changed after job — allow rerun. 
} diff --git a/internal/lambda/sla_monitor.go b/internal/lambda/sla_monitor.go index e4164c8..5093fa8 100644 --- a/internal/lambda/sla_monitor.go +++ b/internal/lambda/sla_monitor.go @@ -256,28 +256,8 @@ func handleSLASchedule(ctx context.Context, d *Deps, input SLAMonitorInput) (SLA return calc, nil } - for _, alert := range []struct { - suffix string - alertType string - timestamp string - }{ - {"warning", "SLA_WARNING", calc.WarningAt}, - {"breach", "SLA_BREACH", calc.BreachAt}, - } { - name := slaScheduleName(input.PipelineID, input.ScheduleID, input.Date, alert.suffix) - payload := SLAMonitorInput{ - Mode: "fire-alert", - PipelineID: input.PipelineID, - ScheduleID: input.ScheduleID, - Date: input.Date, - AlertType: alert.alertType, - } - if alert.alertType == "SLA_WARNING" { - payload.BreachAt = calc.BreachAt - } - if err := createOneTimeSchedule(ctx, d, name, alert.timestamp, payload); err != nil { - return SLAMonitorOutput{}, fmt.Errorf("create %s schedule: %w", alert.suffix, err) - } + if err := createSLASchedules(ctx, d, input.PipelineID, input.ScheduleID, input.Date, calc, false); err != nil { + return SLAMonitorOutput{}, err } d.Logger.InfoContext(ctx, "scheduled SLA alerts", @@ -342,17 +322,31 @@ func handleSLACancel(ctx context.Context, d *Deps, input SLAMonitorInput) (SLAMo } } - // Always publish the verdict. For first runs, Scheduler entries would have - // fired WARNING/BREACH but are now deleted — MET is the only new signal. - // For reruns, the Scheduler entries were already deleted by the first run's - // cancel, so this publish is the only path to a notification. + // Only publish a verdict if the pipeline was actually triggered. + // If no trigger record exists, the pipeline never ran — publishing SLA_MET + // would be misleading since the SLA wasn't "met" (nothing executed). 
+ publish := true + if d.Store != nil { + tr, err := d.Store.GetTrigger(ctx, input.PipelineID, input.ScheduleID, input.Date) + if err != nil { + d.Logger.WarnContext(ctx, "trigger lookup failed in cancel, proceeding with verdict", + "pipeline", input.PipelineID, "error", err) + } else if tr == nil { + d.Logger.InfoContext(ctx, "skipping SLA verdict — pipeline was never triggered", + "pipeline", input.PipelineID, "date", input.Date, "alertType", alertType) + publish = false + } + } + d.Logger.InfoContext(ctx, "cancelled SLA schedules", "pipeline", input.PipelineID, "alertType", alertType, ) - if err := publishEvent(ctx, d, alertType, input.PipelineID, input.ScheduleID, input.Date, - fmt.Sprintf("pipeline %s: %s", input.PipelineID, alertType)); err != nil { - return SLAMonitorOutput{}, fmt.Errorf("publish SLA cancel verdict: %w", err) + if publish { + if err := publishEvent(ctx, d, alertType, input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: %s", input.PipelineID, alertType)); err != nil { + return SLAMonitorOutput{}, fmt.Errorf("publish SLA cancel verdict: %w", err) + } } return SLAMonitorOutput{ @@ -400,6 +394,42 @@ func createOneTimeSchedule(ctx context.Context, d *Deps, name, timestamp string, return nil } +// createSLASchedules creates warning and breach one-time schedules. +// Returns an error on the first schedule creation failure. If onConflictSkip +// is true, ConflictException errors are silently skipped (idempotent retries). 
+func createSLASchedules(ctx context.Context, d *Deps, pipelineID, scheduleID, date string, calc SLAMonitorOutput, onConflictSkip bool) error { + for _, alert := range []struct { + suffix string + alertType string + timestamp string + }{ + {"warning", "SLA_WARNING", calc.WarningAt}, + {"breach", "SLA_BREACH", calc.BreachAt}, + } { + name := slaScheduleName(pipelineID, scheduleID, date, alert.suffix) + payload := SLAMonitorInput{ + Mode: "fire-alert", + PipelineID: pipelineID, + ScheduleID: scheduleID, + Date: date, + AlertType: alert.alertType, + } + if alert.alertType == "SLA_WARNING" { + payload.BreachAt = calc.BreachAt + } + if err := createOneTimeSchedule(ctx, d, name, alert.timestamp, payload); err != nil { + if onConflictSkip { + var conflict *schedulerTypes.ConflictException + if errors.As(err, &conflict) { + continue + } + } + return fmt.Errorf("create %s schedule: %w", alert.suffix, err) + } + } + return nil +} + // handleSLAReconcile calculates deadlines and fires any alerts for deadlines // that have already passed. Fallback for environments without EventBridge // Scheduler configured. 
@@ -423,13 +453,17 @@ func handleSLAReconcile(ctx context.Context, d *Deps, input SLAMonitorInput) (SL var alertType string switch { case now.After(breachAt) || now.Equal(breachAt): - _ = publishEvent(ctx, d, "SLA_BREACH", input.PipelineID, input.ScheduleID, input.Date, - fmt.Sprintf("pipeline %s: SLA_BREACH", input.PipelineID), reconcileDetail) + if err := publishEvent(ctx, d, "SLA_BREACH", input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: SLA_BREACH", input.PipelineID), reconcileDetail); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", "SLA_BREACH", "error", err) + } alertType = "SLA_BREACH" case now.After(warningAt) || now.Equal(warningAt): // Past warning but before breach — fire warning only - _ = publishEvent(ctx, d, "SLA_WARNING", input.PipelineID, input.ScheduleID, input.Date, - fmt.Sprintf("pipeline %s: SLA_WARNING", input.PipelineID), reconcileDetail) + if err := publishEvent(ctx, d, "SLA_WARNING", input.PipelineID, input.ScheduleID, input.Date, + fmt.Sprintf("pipeline %s: SLA_WARNING", input.PipelineID), reconcileDetail); err != nil { + d.Logger.WarnContext(ctx, "failed to publish event", "type", "SLA_WARNING", "error", err) + } alertType = "SLA_WARNING" default: alertType = "SLA_MET" diff --git a/internal/lambda/sla_monitor_test.go b/internal/lambda/sla_monitor_test.go index c8228c1..10c969c 100644 --- a/internal/lambda/sla_monitor_test.go +++ b/internal/lambda/sla_monitor_test.go @@ -520,6 +520,46 @@ func TestSLAMonitor_Cancel_RecalculatesWhenTimesNotProvided(t *testing.T) { } } +// --------------------------------------------------------------------------- +// BUG-5 characterization: SLA_MET published when pipeline never ran +// --------------------------------------------------------------------------- + +func TestSLAMonitor_Cancel_NeverTriggered_PublishesMet(t *testing.T) { + // BUG-5 characterization: SLA_MET fires even with no trigger/job records. 
+ // Pipeline was never started — there should be no SLA verdict at all. + sched := &mockScheduler{} + eb := &mockEventBridge{} + mock := newMockDDB() + s := &store.Store{ + Client: mock, + ControlTable: testControlTable, + JobLogTable: "joblog", + RerunTable: "rerun", + } + d := &lambda.Deps{ + Store: s, + Scheduler: sched, + SchedulerGroupName: "interlock-sla", + EventBridge: eb, + EventBusName: "test-bus", + Logger: slog.Default(), + } + + // No trigger, no joblog — pipeline was never started + out, err := lambda.HandleSLAMonitor(context.Background(), d, lambda.SLAMonitorInput{ + Mode: "cancel", + PipelineID: "never-ran", + ScheduleID: "daily", + Date: "2026-03-13", + WarningAt: "2099-12-31T23:45:00Z", + BreachAt: "2099-12-31T23:59:00Z", + }) + require.NoError(t, err) + // BUG-5 fixed: AlertType still set for SFN flow, but no EventBridge event published + assert.Equal(t, "SLA_MET", out.AlertType, "AlertType should still be set for SFN state machine") + assert.Empty(t, eb.events, "no EventBridge events should be published when pipeline was never triggered") +} + // --------------------------------------------------------------------------- // Fire-alert tests // --------------------------------------------------------------------------- diff --git a/internal/lambda/stream_router.go b/internal/lambda/stream_router.go index 66bedb5..0344928 100644 --- a/internal/lambda/stream_router.go +++ b/internal/lambda/stream_router.go @@ -50,18 +50,23 @@ func getValidatedConfig(ctx context.Context, d *Deps, pipelineID string) (*types } // HandleStreamEvent processes a DynamoDB stream event, routing each record -// to the appropriate handler based on the SK prefix. Errors are logged but -// do not fail the batch (returns nil) to prevent infinite retries. -func HandleStreamEvent(ctx context.Context, d *Deps, event StreamEvent) error { +// to the appropriate handler based on the SK prefix. 
Per-record errors are +// collected as BatchItemFailures so the Lambda runtime can use DynamoDB's +// ReportBatchItemFailures to retry only the failed records. +func HandleStreamEvent(ctx context.Context, d *Deps, event StreamEvent) (events.DynamoDBEventResponse, error) { + var resp events.DynamoDBEventResponse for i := range event.Records { if err := handleRecord(ctx, d, event.Records[i]); err != nil { d.Logger.Error("stream record error", "error", err, "eventID", event.Records[i].EventID, ) + resp.BatchItemFailures = append(resp.BatchItemFailures, events.DynamoDBBatchItemFailure{ + ItemIdentifier: event.Records[i].EventID, + }) } } - return nil + return resp, nil } // handleRecord extracts PK/SK and routes to the appropriate handler. diff --git a/internal/lambda/stream_router_test.go b/internal/lambda/stream_router_test.go index 09ab196..d1b8508 100644 --- a/internal/lambda/stream_router_test.go +++ b/internal/lambda/stream_router_test.go @@ -141,7 +141,7 @@ func TestStreamRouter_SensorMatch_StartsSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -182,7 +182,7 @@ func TestStreamRouter_SensorPrefixMatch_PerPeriodKey(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -207,7 +207,7 @@ func TestStreamRouter_SensorNoMatch_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -231,7 +231,7 @@ func 
TestStreamRouter_SensorMatch_LockHeld_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -256,7 +256,7 @@ func TestStreamRouter_CalendarExcluded_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -274,7 +274,7 @@ func TestStreamRouter_NoPipelineConfig_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -301,7 +301,7 @@ func TestStreamRouter_ConfigChange_InvalidatesCache(t *testing.T) { sensorRecord := makeSensorRecord("gold-revenue", "upstream-complete", map[string]events.DynamoDBAttributeValue{ "status": events.NewStringAttribute("ready"), }) - err := lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ + _, err := lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ Records: []events.DynamoDBEventRecord{sensorRecord}, }) require.NoError(t, err) @@ -316,13 +316,13 @@ func TestStreamRouter_ConfigChange_InvalidatesCache(t *testing.T) { // Send a CONFIG change event to invalidate the cache. configRecord := makeConfigRecord("gold-revenue") - err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ + _, err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ Records: []events.DynamoDBEventRecord{configRecord}, }) require.NoError(t, err) // Now send the sensor event again — should trigger SFN with the updated config. 
- err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ + _, err = lambda.HandleStreamEvent(context.Background(), d, lambda.StreamEvent{ Records: []events.DynamoDBEventRecord{sensorRecord}, }) require.NoError(t, err) @@ -401,7 +401,7 @@ func TestStreamRouter_JobFail_UnderRetryLimit_Reruns(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have started a new SFN execution for the rerun. @@ -432,7 +432,7 @@ func TestStreamRouter_JobFail_OverRetryLimit_Alerts(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution should be started. @@ -457,7 +457,7 @@ func TestStreamRouter_JobSuccess_PublishesEvent(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution for success. 
@@ -484,7 +484,7 @@ func TestStreamRouter_JobTimeout_TreatedAsFailure(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventTimeout) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -518,7 +518,7 @@ func TestStreamRouter_JobFail_DriftRerunsIgnored(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -534,7 +534,7 @@ func TestStreamRouter_JobFail_NoConfig_Skips(t *testing.T) { record := makeJobRecord("unknown-pipeline", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -559,7 +559,7 @@ func TestStreamRouter_TriggerValueMismatch_NoSFN(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -586,7 +586,7 @@ func TestStreamRouter_SensorMatch_RecordsFirstSensorArrival(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Verify SFN was started (lock acquired). 
@@ -628,7 +628,7 @@ func TestStreamRouter_SensorMatch_FirstArrivalIdempotent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Verify original arrival time is preserved (not overwritten). @@ -673,7 +673,7 @@ func TestStreamRouter_LateDataArrival_CompletedSuccess(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution (lock held). @@ -717,7 +717,7 @@ func TestStreamRouter_LateDataArrival_WritesRerunRequest(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have published LATE_DATA_ARRIVAL event (existing behavior). @@ -756,7 +756,7 @@ func TestStreamRouter_LateDataArrival_StillRunning_Silent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -797,7 +797,7 @@ func TestStreamRouter_LateDataArrival_CompletedFailed_Silent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No late data event — pipeline didn't succeed. 
@@ -902,6 +902,27 @@ func seedJobEvent(mock *mockDDB, timestamp, event string) { } // seedSensor inserts a sensor record with a data map into the mock control table. +// toAttributeValue converts a Go value to a DynamoDB attribute value, supporting +// nested maps for namespaced baseline format. +func toAttributeValue(v interface{}) ddbtypes.AttributeValue { + switch val := v.(type) { + case string: + return &ddbtypes.AttributeValueMemberS{Value: val} + case float64: + return &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%g", val)} + case int64: + return &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%d", val)} + case map[string]interface{}: + nested := make(map[string]ddbtypes.AttributeValue, len(val)) + for nk, nv := range val { + nested[nk] = toAttributeValue(nv) + } + return &ddbtypes.AttributeValueMemberM{Value: nested} + default: + return &ddbtypes.AttributeValueMemberS{Value: fmt.Sprintf("%v", val)} + } +} + func seedSensor(mock *mockDDB, pipelineID, sensorKey string, data map[string]interface{}) { item := map[string]ddbtypes.AttributeValue{ "PK": &ddbtypes.AttributeValueMemberS{Value: types.PipelinePK(pipelineID)}, @@ -910,14 +931,7 @@ func seedSensor(mock *mockDDB, pipelineID, sensorKey string, data map[string]int if data != nil { dataAV := make(map[string]ddbtypes.AttributeValue, len(data)) for k, v := range data { - switch val := v.(type) { - case string: - dataAV[k] = &ddbtypes.AttributeValueMemberS{Value: val} - case float64: - dataAV[k] = &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%g", val)} - case int64: - dataAV[k] = &ddbtypes.AttributeValueMemberN{Value: fmt.Sprintf("%d", val)} - } + dataAV[k] = toAttributeValue(v) } item["data"] = &ddbtypes.AttributeValueMemberM{Value: dataAV} } @@ -938,7 +952,7 @@ func TestStreamRouter_RerunRequest_FailedJob_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := 
lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have started a new SFN execution. @@ -968,7 +982,7 @@ func TestStreamRouter_RerunRequest_SuccessDataChanged_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Data changed — SFN should start. @@ -984,20 +998,22 @@ func TestStreamRouter_RerunRequest_SuccessDataUnchanged_Rejected(t *testing.T) { cfg := testJobConfig() seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") - // Seed a successful job event with timestamp 2000000. - seedJobEvent(mock, "2000000", types.JobEventSuccess) + // Use millis-range timestamps so epoch normalization (ts < 1e12 → ts*1000) + // does not distort the comparison. + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) - // Seed a sensor with updatedAt BEFORE the job timestamp. + // Seed a sensor with updatedAt BEFORE the job timestamp (both in millis). seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ - "updatedAt": float64(1000000), // older than job timestamp + "updatedAt": float64(1000000000000), // older than job timestamp "status": "ready", }) record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution — data unchanged. 
@@ -1026,7 +1042,7 @@ func TestStreamRouter_RerunRequest_InfraExhausted_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should have started a new SFN execution. @@ -1036,6 +1052,37 @@ func TestStreamRouter_RerunRequest_InfraExhausted_Allowed(t *testing.T) { assert.Contains(t, *sfnMock.executions[0].Name, "manual-rerun") } +func TestStreamRouter_RerunRequest_SensorEpochSeconds_Normalized(t *testing.T) { + mock := newMockDDB() + d, sfnMock, _ := testDeps(mock) + + cfg := testJobConfig() + seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") + + // Seed a successful job with timestamp in millis: 2000000000000 (year ~2033). + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) + + // Seed sensor with updatedAt in SECONDS: 2000000001 (1 second after job). + // Without normalization, 2000000001 < 2000000000000 → rejected. + // With normalization, 2000000001000 > 2000000000000 → allowed. 
+ seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ + "updatedAt": float64(2000000001), // seconds epoch + "status": "ready", + }) + + record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + _ = resp + + sfnMock.mu.Lock() + defer sfnMock.mu.Unlock() + require.Len(t, sfnMock.executions, 1, "sensor with epoch-seconds updatedAt should be normalized and allow rerun") +} + // --------------------------------------------------------------------------- // handleRecord routing: unknown SK prefix // --------------------------------------------------------------------------- @@ -1056,7 +1103,7 @@ func TestStreamRouter_UnknownSKPrefix_Silent(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1072,15 +1119,18 @@ func TestStreamRouter_MissingPKOrSK_LogsError(t *testing.T) { mock := newMockDDB() d, _, _ := testDeps(mock) - // Record with no keys at all — should log error but HandleStreamEvent returns nil. + // Record with no keys at all — handleRecord returns error, collected as batch failure. 
record := events.DynamoDBEventRecord{ + EventID: "missing-keys-1", EventName: "INSERT", Change: events.DynamoDBStreamRecord{}, } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent always returns nil; errors are logged") + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err, "HandleStreamEvent never returns a top-level error") + require.Len(t, resp.BatchItemFailures, 1) + assert.Equal(t, "missing-keys-1", resp.BatchItemFailures[0].ItemIdentifier) } // --------------------------------------------------------------------------- @@ -1105,7 +1155,7 @@ func TestLateData_TriggerNil(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No late data event — GetTrigger returned nil (trigger row doesn't match COMPLETED). @@ -1139,7 +1189,7 @@ func TestSensor_NoTriggerCondition(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1160,7 +1210,7 @@ func TestSensor_SensorKeyMismatch(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1183,9 +1233,10 @@ func TestSensor_StartSFNError(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // HandleStreamEvent logs errors but always returns nil. 
- err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent swallows errors") + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1, "SFN error should produce a batch item failure") // SFN was called but failed. sfnMock.mu.Lock() @@ -1209,8 +1260,9 @@ func TestSensor_StartSFNError_ReleasesLock(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent swallows errors") + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1, "SFN error should produce a batch item failure") // The trigger lock must have been released after SFN failure. // Schedule ID for stream-triggered pipelines is "stream". 
@@ -1237,7 +1289,7 @@ func TestSensor_PerHour_DateOnly(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1262,7 +1314,7 @@ func TestSensor_PerHour_NoDate(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1290,7 +1342,7 @@ func TestRerun_NoJobRecord_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1307,7 +1359,7 @@ func TestRerun_NoConfig_Skips(t *testing.T) { record := makeDefaultRerunRequestRecord() // uses "gold-revenue" event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1337,9 +1389,10 @@ func TestRerun_ParseSKError(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // Error is logged, HandleStreamEvent returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. 
+ resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } func TestRerun_TimeoutJob_Allowed(t *testing.T) { @@ -1356,7 +1409,7 @@ func TestRerun_TimeoutJob_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1379,7 +1432,7 @@ func TestRerun_UnknownJobEvent_Allowed(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1404,7 +1457,7 @@ func TestRerun_StartSFNError(t *testing.T) { event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} // Error is logged, HandleStreamEvent still returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1430,7 +1483,7 @@ func TestSensorFreshness_NoSensors(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No sensors → can't prove unchanged → allow rerun. 
@@ -1458,7 +1511,7 @@ func TestSensorFreshness_NoUpdatedAtField(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No updatedAt → can't prove unchanged → allow rerun. @@ -1486,7 +1539,7 @@ func TestSensorFreshness_FreshSensor_Float(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1500,19 +1553,21 @@ func TestSensorFreshness_StaleSensor_Float(t *testing.T) { cfg := testJobConfig() seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") - // Seed a successful job event with timestamp 2000000. - seedJobEvent(mock, "2000000", types.JobEventSuccess) + // Use millis-range timestamps so epoch normalization (ts < 1e12 → ts*1000) + // does not distort the comparison. + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) - // Seed a sensor with updatedAt as float64 < jobTimestamp. + // Seed a sensor with updatedAt as float64 < jobTimestamp (both in millis). 
seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ - "updatedAt": float64(1000000), + "updatedAt": float64(1000000000000), }) record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1544,7 +1599,7 @@ func TestSensorFreshness_FreshSensor_String(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1576,7 +1631,7 @@ func TestSensorFreshness_InvalidJobSK(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Invalid job SK → can't parse timestamp → allow to be safe. @@ -1608,7 +1663,7 @@ func TestSensorFreshness_InvalidTimestamp(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Non-numeric timestamp → allow to be safe. 
@@ -1633,7 +1688,7 @@ func TestJobLog_InfraExhaustedEvent(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventInfraTriggerExhausted) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1656,7 +1711,7 @@ func TestJobLog_OtherEvent(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventRerunAccepted) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1691,9 +1746,10 @@ func TestJobLog_ParseSKError(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // Error is logged, HandleStreamEvent returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } func TestJobLog_MissingEventAttribute(t *testing.T) { @@ -1722,7 +1778,7 @@ func TestJobLog_MissingEventAttribute(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Missing event attribute → logged as warning, no action. 
@@ -1749,7 +1805,7 @@ func TestJobSuccess_PublishesJobCompleted(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) ebMock.mu.Lock() @@ -1777,7 +1833,7 @@ func TestJobFailure_NoConfig(t *testing.T) { record := makeJobRecord("unknown-pipeline", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1811,7 +1867,7 @@ func TestBuildSFNConfig_NoPostRunFields(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1847,7 +1903,7 @@ func TestBuildSFNConfig_CustomTimings(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1879,7 +1935,7 @@ func TestBuildSFNConfig_WithSLA(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1910,7 +1966,7 @@ func TestBuildSFNConfig_JobPollWindowDefault(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, 
err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1937,7 +1993,7 @@ func TestBuildSFNConfig_JobPollWindowOverride(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -1964,7 +2020,7 @@ func TestBuildSFNConfig_JobPollWindowZeroUsesDefault(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2008,7 +2064,7 @@ func TestExtractSensorData_DataMapUnwrap(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Trigger should fire because "data" map was unwrapped, exposing "status" = "ready". @@ -2030,7 +2086,7 @@ func TestExtractSensorData_NoDataMap(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2064,7 +2120,7 @@ func TestExtractSensorData_SkipsPKSKTTL(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // PK, SK, ttl should be stripped; "status" remains for trigger evaluation. 
@@ -2089,7 +2145,7 @@ func TestConvertAV_String(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2125,7 +2181,7 @@ func TestConvertAV_Number(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2161,7 +2217,7 @@ func TestConvertAV_Bool(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2198,7 +2254,7 @@ func TestConvertAV_Map(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // The data map gets unwrapped; "status" should be accessible at top level. 
@@ -2236,7 +2292,7 @@ func TestConvertAV_List(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2270,7 +2326,7 @@ func TestConvertAV_Null(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2316,7 +2372,7 @@ func TestResolveScheduleID_StreamTriggered(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2341,7 +2397,7 @@ func TestResolveScheduleID_CronTriggered(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2368,12 +2424,13 @@ func TestPublishEvent_EventBridgeError(t *testing.T) { ebMock.err = fmt.Errorf("EventBridge throttled") // JobSuccess publishes an event — if EventBridge fails, handleJobSuccess returns error, - // but HandleStreamEvent logs it and returns nil. + // collected as a batch item failure. 
record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) - require.NoError(t, err, "HandleStreamEvent swallows errors") + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } func TestPublishEvent_NilEventBridge(t *testing.T) { @@ -2389,7 +2446,7 @@ func TestPublishEvent_NilEventBridge(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2406,7 +2463,7 @@ func TestPublishEvent_EmptyEventBusName(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventSuccess) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2429,7 +2486,7 @@ func TestIsExcluded_WeekendExclusion(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2469,7 +2526,7 @@ func TestStreamRouter_MultipleRecords(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record1, record2}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2507,9 +2564,10 @@ func TestJobLog_UnexpectedPKFormat(t *testing.T) { } event := lambda.StreamEvent{Records: 
[]events.DynamoDBEventRecord{record}} - // Error is logged, HandleStreamEvent returns nil. - err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) } // --------------------------------------------------------------------------- @@ -2537,7 +2595,7 @@ func TestRerun_UnexpectedPKFormat(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2560,7 +2618,7 @@ func TestSensor_UnexpectedPKFormat(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) } @@ -2623,7 +2681,7 @@ func TestRerun_DriftLimitExceeded(t *testing.T) { record := makeRerunRequestWithReason("data-drift") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN — drift limit exceeded. @@ -2651,7 +2709,7 @@ func TestRerun_ManualLimitExceeded(t *testing.T) { record := makeRerunRequestWithReason("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN — manual limit exceeded. 
@@ -2681,7 +2739,7 @@ func TestRerun_DriftUnderLimit(t *testing.T) { record := makeRerunRequestWithReason("data-drift") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN should have started — under drift limit. @@ -2706,7 +2764,7 @@ func TestRerun_LateDataCountsAsDrift(t *testing.T) { record := makeRerunRequestWithReason("data-drift") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -2733,7 +2791,7 @@ func TestRerun_WritesRerunBeforeLockRelease(t *testing.T) { record := makeRerunRequestWithReason("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN should have started. @@ -2785,7 +2843,7 @@ func TestRerun_DeletesPostrunBaseline(t *testing.T) { record := makeRerunRequestWithReason("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN should have started. 
@@ -2820,7 +2878,7 @@ func TestStreamRouter_JobFail_PermanentUsesCodeRetries(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // MaxCodeRetries=0 → immediate FAILED_FINAL, no SFN started @@ -2854,7 +2912,7 @@ func TestStreamRouter_JobFail_TransientUsesMaxRetries(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // TRANSIENT uses MaxRetries=3, no reruns yet → should retry @@ -2883,7 +2941,7 @@ func TestStreamRouter_JobFail_EmptyCategoryUsesMaxRetries(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No category → uses MaxRetries=3, no reruns → should retry @@ -2979,9 +3037,9 @@ func TestPostRunSensor_Completed_DriftDetected(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) - // Seed baseline captured at completion time. + // Seed baseline captured at completion time (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) // Sensor arrives with different count → drift. 
@@ -2992,7 +3050,7 @@ func TestPostRunSensor_Completed_DriftDetected(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should publish POST_RUN_DRIFT event. @@ -3023,9 +3081,9 @@ func TestPostRunSensor_Completed_NoDrift_RulesPass(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) - // Baseline with same count as incoming sensor. + // Baseline with same count as incoming sensor (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(150), + "audit-result": map[string]interface{}{"sensor_count": float64(150)}, }) // Seed the actual sensor so EvaluateRules can find it. seedSensor(mock, "gold-revenue", "audit-result", map[string]interface{}{ @@ -3039,7 +3097,7 @@ func TestPostRunSensor_Completed_NoDrift_RulesPass(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should publish POST_RUN_PASSED event. @@ -3063,9 +3121,9 @@ func TestPostRunSensor_Running_InflightDrift(t *testing.T) { seedConfig(mock, cfg) seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusRunning) - // Baseline from a previous run. + // Baseline from a previous run (namespaced by rule key). 
seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) record := makeSensorRecord("gold-revenue", "audit-result", map[string]events.DynamoDBAttributeValue{ @@ -3075,7 +3133,7 @@ func TestPostRunSensor_Running_InflightDrift(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Should publish informational POST_RUN_DRIFT_INFLIGHT event (no rerun). @@ -3113,7 +3171,7 @@ func TestPostRunSensor_FailedFinal_Skipped(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No post-run events should be published for FAILED_FINAL trigger. @@ -3142,7 +3200,7 @@ func TestPostRunSensor_NoTrigger_Skipped(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No events published when no trigger exists. @@ -3166,7 +3224,7 @@ func TestPostRunSensor_NoPostRunConfig_GoesToTrigger(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No error, just silently ignored. 
} @@ -3269,7 +3327,7 @@ func TestJobFailure_AtomicLockReset_Success(t *testing.T) { record := makeJobRecordWithScheduleDate(pipeline, types.JobEventFail, schedule, date) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN must have started for the rerun. @@ -3303,7 +3361,7 @@ func TestJobFailure_LockResetFails_NoSFN(t *testing.T) { record := makeJobRecordWithScheduleDate(pipeline, types.JobEventFail, schedule, date) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3326,7 +3384,7 @@ func TestRerunRequest_AtomicLockReset(t *testing.T) { record := makeRerunRequestRecordFull("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // SFN must have started. @@ -3353,7 +3411,7 @@ func TestRerunRequest_LockResetFails_PublishesInfraFailure(t *testing.T) { record := makeRerunRequestRecordFull("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN execution. @@ -3395,9 +3453,10 @@ func TestJobFailure_SFNStartFails_ReleasesLock(t *testing.T) { record := makeJobRecordWithScheduleDate(pipeline, types.JobEventFail, schedule, date) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - // HandleStreamEvent swallows per-record errors — the handler returns nil. 
- err := lambda.HandleStreamEvent(context.Background(), d, event) + // Per-record error collected as batch failure. + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) // Trigger lock must be released after SFN failure (so next attempt can acquire it). assert.False(t, triggerLockExists(mock), @@ -3420,7 +3479,7 @@ func TestRerunRequest_SFNStartFails_ReleasesLock(t *testing.T) { record := makeRerunRequestRecordFull("manual") event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) assert.False(t, triggerLockExists(mock), @@ -3557,7 +3616,7 @@ func TestRerunRequest_CalendarExclusion(t *testing.T) { record := makeDefaultRerunRequestRecord() // schedule=stream, date=2026-03-01 event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3581,7 +3640,7 @@ func TestRerunRequest_CalendarExclusion_WritesJobEvent(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) mock.mu.Lock() @@ -3622,7 +3681,7 @@ func TestRerunRequest_WeekendExclusion(t *testing.T) { } event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3646,7 +3705,7 @@ func TestJobFailure_CalendarExclusion(t *testing.T) { record := 
makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3673,7 +3732,7 @@ func TestJobFailure_CalendarExclusion_RetryLimitBeatsExclusion(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3697,7 +3756,7 @@ func TestPostRunDrift_CalendarExclusion(t *testing.T) { seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) record := makeSensorRecord("gold-revenue", "audit-result", map[string]events.DynamoDBAttributeValue{ @@ -3707,7 +3766,7 @@ func TestPostRunDrift_CalendarExclusion(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) ebMock.mu.Lock() @@ -3747,7 +3806,7 @@ func TestPostRunDrift_NotExcluded_WritesRerun(t *testing.T) { seedTriggerWithStatus(mock, "gold-revenue", "2026-03-01", types.TriggerStatusCompleted) seedSensor(mock, "gold-revenue", "postrun-baseline#2026-03-01", map[string]interface{}{ - "sensor_count": float64(100), + "audit-result": map[string]interface{}{"sensor_count": float64(100)}, }) record := makeSensorRecord("gold-revenue", "audit-result", map[string]events.DynamoDBAttributeValue{ @@ -3757,7 +3816,7 
@@ func TestPostRunDrift_NotExcluded_WritesRerun(t *testing.T) { }), }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) rerunKey := ddbItemKey(testControlTable, types.PipelinePK("gold-revenue"), types.RerunRequestSK("stream", "2026-03-01")) @@ -3781,7 +3840,7 @@ func TestSensorEvent_CalendarExclusion_PublishesEvent(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3855,7 +3914,7 @@ func TestHandleSensorEvent_DryRun_WouldTrigger(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // NO SFN execution must be started. @@ -3895,7 +3954,7 @@ func TestHandleSensorEvent_DryRun_LateData(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // NO SFN execution. 
@@ -3933,7 +3992,7 @@ func TestHandleSensorEvent_DryRun_SLAProjection_Met(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -3988,7 +4047,7 @@ func TestHandleSensorEvent_DryRun_SLAProjection_Breach(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4042,7 +4101,7 @@ func TestHandleSensorEvent_DryRun_ValidationNotReady(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No SFN. 
@@ -4093,7 +4152,7 @@ func TestHandleSensorEvent_DryRun_CapturesBaseline(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4135,7 +4194,7 @@ func TestHandleSensorEvent_DryRun_Completed_NoSLA(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4188,7 +4247,7 @@ func TestHandleSensorEvent_DryRun_Completed_WithSLA(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) sfnMock.mu.Lock() @@ -4243,9 +4302,9 @@ func TestDryRunPostRunSensor_DriftDetected(t *testing.T) { // Pre-seed DRY_RUN# marker (would-trigger already happened). seedDryRunMarker(mock, "gold-revenue", "stream", fixedTestDate, "2026-03-11T01:15:00Z") - // Pre-seed baseline with sensor_count=500. + // Pre-seed baseline with sensor_count=500 (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#"+fixedTestDate, map[string]interface{}{ - "sensor_count": float64(500), + "audit-result": map[string]interface{}{"sensor_count": float64(500)}, }) // Sensor arrives for post-run key with sensor_count=520 (drift detected). 
@@ -4257,7 +4316,7 @@ func TestDryRunPostRunSensor_DriftDetected(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // DRY_RUN_DRIFT event published. @@ -4297,9 +4356,9 @@ func TestDryRunPostRunSensor_NoDrift(t *testing.T) { // Pre-seed DRY_RUN# marker. seedDryRunMarker(mock, "gold-revenue", "stream", fixedTestDate, "2026-03-11T01:15:00Z") - // Baseline with sensor_count=500. + // Baseline with sensor_count=500 (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#"+fixedTestDate, map[string]interface{}{ - "sensor_count": float64(500), + "audit-result": map[string]interface{}{"sensor_count": float64(500)}, }) // Sensor arrives with same sensor_count=500 — no drift. @@ -4311,7 +4370,7 @@ func TestDryRunPostRunSensor_NoDrift(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No events published (no drift). @@ -4345,7 +4404,7 @@ func TestDryRunPostRunSensor_NoMarker(t *testing.T) { }) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // No events published (no marker means no trigger happened). 
@@ -4359,7 +4418,7 @@ func TestDryRunPostRunSensor_NoMarker(t *testing.T) { func TestRerun_DryRun_SkipsExecution(t *testing.T) { mock := newMockDDB() - d, sfnMock, _ := testDeps(mock) + d, sfnMock, ebMock := testDeps(mock) cfg := testDryRunConfig() seedConfig(mock, cfg) @@ -4371,23 +4430,185 @@ func TestRerun_DryRun_SkipsExecution(t *testing.T) { record := makeDefaultRerunRequestRecord() event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Dry-run pipeline must NOT start an SFN execution. sfnMock.mu.Lock() - defer sfnMock.mu.Unlock() assert.Empty(t, sfnMock.executions, "dry-run pipeline must not start SFN on rerun request") + sfnMock.mu.Unlock() - // No rerun records written (guard fires before any store side effects). + // No rerun records written. count, countErr := d.Store.CountRerunsBySource(context.Background(), "gold-revenue", "stream", "2026-03-01", []string{"manual"}) require.NoError(t, countErr) assert.Zero(t, count, "dry-run must not write rerun records") + + // Must publish DRY_RUN_WOULD_RERUN with circuitBreaker and budget fields. 
+ evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunWouldRerun), "expected DRY_RUN_WOULD_RERUN event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunWouldRerun) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Contains(t, detail.Detail, "circuitBreaker") + assert.Contains(t, detail.Detail, "budget") + } + } + } +} + +func TestRerun_DryRun_CalendarExcluded(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Schedule.Exclude = &types.ExclusionConfig{Dates: []string{"2026-03-01"}} + seedConfig(mock, cfg) + + record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRerunRejected), "expected DRY_RUN_RERUN_REJECTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRerunRejected) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "excluded by calendar", detail.Detail["reason"]) + } + } + } +} + +func TestRerun_DryRun_LimitExceeded(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Job.MaxManualReruns = intPtr(0) + seedConfig(mock, cfg) + + record := makeDefaultRerunRequestRecord() + event := 
lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRerunRejected), "expected DRY_RUN_RERUN_REJECTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRerunRejected) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "limit exceeded", detail.Detail["reason"]) + } + } + } +} + +func TestRerun_DryRun_CircuitBreakerReject(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + seedConfig(mock, cfg) + + // Seed a successful job with a millis-epoch timestamp. + seedJobEvent(mock, "2000000000000", types.JobEventSuccess) + + // Seed sensors with timestamps OLDER than the job — data unchanged. 
+ seedSensor(mock, "gold-revenue", "upstream-complete", map[string]interface{}{ + "status": "ready", + "updatedAt": float64(1000000000000), + }) + + record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRerunRejected), "expected DRY_RUN_RERUN_REJECTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRerunRejected) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "circuit breaker", detail.Detail["reason"]) + assert.Equal(t, "rejected", detail.Detail["circuitBreaker"]) + } + } + } +} + +func TestRerun_DryRun_NoJobHistory(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + seedConfig(mock, cfg) + + // No JOB# events seeded — circuit breaker should report "skipped". 
+ record := makeDefaultRerunRequestRecord() + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunWouldRerun), "expected DRY_RUN_WOULD_RERUN event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunWouldRerun) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "skipped (no job history)", detail.Detail["circuitBreaker"]) + } + } + } } func TestJobFailure_DryRun_SkipsRerun(t *testing.T) { mock := newMockDDB() - d, sfnMock, _ := testDeps(mock) + d, sfnMock, ebMock := testDeps(mock) cfg := testDryRunConfig() cfg.Job.MaxRetries = 2 @@ -4398,16 +4619,120 @@ func TestJobFailure_DryRun_SkipsRerun(t *testing.T) { record := makeJobRecord("gold-revenue", types.JobEventFail) event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} - err := lambda.HandleStreamEvent(context.Background(), d, event) + _, err := lambda.HandleStreamEvent(context.Background(), d, event) require.NoError(t, err) // Dry-run pipeline must NOT start an SFN execution. sfnMock.mu.Lock() - defer sfnMock.mu.Unlock() assert.Empty(t, sfnMock.executions, "dry-run pipeline must not start SFN on job failure") + sfnMock.mu.Unlock() - // No rerun records written (guard fires before any store side effects). + // No rerun records written. 
count, countErr := d.Store.CountRerunsBySource(context.Background(), "gold-revenue", "stream", "2026-03-01", []string{"job-fail-retry"}) require.NoError(t, countErr) assert.Zero(t, count, "dry-run must not write rerun records on job failure") + + // Must publish DRY_RUN_WOULD_RETRY with retries and maxRetries fields. + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunWouldRetry), "expected DRY_RUN_WOULD_RETRY event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunWouldRetry) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Contains(t, detail.Detail, "retries") + assert.Contains(t, detail.Detail, "maxRetries") + } + } + } +} + +func TestJobFailure_DryRun_RetryExhausted(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Job.MaxRetries = 0 + seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") + + record := makeJobRecord("gold-revenue", types.JobEventFail) + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRetryExhausted), "expected DRY_RUN_RETRY_EXHAUSTED event") +} + +func TestJobFailure_DryRun_CalendarExcluded(t *testing.T) { + mock := newMockDDB() + d, sfnMock, ebMock := testDeps(mock) + + cfg := testDryRunConfig() + cfg.Job.MaxRetries = 2 + cfg.Schedule.Exclude = &types.ExclusionConfig{Dates: []string{"2026-03-01"}} + seedConfig(mock, cfg) + seedTriggerLock(mock, "gold-revenue", "2026-03-01") + + 
record := makeJobRecord("gold-revenue", types.JobEventFail) + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{record}} + + _, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + + sfnMock.mu.Lock() + assert.Empty(t, sfnMock.executions, "dry-run must not start SFN") + sfnMock.mu.Unlock() + + evtTypes := gatherEventDetailTypes(ebMock) + assert.Contains(t, evtTypes, string(types.EventDryRunRetryExhausted), "expected DRY_RUN_RETRY_EXHAUSTED event") + + ebMock.mu.Lock() + defer ebMock.mu.Unlock() + for _, input := range ebMock.events { + for _, e := range input.Entries { + if e.DetailType != nil && *e.DetailType == string(types.EventDryRunRetryExhausted) { + var detail types.InterlockEvent + require.NoError(t, json.Unmarshal([]byte(*e.Detail), &detail)) + assert.Equal(t, "excluded by calendar", detail.Detail["reason"]) + } + } + } +} + +// --------------------------------------------------------------------------- +// BatchItemFailures: partial error reporting +// --------------------------------------------------------------------------- + +func TestStreamRouter_BatchItemFailures_PartialError(t *testing.T) { + mock := newMockDDB() + d, _, _ := testDeps(mock) + + // Build an event with one valid record and one with empty PK (will error). 
+ validRecord := makeSensorRecord("gold-revenue", "upstream-complete", map[string]events.DynamoDBAttributeValue{ + "status": events.NewStringAttribute("ready"), + }) + + invalidRecord := events.DynamoDBEventRecord{ + EventID: "bad-record-123", + EventName: "INSERT", + Change: events.DynamoDBStreamRecord{ + Keys: map[string]events.DynamoDBAttributeValue{}, + }, + } + + event := lambda.StreamEvent{Records: []events.DynamoDBEventRecord{invalidRecord, validRecord}} + resp, err := lambda.HandleStreamEvent(context.Background(), d, event) + require.NoError(t, err) + require.Len(t, resp.BatchItemFailures, 1) + assert.Equal(t, "bad-record-123", resp.BatchItemFailures[0].ItemIdentifier) } diff --git a/internal/lambda/watchdog.go b/internal/lambda/watchdog.go index f6b65cf..4d0658c 100644 --- a/internal/lambda/watchdog.go +++ b/internal/lambda/watchdog.go @@ -1,18 +1,6 @@ package lambda -import ( - "context" - "errors" - "fmt" - "strconv" - "strings" - "time" - - schedulerTypes "github.com/aws/aws-sdk-go-v2/service/scheduler/types" - - "github.com/dwsmith1983/interlock/internal/validation" - "github.com/dwsmith1983/interlock/pkg/types" -) +import "context" // HandleWatchdog runs periodic health checks. It detects stale trigger // executions (Step Function timeouts) and missed cron schedules. Errors from @@ -44,1063 +32,3 @@ func HandleWatchdog(ctx context.Context, d *Deps) error { } return nil } - -// detectStaleTriggers scans for TRIGGER# rows with status=RUNNING and -// publishes an SFN_TIMEOUT event for any that have exceeded their TTL or the -// staleTriggerThreshold. Stale triggers are moved to FAILED_FINAL status. 
-func detectStaleTriggers(ctx context.Context, d *Deps) error { - triggers, err := d.Store.ScanRunningTriggers(ctx) - if err != nil { - return fmt.Errorf("scan running triggers: %w", err) - } - - now := d.now() - for _, tr := range triggers { - if !isStaleTrigger(tr, now) { - continue - } - - pipelineID, schedule, date, err := parseTriggerRecord(tr) - if err != nil { - d.Logger.Warn("skipping unparseable trigger", "pk", tr.PK, "sk", tr.SK, "error", err) - continue - } - - // Dry-run pipelines should never have TRIGGER# rows, but guard - // against stale rows from pre-dry-run migrations or bugs. - if cfg, cfgErr := d.ConfigCache.Get(ctx, pipelineID); cfgErr == nil && cfg != nil && cfg.DryRun { - continue - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "actionHint": "step function exceeded TTL — check SFN execution history", - } - if tr.TTL > 0 { - alertDetail["ttlExpired"] = time.Unix(tr.TTL, 0).UTC().Format(time.RFC3339) - } - if err := publishEvent(ctx, d, string(types.EventSFNTimeout), pipelineID, schedule, date, - fmt.Sprintf("step function timed out for %s/%s/%s", pipelineID, schedule, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish SFN timeout event", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date) - } - - if err := d.Store.SetTriggerStatus(ctx, pipelineID, schedule, date, types.TriggerStatusFailedFinal); err != nil { - d.Logger.Error("failed to set trigger status to FAILED_FINAL", - "pipelineId", pipelineID, "schedule", schedule, "date", date, "error", err) - continue - } - - d.Logger.Info("detected stale trigger", - "pipelineId", pipelineID, - "schedule", schedule, - "date", date, - ) - } - return nil -} - -// isStaleTrigger returns true if the trigger's TTL has expired or if the TTL -// is zero and the trigger has been running longer than staleTriggerThreshold. 
-func isStaleTrigger(tr types.ControlRecord, now time.Time) bool { - if tr.TTL > 0 { - return now.Unix() > tr.TTL - } - // No TTL set — treat as stale if it has existed for longer than the threshold. - // Without a creation timestamp we can't be precise, so we conservatively - // consider it stale only when TTL is explicitly expired. - return false -} - -// parseTriggerRecord extracts pipeline ID, schedule, and date from a trigger -// ControlRecord's PK and SK. -// PK format: PIPELINE# -// SK format: TRIGGER## -func parseTriggerRecord(tr types.ControlRecord) (pipelineID, schedule, date string, err error) { - const pkPrefix = "PIPELINE#" - if !strings.HasPrefix(tr.PK, pkPrefix) { - return "", "", "", fmt.Errorf("unexpected PK format: %q", tr.PK) - } - pipelineID = tr.PK[len(pkPrefix):] - - const skPrefix = "TRIGGER#" - trimmed := strings.TrimPrefix(tr.SK, skPrefix) - if trimmed == tr.SK { - return "", "", "", fmt.Errorf("unexpected SK format: %q", tr.SK) - } - parts := strings.SplitN(trimmed, "#", 2) - if len(parts) != 2 { - return "", "", "", fmt.Errorf("invalid TRIGGER SK format: %q", tr.SK) - } - return pipelineID, parts[0], parts[1], nil -} - -// reconcileSensorTriggers re-evaluates trigger conditions for sensor-triggered -// pipelines. If a sensor meets the trigger condition but no trigger lock exists, -// the watchdog acquires the lock, starts the SFN, and publishes TRIGGER_RECOVERED. -// This self-heals missed triggers caused by silent completion-write failures. -func reconcileSensorTriggers(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - trigger := cfg.Schedule.Trigger - if trigger == nil || cfg.Schedule.Cron != "" { - continue - } - - // Dry-run pipelines are observation-only — skip reconciliation. 
- if cfg.DryRun { - continue - } - - if isExcluded(cfg, now) { - continue - } - - sensors, err := d.Store.GetAllSensors(ctx, id) - if err != nil { - d.Logger.Error("failed to get sensors for reconciliation", - "pipelineId", id, "error", err) - continue - } - - scheduleID := resolveScheduleID(cfg) - - for sensorKey, sensorData := range sensors { - if !strings.HasPrefix(sensorKey, trigger.Key) { - continue - } - - rule := types.ValidationRule{ - Key: trigger.Key, - Check: trigger.Check, - Field: trigger.Field, - Value: trigger.Value, - } - result := validation.EvaluateRule(rule, sensorData, now) - if !result.Passed { - continue - } - - date := ResolveExecutionDate(sensorData, now) - - found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) - if err != nil { - d.Logger.Error("trigger check failed during reconciliation", - "pipelineId", id, "date", date, "error", err) - continue - } - if found { - continue - } - - // Guard against re-triggering completed pipelines whose trigger - // record was deleted by DynamoDB TTL. Check the joblog for a - // terminal event before acquiring a new lock. 
- if isJobTerminal(ctx, d, id, scheduleID, date) { - continue - } - - acquired, err := d.Store.AcquireTriggerLock(ctx, id, scheduleID, date, ResolveTriggerLockTTL()) - if err != nil { - d.Logger.Error("lock acquisition failed during reconciliation", - "pipelineId", id, "date", date, "error", err) - continue - } - if !acquired { - continue - } - - if err := startSFN(ctx, d, cfg, id, scheduleID, date); err != nil { - if relErr := d.Store.ReleaseTriggerLock(ctx, id, scheduleID, date); relErr != nil { - d.Logger.Warn("failed to release lock after SFN start failure during reconciliation", "error", relErr) - } - d.Logger.Error("SFN start failed during reconciliation", - "pipelineId", id, "date", date, "error", err) - continue - } - - alertDetail := map[string]interface{}{ - "source": "reconciliation", - "actionHint": "watchdog recovered missed sensor trigger", - } - if err := publishEvent(ctx, d, string(types.EventTriggerRecovered), id, scheduleID, date, - fmt.Sprintf("trigger recovered for %s/%s/%s", id, scheduleID, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish trigger recovered event", "error", err, "pipeline", id, "schedule", scheduleID, "date", date) - } - - d.Logger.Info("recovered missed trigger", - "pipelineId", id, - "schedule", scheduleID, - "date", date, - ) - } - } - return nil -} - -// lastCronFire returns the most recent expected fire time for a cron expression. -// Supports the minute-hour patterns used by this system: "MM * * * *" (hourly) -// and "MM HH * * *" (daily). Returns zero time for unsupported patterns. -func lastCronFire(cron string, now time.Time, loc *time.Location) time.Time { - fields := strings.Fields(cron) - if len(fields) < 5 { - return time.Time{} - } - minute, err := strconv.Atoi(fields[0]) - if err != nil { - return time.Time{} - } - localNow := now.In(loc) - - if fields[1] == "*" { - // Hourly: fires at :MM every hour. 
- candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), - localNow.Hour(), minute, 0, 0, loc) - if candidate.After(localNow) { - candidate = candidate.Add(-time.Hour) - } - return candidate - } - - hour, err := strconv.Atoi(fields[1]) - if err != nil { - return time.Time{} - } - // Daily: fires at HH:MM every day. - candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), - hour, minute, 0, 0, loc) - if candidate.After(localNow) { - candidate = candidate.Add(-24 * time.Hour) - } - return candidate -} - -// detectMissedSchedules checks all cron-scheduled pipelines to see if today's -// trigger is missing. If a pipeline should have started by now but has no -// TRIGGER# row, a SCHEDULE_MISSED event is published. -func detectMissedSchedules(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - today := now.Format("2006-01-02") - - for id, cfg := range configs { - // Only check cron-scheduled pipelines. - if cfg.Schedule.Cron == "" { - continue - } - - // Dry-run pipelines are observation-only — skip missed schedule detection. - if cfg.DryRun { - continue - } - - // Skip calendar-excluded days. - if isExcluded(cfg, now) { - continue - } - - // Only alert for schedules that should have fired after this Lambda - // started. Prevents retroactive alerts after fresh deploys. - if !d.StartedAt.IsZero() { - loc := resolveTimezone(cfg.Schedule.Timezone) - if lastFire := lastCronFire(cfg.Schedule.Cron, now, loc); !lastFire.IsZero() && lastFire.Before(d.StartedAt) { - continue - } - } - - // Resolve schedule ID for cron pipelines. - scheduleID := resolveScheduleID(cfg) - - // Check if any TRIGGER# row exists for today (covers both daily - // and per-hour trigger rows, e.g. "2026-03-04" and "2026-03-04T00"). 
- found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, today) - if err != nil { - d.Logger.Error("failed to check trigger for missed schedule", - "pipelineId", id, "error", err) - continue - } - if found { - continue - } - - // Check if we are past the expected start time. If the pipeline - // has a schedule time configured, only alert after that time. - if cfg.Schedule.Time != "" { - loc := resolveTimezone(cfg.Schedule.Timezone) - localNow := now.In(loc) - expectedStart, err := time.ParseInLocation("2006-01-02 15:04", today+" "+cfg.Schedule.Time, loc) - if err == nil && localNow.Before(expectedStart) { - continue // not yet past expected start time - } - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "cron": cfg.Schedule.Cron, - "actionHint": fmt.Sprintf("cron %s expected to fire — no trigger found", cfg.Schedule.Cron), - } - if cfg.Schedule.Time != "" { - alertDetail["expectedTime"] = cfg.Schedule.Time - } - if err := publishEvent(ctx, d, string(types.EventScheduleMissed), id, scheduleID, today, - fmt.Sprintf("missed schedule for %s on %s", id, today), alertDetail); err != nil { - d.Logger.Warn("failed to publish missed schedule event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) - } - - d.Logger.Info("detected missed schedule", - "pipelineId", id, - "schedule", scheduleID, - "date", today, - ) - } - return nil -} - -// detectMissedInclusionSchedules checks pipelines with inclusion calendar config -// for missed schedules on irregular dates. For each pipeline with an Include -// config, it finds all past inclusion dates (capped at maxInclusionLookback) -// and verifies that a trigger exists for each. If no trigger is found and no -// dedup marker exists, an IRREGULAR_SCHEDULE_MISSED event is published. 
-func detectMissedInclusionSchedules(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - if cfg.Schedule.Include == nil || len(cfg.Schedule.Include.Dates) == 0 { - continue - } - - // Dry-run pipelines are observation-only — skip inclusion schedule detection. - if cfg.DryRun { - continue - } - - // Skip calendar-excluded days. - if isExcluded(cfg, now) { - continue - } - - pastDates := PastInclusionDates(cfg.Schedule.Include.Dates, now) - if len(pastDates) == 0 { - continue - } - - scheduleID := resolveScheduleID(cfg) - - // Resolve today in the pipeline's timezone so the grace-period - // guard fires correctly when UTC date != pipeline-local date. - tzLoc := resolveTimezone(cfg.Schedule.Timezone) - today := now.In(tzLoc).Format("2006-01-02") - - for _, date := range pastDates { - // If the inclusion date is today and the pipeline has a - // Schedule.Time, only alert after that time has passed. - // This mirrors the same check in detectMissedSchedules for - // cron pipelines to avoid false-positive alerts before the - // expected start time. Past dates are not gated because - // their Schedule.Time has necessarily already elapsed. - if cfg.Schedule.Time != "" && date == today { - localNow := now.In(tzLoc) - expectedStart, err := time.ParseInLocation("2006-01-02 15:04", date+" "+cfg.Schedule.Time, tzLoc) - if err == nil && localNow.Before(expectedStart) { - continue // not yet past expected start time - } - } - - // Check if a trigger exists for this inclusion date. - found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) - if err != nil { - d.Logger.Error("failed to check trigger for inclusion schedule", - "pipelineId", id, "date", date, "error", err) - continue - } - if found { - continue - } - - // Check dedup marker to avoid re-alerting on subsequent watchdog runs. 
- dedupKey := "irregular-missed-check#" + date - dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) - if err != nil { - d.Logger.Error("dedup marker lookup failed for inclusion schedule", - "pipelineId", id, "date", date, "error", err) - continue - } - if dedupData != nil { - continue - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "actionHint": fmt.Sprintf("inclusion date %s expected to have a trigger — none found", date), - } - if err := publishEvent(ctx, d, string(types.EventIrregularScheduleMissed), id, scheduleID, date, - fmt.Sprintf("missed inclusion schedule for %s on %s", id, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish irregular schedule missed event", "error", err, "pipeline", id, "date", date) - } - - // Write dedup marker. - if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ - "alerted": "true", - }); err != nil { - d.Logger.Warn("failed to write inclusion dedup marker", "error", err, "pipeline", id, "date", date) - } - - d.Logger.Info("detected missed inclusion schedule", - "pipelineId", id, - "schedule", scheduleID, - "date", date, - ) - } - } - return nil -} - -// scheduleSLAAlerts proactively creates EventBridge Scheduler entries for all -// pipelines with SLA configs. This ensures warnings/breaches fire even when -// pipelines never trigger (data never arrives, sensor fails, etc.). -// Idempotency: deterministic scheduler names; ConflictException = already exists. -func scheduleSLAAlerts(ctx context.Context, d *Deps) error { - if d.Scheduler == nil { - return nil - } - - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - if cfg.SLA == nil { - continue - } - - // Dry-run pipelines are observation-only — skip SLA scheduling. 
- if cfg.DryRun { - continue - } - - if isExcluded(cfg, now) { - continue - } - - scheduleID := resolveScheduleID(cfg) - date := resolveWatchdogSLADate(cfg, now) - - // Sensor-triggered daily pipelines run T+1: data for today completes - // tomorrow, so the SLA deadline is relative to tomorrow's date. - // Only slaDate is shifted; the original date is kept for schedule - // naming, trigger lookup, and fire-alert payload so cancellation - // stays consistent with the SFN's view of the pipeline. - slaDate := date - if cfg.Schedule.Cron == "" && !strings.HasPrefix(cfg.SLA.Deadline, ":") { - t, err := time.Parse("2006-01-02", date) - if err == nil { - slaDate = t.AddDate(0, 0, 1).Format("2006-01-02") - } - } - - // Skip if pipeline already completed or permanently failed for this date. - tr, err := d.Store.GetTrigger(ctx, id, scheduleID, date) - switch { - case err != nil: - d.Logger.Warn("trigger lookup failed in SLA scheduling", "pipelineId", id, "error", err) - continue - case tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal): - continue - case isJobTerminal(ctx, d, id, scheduleID, date): - continue - } - - calc, err := handleSLACalculate(SLAMonitorInput{ - Mode: "calculate", - PipelineID: id, - ScheduleID: scheduleID, - Date: slaDate, - Deadline: cfg.SLA.Deadline, - ExpectedDuration: cfg.SLA.ExpectedDuration, - Timezone: cfg.SLA.Timezone, - }, now) - if err != nil { - d.Logger.Error("SLA calculate failed", "pipelineId", id, "error", err) - continue - } - - breachAt, _ := time.Parse(time.RFC3339, calc.BreachAt) - if breachAt.IsZero() || breachAt.After(now) { - // SLA breach is in the future — create schedules. 
- var scheduleErr bool - for _, alert := range []struct { - suffix string - alertType string - timestamp string - }{ - {"warning", "SLA_WARNING", calc.WarningAt}, - {"breach", "SLA_BREACH", calc.BreachAt}, - } { - name := slaScheduleName(id, scheduleID, date, alert.suffix) - payload := SLAMonitorInput{ - Mode: "fire-alert", - PipelineID: id, - ScheduleID: scheduleID, - Date: date, - AlertType: alert.alertType, - } - if alert.alertType == "SLA_WARNING" { - payload.BreachAt = calc.BreachAt - } - if err := createOneTimeSchedule(ctx, d, name, alert.timestamp, payload); err != nil { - var conflict *schedulerTypes.ConflictException - if errors.As(err, &conflict) { - continue - } - d.Logger.Error("create SLA schedule failed", - "pipelineId", id, "suffix", alert.suffix, "error", err) - scheduleErr = true - } - } - - if !scheduleErr { - d.Logger.Info("proactive SLA schedules ensured", - "pipelineId", id, - "date", date, - "warningAt", calc.WarningAt, - "breachAt", calc.BreachAt, - ) - } - } - } - return nil -} - -// checkTriggerDeadlines evaluates trigger deadlines independently of SLA -// configuration. Pipelines with a Trigger.Deadline but no SLA config are -// checked here. For each pipeline, if the trigger deadline has passed and -// no trigger exists, the sensor trigger window is closed. -func checkTriggerDeadlines(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - - for id, cfg := range configs { - if cfg.Schedule.Trigger == nil || cfg.Schedule.Trigger.Deadline == "" { - continue - } - - // Dry-run pipelines are observation-only — skip trigger deadline checks. 
- if cfg.DryRun { - continue - } - - if isExcluded(cfg, now) { - continue - } - - scheduleID := resolveScheduleID(cfg) - triggerDate := resolveTriggerDeadlineDate(cfg, now) - - triggerRec, err := d.Store.GetTrigger(ctx, id, scheduleID, triggerDate) - if err != nil { - d.Logger.Warn("trigger lookup failed in deadline check", "pipelineId", id, "error", err) - continue - } - if triggerRec != nil { - continue - } - - if isJobTerminal(ctx, d, id, scheduleID, triggerDate) { - continue - } - - closeSensorTriggerWindow(ctx, d, id, scheduleID, triggerDate, cfg, now) - } - return nil -} - -// resolveWatchdogSLADate determines the execution date for SLA scheduling. -// - Hourly pipelines (relative deadline like ":30"): previous hour composite -// date, e.g. "2026-03-05T13" when the clock is 14:xx. -// - Daily pipelines (absolute deadline like "02:00"): today's date, -// so handleSLACalculate rolls the deadline forward to the next occurrence. -func resolveWatchdogSLADate(cfg *types.PipelineConfig, now time.Time) string { - if strings.HasPrefix(cfg.SLA.Deadline, ":") { - prev := now.Add(-time.Hour) - return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) - } - return now.Format("2006-01-02") -} - -// resolveTriggerDeadlineDate determines the execution date for trigger -// deadline evaluation. Uses the trigger deadline format (not SLA deadline) -// to decide between hourly composite date and daily date. -func resolveTriggerDeadlineDate(cfg *types.PipelineConfig, now time.Time) string { - if strings.HasPrefix(cfg.Schedule.Trigger.Deadline, ":") { - prev := now.Add(-time.Hour) - return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) - } - return now.Format("2006-01-02") -} - -// resolveTriggerDeadlineTime computes the absolute time when the trigger -// window closes for the given deadline string and execution date. 
-// -// For relative (hourly) deadlines like ":45" with composite date "2026-03-09T13": -// - Data for hour 13 is processed in hour 14 -// - The deadline resolves to 2026-03-09T14:45:00 in the configured timezone -// -// For absolute (daily) deadlines like "09:00" with date "2026-03-09": -// - The deadline resolves to 2026-03-09T09:00:00 in the configured timezone -// -// Unlike handleSLACalculate, this does NOT roll forward when the time is past. -// Returns zero time on parse errors. -func resolveTriggerDeadlineTime(deadline, date, timezone string) time.Time { - loc := resolveTimezone(timezone) - - if strings.HasPrefix(deadline, ":") { - // Relative (hourly): ":MM" — deadline is in the NEXT hour after the - // composite date's hour, since data for hour H is processed in hour H+1. - minute, err := strconv.Atoi(strings.TrimPrefix(deadline, ":")) - if err != nil { - return time.Time{} - } - // Parse composite date "YYYY-MM-DDThh". - if len(date) < 13 || date[10] != 'T' { - return time.Time{} - } - t, err := time.ParseInLocation("2006-01-02T15", date, loc) - if err != nil { - return time.Time{} - } - // Add 1 hour for the processing window, then set the minute. - return time.Date(t.Year(), t.Month(), t.Day(), t.Hour()+1, minute, 0, 0, loc) - } - - // Absolute (daily): "HH:MM". - parts := strings.SplitN(deadline, ":", 2) - if len(parts) != 2 { - return time.Time{} - } - hour, err := strconv.Atoi(parts[0]) - if err != nil { - return time.Time{} - } - minute, err := strconv.Atoi(parts[1]) - if err != nil { - return time.Time{} - } - t, err := time.ParseInLocation("2006-01-02", date, loc) - if err != nil { - return time.Time{} - } - return time.Date(t.Year(), t.Month(), t.Day(), hour, minute, 0, 0, loc) -} - -// closeSensorTriggerWindow checks whether the trigger deadline has passed for -// a sensor-triggered pipeline that never started. 
If expired, it writes a -// FAILED_FINAL trigger record (blocking future auto-triggers) and publishes -// a SENSOR_DEADLINE_EXPIRED event. A human can still restart via RERUN_REQUEST. -func closeSensorTriggerWindow(ctx context.Context, d *Deps, pipelineID, scheduleID, date string, cfg *types.PipelineConfig, now time.Time) { - // Compute the absolute trigger deadline time directly — we do NOT use - // handleSLACalculate here because it rolls daily deadlines forward 24h - // when past, which defeats the purpose of checking for expiry. - tz := "" - if cfg.SLA != nil { - tz = cfg.SLA.Timezone - } - triggerDeadline := resolveTriggerDeadlineTime(cfg.Schedule.Trigger.Deadline, date, tz) - if triggerDeadline.IsZero() || triggerDeadline.After(now) { - return - } - - // Use conditional put to avoid overwriting a trigger that was acquired - // between the GetTrigger read and this write (TOCTOU protection). - created, err := d.Store.CreateTriggerIfAbsent(ctx, pipelineID, scheduleID, date, types.TriggerStatusFailedFinal) - if err != nil { - d.Logger.Error("failed to write FAILED_FINAL for expired trigger deadline", - "pipelineId", pipelineID, "schedule", scheduleID, "date", date, "error", err) - return - } - if !created { - // Trigger row appeared since the read — pipeline started, don't interfere. 
- d.Logger.Info("trigger appeared during deadline check, skipping window close", - "pipelineId", pipelineID, "schedule", scheduleID, "date", date) - return - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "triggerDeadline": cfg.Schedule.Trigger.Deadline, - "actionHint": "auto-trigger window closed — use RERUN_REQUEST to restart", - } - if err := publishEvent(ctx, d, string(types.EventSensorDeadlineExpired), pipelineID, scheduleID, date, - fmt.Sprintf("trigger deadline expired for %s/%s/%s", pipelineID, scheduleID, date), alertDetail); err != nil { - d.Logger.Warn("failed to publish sensor deadline expired event", "error", err, "pipeline", pipelineID) - } - - d.Logger.Info("sensor trigger window closed", - "pipelineId", pipelineID, - "schedule", scheduleID, - "date", date, - "triggerDeadline", cfg.Schedule.Trigger.Deadline, - ) -} - -// defaultSensorTimeout is the default grace period for post-run sensors to -// arrive after a pipeline completes. If no SensorTimeout is configured in -// PostRunConfig, this value is used. -const defaultSensorTimeout = 2 * time.Hour - -// detectMissingPostRunSensors checks pipelines with PostRun config for missing -// post-run sensor data. If a pipeline completed (COMPLETED trigger + baseline -// exists) but no post-run sensor matching a rule key has been updated since -// completion, and the SensorTimeout grace period has elapsed, a -// POST_RUN_SENSOR_MISSING event is published. -func detectMissingPostRunSensors(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - today := now.Format("2006-01-02") - - for id, cfg := range configs { - if cfg.PostRun == nil || len(cfg.PostRun.Rules) == 0 { - continue - } - - // Dry-run pipelines are observation-only — skip post-run sensor checks. 
- if cfg.DryRun { - continue - } - - scheduleID := resolveScheduleID(cfg) - - // Only check pipelines with a COMPLETED trigger for today. - tr, err := d.Store.GetTrigger(ctx, id, scheduleID, today) - if err != nil { - d.Logger.Error("trigger lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - if tr == nil || tr.Status != types.TriggerStatusCompleted { - continue - } - - // Baseline must exist — it signals that capturePostRunBaseline ran - // at completion time. - baselineKey := "postrun-baseline#" + today - baseline, err := d.Store.GetSensorData(ctx, id, baselineKey) - if err != nil { - d.Logger.Error("baseline lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - if baseline == nil { - continue - } - - // Dedup: skip if we already published an alert for this date. - dedupKey := "postrun-check#" + today - dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) - if err != nil { - d.Logger.Error("dedup marker lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - if dedupData != nil { - continue - } - - // Determine the completion timestamp from the latest success job event. - completionTime, err := resolveCompletionTime(ctx, d, id, scheduleID, today) - if err != nil { - d.Logger.Error("completion time resolution failed", - "pipelineId", id, "error", err) - continue - } - if completionTime.IsZero() { - continue - } - - // Parse SensorTimeout from config (default 2h). - timeout := parseSensorTimeout(cfg.PostRun.SensorTimeout) - - // Check if the timeout has elapsed since completion. - if now.Before(completionTime.Add(timeout)) { - continue - } - - // Check if any post-run rule sensor has been updated since completion. 
- sensors, err := d.Store.GetAllSensors(ctx, id) - if err != nil { - d.Logger.Error("sensor lookup failed in post-run sensor check", - "pipelineId", id, "error", err) - continue - } - - if hasPostRunSensorUpdate(cfg.PostRun.Rules, sensors, completionTime) { - continue - } - - // No post-run sensor has arrived within the grace period — publish event. - ruleKeys := make([]string, 0, len(cfg.PostRun.Rules)) - for _, r := range cfg.PostRun.Rules { - ruleKeys = append(ruleKeys, r.Key) - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "sensorTimeout": cfg.PostRun.SensorTimeout, - "ruleKeys": strings.Join(ruleKeys, ", "), - "actionHint": "post-run sensor data has not arrived within the expected timeout", - } - if err := publishEvent(ctx, d, string(types.EventPostRunSensorMissing), id, scheduleID, today, - fmt.Sprintf("post-run sensor missing for %s on %s", id, today), alertDetail); err != nil { - d.Logger.Warn("failed to publish post-run sensor missing event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) - } - - // Write dedup marker to avoid re-alerting on subsequent watchdog runs. - if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ - "alerted": "true", - }); err != nil { - d.Logger.Warn("failed to write post-run dedup marker", "error", err, "pipeline", id, "date", today) - } - - d.Logger.Info("detected missing post-run sensor", - "pipelineId", id, - "schedule", scheduleID, - "date", today, - ) - } - return nil -} - -// resolveCompletionTime extracts the completion timestamp from the latest -// success job event for the given pipeline/schedule/date. The job event SK -// has the format JOB### where timestamp is -// milliseconds since epoch. 
-func resolveCompletionTime(ctx context.Context, d *Deps, pipelineID, scheduleID, date string) (time.Time, error) { - rec, err := d.Store.GetLatestJobEvent(ctx, pipelineID, scheduleID, date) - if err != nil { - return time.Time{}, fmt.Errorf("get latest job event: %w", err) - } - if rec == nil { - return time.Time{}, nil - } - if rec.Event != types.JobEventSuccess { - return time.Time{}, nil - } - - // Extract timestamp from SK: JOB### - parts := strings.Split(rec.SK, "#") - if len(parts) < 4 { - return time.Time{}, fmt.Errorf("unexpected job SK format: %q", rec.SK) - } - tsMillis, err := strconv.ParseInt(parts[len(parts)-1], 10, 64) - if err != nil { - return time.Time{}, fmt.Errorf("parse job timestamp %q: %w", parts[len(parts)-1], err) - } - return time.UnixMilli(tsMillis), nil -} - -// parseSensorTimeout parses a duration string from PostRunConfig.SensorTimeout. -// Returns defaultSensorTimeout (2h) if the string is empty or unparseable. -func parseSensorTimeout(s string) time.Duration { - if s == "" { - return defaultSensorTimeout - } - d, err := time.ParseDuration(s) - if err != nil { - return defaultSensorTimeout - } - return d -} - -// hasPostRunSensorUpdate checks whether any sensor matching a PostRun rule key -// has an updatedAt timestamp newer than the given completion time. -func hasPostRunSensorUpdate(rules []types.ValidationRule, sensors map[string]map[string]interface{}, completionTime time.Time) bool { - completionMillis := completionTime.UnixMilli() - - for _, rule := range rules { - data, ok := sensors[rule.Key] - if !ok { - continue - } - - updatedAt, ok := data["updatedAt"] - if !ok { - continue - } - - var ts int64 - switch v := updatedAt.(type) { - case float64: - ts = int64(v) - case int64: - ts = v - case string: - ts, _ = strconv.ParseInt(v, 10, 64) - default: - continue - } - - if ts > completionMillis { - return true - } - } - return false -} - -// detectRelativeSLABreaches checks pipelines with MaxDuration SLA config for -// breaches. 
This is a defense-in-depth fallback: if the EventBridge Scheduler -// fails to fire the relative SLA breach alert, the watchdog catches it. -// -// Both today and yesterday are checked because stream_router writes the -// first-sensor-arrival key using ResolveExecutionDate(), which for T+1 -// sensor-triggered pipelines produces yesterday's date. Checking both dates -// covers the cross-day boundary. -func detectRelativeSLABreaches(ctx context.Context, d *Deps) error { - configs, err := d.ConfigCache.GetAll(ctx) - if err != nil { - return fmt.Errorf("load configs: %w", err) - } - - now := d.now() - datesToCheck := []string{ - now.Format("2006-01-02"), - now.AddDate(0, 0, -1).Format("2006-01-02"), - } - - for id, cfg := range configs { - if cfg.SLA == nil || cfg.SLA.MaxDuration == "" { - continue - } - - // Dry-run pipelines are observation-only — skip relative SLA checks. - if cfg.DryRun { - continue - } - - maxDur, err := time.ParseDuration(cfg.SLA.MaxDuration) - if err != nil { - d.Logger.Warn("invalid maxDuration in SLA config", - "pipelineId", id, "maxDuration", cfg.SLA.MaxDuration, "error", err) - continue - } - - scheduleID := resolveScheduleID(cfg) - - for _, checkDate := range datesToCheck { - checkRelativeSLAForDate(ctx, d, id, cfg, scheduleID, checkDate, maxDur, now) - } - } - return nil -} - -// checkRelativeSLAForDate checks a single date for a relative SLA breach on -// the given pipeline. It looks up the first-sensor-arrival marker, verifies -// the breach window has elapsed, and publishes an alert if needed. 
-func checkRelativeSLAForDate(ctx context.Context, d *Deps, id string, cfg *types.PipelineConfig, scheduleID, checkDate string, maxDur time.Duration, now time.Time) { - arrivalKey := "first-sensor-arrival#" + checkDate - arrivalData, err := d.Store.GetSensorData(ctx, id, arrivalKey) - if err != nil { - d.Logger.Error("first-sensor-arrival lookup failed", - "pipelineId", id, "date", checkDate, "error", err) - return - } - if arrivalData == nil { - return - } - - arrivedAtStr, ok := arrivalData["arrivedAt"].(string) - if !ok || arrivedAtStr == "" { - return - } - arrivedAt, err := time.Parse(time.RFC3339, arrivedAtStr) - if err != nil { - d.Logger.Warn("invalid arrivedAt in first-sensor-arrival", - "pipelineId", id, "arrivedAt", arrivedAtStr, "error", err) - return - } - - // Check if the relative SLA has been breached. - breachAt := arrivedAt.Add(maxDur) - if now.Before(breachAt) { - return - } - - // Skip if pipeline already completed or permanently failed. - tr, err := d.Store.GetTrigger(ctx, id, scheduleID, checkDate) - if err != nil { - d.Logger.Warn("trigger lookup failed in relative SLA check", - "pipelineId", id, "date", checkDate, "error", err) - return - } - if tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal) { - return - } - if isJobTerminal(ctx, d, id, scheduleID, checkDate) { - return - } - - // Check dedup marker to avoid re-alerting on subsequent watchdog runs. - // The dedup key includes checkDate to avoid cross-date collisions. 
- dedupKey := "relative-sla-breach-check#" + checkDate - dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) - if err != nil { - d.Logger.Error("dedup marker lookup failed for relative SLA breach", - "pipelineId", id, "date", checkDate, "error", err) - return - } - if dedupData != nil { - return - } - - alertDetail := map[string]interface{}{ - "source": "watchdog", - "maxDuration": cfg.SLA.MaxDuration, - "sensorArrivalAt": arrivedAtStr, - "breachAt": breachAt.UTC().Format(time.RFC3339), - "actionHint": "relative SLA breached — pipeline has exceeded maxDuration since first sensor arrival", - } - if err := publishEvent(ctx, d, string(types.EventRelativeSLABreach), id, scheduleID, checkDate, - fmt.Sprintf("relative SLA breach for %s on %s", id, checkDate), alertDetail); err != nil { - d.Logger.Warn("failed to publish relative SLA breach event", - "error", err, "pipeline", id, "date", checkDate) - } - - // Write dedup marker. - if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ - "alerted": "true", - }); err != nil { - d.Logger.Warn("failed to write relative SLA breach dedup marker", - "error", err, "pipeline", id, "date", checkDate) - } - - d.Logger.Info("detected relative SLA breach", - "pipelineId", id, - "schedule", scheduleID, - "date", checkDate, - "sensorArrivalAt", arrivedAtStr, - "breachAt", breachAt.UTC().Format(time.RFC3339), - ) -} diff --git a/internal/lambda/watchdog_missed.go b/internal/lambda/watchdog_missed.go new file mode 100644 index 0000000..cd94130 --- /dev/null +++ b/internal/lambda/watchdog_missed.go @@ -0,0 +1,237 @@ +package lambda + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// lastCronFire returns the most recent expected fire time for a cron expression. +// Supports the minute-hour patterns used by this system: "MM * * * *" (hourly) +// and "MM HH * * *" (daily). Returns zero time for unsupported patterns. 
+func lastCronFire(cron string, now time.Time, loc *time.Location) time.Time { + fields := strings.Fields(cron) + if len(fields) < 5 { + return time.Time{} + } + minute, err := strconv.Atoi(fields[0]) + if err != nil { + return time.Time{} + } + localNow := now.In(loc) + + if fields[1] == "*" { + // Hourly: fires at :MM every hour. + candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), + localNow.Hour(), minute, 0, 0, loc) + if candidate.After(localNow) { + candidate = candidate.Add(-time.Hour) + } + return candidate + } + + hour, err := strconv.Atoi(fields[1]) + if err != nil { + return time.Time{} + } + // Daily: fires at HH:MM every day. + candidate := time.Date(localNow.Year(), localNow.Month(), localNow.Day(), + hour, minute, 0, 0, loc) + if candidate.After(localNow) { + candidate = candidate.Add(-24 * time.Hour) + } + return candidate +} + +// detectMissedSchedules checks all cron-scheduled pipelines to see if today's +// trigger is missing. If a pipeline should have started by now but has no +// TRIGGER# row, a SCHEDULE_MISSED event is published. +func detectMissedSchedules(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + today := now.Format("2006-01-02") + + for id, cfg := range configs { + // Only check cron-scheduled pipelines. + if cfg.Schedule.Cron == "" { + continue + } + + // Dry-run pipelines are observation-only — skip missed schedule detection. + if cfg.DryRun { + continue + } + + // Skip calendar-excluded days. + if isExcluded(cfg, now) { + continue + } + + // Only alert for schedules that should have fired after this Lambda + // started. Prevents retroactive alerts after fresh deploys. 
+ if !d.StartedAt.IsZero() { + loc := resolveTimezone(cfg.Schedule.Timezone) + if lastFire := lastCronFire(cfg.Schedule.Cron, now, loc); !lastFire.IsZero() && lastFire.Before(d.StartedAt) { + continue + } + } + + // Resolve schedule ID for cron pipelines. + scheduleID := resolveScheduleID(cfg) + + // Check if any TRIGGER# row exists for today (covers both daily + // and per-hour trigger rows, e.g. "2026-03-04" and "2026-03-04T00"). + found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, today) + if err != nil { + d.Logger.Error("failed to check trigger for missed schedule", + "pipelineId", id, "error", err) + continue + } + if found { + continue + } + + // Check if we are past the expected start time. If the pipeline + // has a schedule time configured, only alert after that time. + if cfg.Schedule.Time != "" { + loc := resolveTimezone(cfg.Schedule.Timezone) + localNow := now.In(loc) + expectedStart, err := time.ParseInLocation("2006-01-02 15:04", today+" "+cfg.Schedule.Time, loc) + if err == nil && localNow.Before(expectedStart) { + continue // not yet past expected start time + } + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "cron": cfg.Schedule.Cron, + "actionHint": fmt.Sprintf("cron %s expected to fire — no trigger found", cfg.Schedule.Cron), + } + if cfg.Schedule.Time != "" { + alertDetail["expectedTime"] = cfg.Schedule.Time + } + if err := publishEvent(ctx, d, string(types.EventScheduleMissed), id, scheduleID, today, + fmt.Sprintf("missed schedule for %s on %s", id, today), alertDetail); err != nil { + d.Logger.Warn("failed to publish missed schedule event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) + } + + d.Logger.Info("detected missed schedule", + "pipelineId", id, + "schedule", scheduleID, + "date", today, + ) + } + return nil +} + +// detectMissedInclusionSchedules checks pipelines with inclusion calendar config +// for missed schedules on irregular dates. 
For each pipeline with an Include +// config, it finds all past inclusion dates (capped at maxInclusionLookback) +// and verifies that a trigger exists for each. If no trigger is found and no +// dedup marker exists, an IRREGULAR_SCHEDULE_MISSED event is published. +func detectMissedInclusionSchedules(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + + for id, cfg := range configs { + if cfg.Schedule.Include == nil || len(cfg.Schedule.Include.Dates) == 0 { + continue + } + + // Dry-run pipelines are observation-only — skip inclusion schedule detection. + if cfg.DryRun { + continue + } + + // Skip calendar-excluded days. + if isExcluded(cfg, now) { + continue + } + + pastDates := PastInclusionDates(cfg.Schedule.Include.Dates, now) + if len(pastDates) == 0 { + continue + } + + scheduleID := resolveScheduleID(cfg) + + // Resolve today in the pipeline's timezone so the grace-period + // guard fires correctly when UTC date != pipeline-local date. + tzLoc := resolveTimezone(cfg.Schedule.Timezone) + today := now.In(tzLoc).Format("2006-01-02") + + for _, date := range pastDates { + // If the inclusion date is today and the pipeline has a + // Schedule.Time, only alert after that time has passed. + // This mirrors the same check in detectMissedSchedules for + // cron pipelines to avoid false-positive alerts before the + // expected start time. Past dates are not gated because + // their Schedule.Time has necessarily already elapsed. + if cfg.Schedule.Time != "" && date == today { + localNow := now.In(tzLoc) + expectedStart, err := time.ParseInLocation("2006-01-02 15:04", date+" "+cfg.Schedule.Time, tzLoc) + if err == nil && localNow.Before(expectedStart) { + continue // not yet past expected start time + } + } + + // Check if a trigger exists for this inclusion date. 
+ found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) + if err != nil { + d.Logger.Error("failed to check trigger for inclusion schedule", + "pipelineId", id, "date", date, "error", err) + continue + } + if found { + continue + } + + // Check dedup marker to avoid re-alerting on subsequent watchdog runs. + dedupKey := "irregular-missed-check#" + date + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed for inclusion schedule", + "pipelineId", id, "date", date, "error", err) + continue + } + if dedupData != nil { + continue + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "actionHint": fmt.Sprintf("inclusion date %s expected to have a trigger — none found", date), + } + if err := publishEvent(ctx, d, string(types.EventIrregularScheduleMissed), id, scheduleID, date, + fmt.Sprintf("missed inclusion schedule for %s on %s", id, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish irregular schedule missed event", "error", err, "pipeline", id, "date", date) + } + + // Write dedup marker. + if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ + "alerted": "true", + }); err != nil { + d.Logger.Warn("failed to write inclusion dedup marker", "error", err, "pipeline", id, "date", date) + } + + d.Logger.Info("detected missed inclusion schedule", + "pipelineId", id, + "schedule", scheduleID, + "date", date, + ) + } + } + return nil +} diff --git a/internal/lambda/watchdog_postrun.go b/internal/lambda/watchdog_postrun.go new file mode 100644 index 0000000..677ac10 --- /dev/null +++ b/internal/lambda/watchdog_postrun.go @@ -0,0 +1,353 @@ +package lambda + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// defaultSensorTimeout is the default grace period for post-run sensors to +// arrive after a pipeline completes. 
If no SensorTimeout is configured in +// PostRunConfig, this value is used. +const defaultSensorTimeout = 2 * time.Hour + +// detectMissingPostRunSensors checks pipelines with PostRun config for missing +// post-run sensor data. If a pipeline completed (COMPLETED trigger + baseline +// exists) but no post-run sensor matching a rule key has been updated since +// completion, and the SensorTimeout grace period has elapsed, a +// POST_RUN_SENSOR_MISSING event is published. +func detectMissingPostRunSensors(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + today := now.Format("2006-01-02") + + for id, cfg := range configs { + if cfg.PostRun == nil || len(cfg.PostRun.Rules) == 0 { + continue + } + + // Dry-run pipelines are observation-only — skip post-run sensor checks. + if cfg.DryRun { + continue + } + + scheduleID := resolveScheduleID(cfg) + + // Only check pipelines with a COMPLETED trigger for today. + tr, err := d.Store.GetTrigger(ctx, id, scheduleID, today) + if err != nil { + d.Logger.Error("trigger lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if tr == nil || tr.Status != types.TriggerStatusCompleted { + continue + } + + // Baseline must exist — it signals that capturePostRunBaseline ran + // at completion time. + baselineKey := "postrun-baseline#" + today + baseline, err := d.Store.GetSensorData(ctx, id, baselineKey) + if err != nil { + d.Logger.Error("baseline lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if baseline == nil { + continue + } + + // Dedup: skip if we already published an alert for this date. 
+ dedupKey := "postrun-check#" + today + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + if dedupData != nil { + continue + } + + // Determine the completion timestamp from the latest success job event. + completionTime, err := resolveCompletionTime(ctx, d, id, scheduleID, today) + if err != nil { + d.Logger.Error("completion time resolution failed", + "pipelineId", id, "error", err) + continue + } + if completionTime.IsZero() { + continue + } + + // Parse SensorTimeout from config (default 2h). + timeout := parseSensorTimeout(cfg.PostRun.SensorTimeout) + + // Check if the timeout has elapsed since completion. + if now.Before(completionTime.Add(timeout)) { + continue + } + + // Check if any post-run rule sensor has been updated since completion. + sensors, err := d.Store.GetAllSensors(ctx, id) + if err != nil { + d.Logger.Error("sensor lookup failed in post-run sensor check", + "pipelineId", id, "error", err) + continue + } + + if hasPostRunSensorUpdate(cfg.PostRun.Rules, sensors, completionTime) { + continue + } + + // No post-run sensor has arrived within the grace period — publish event. 
+ ruleKeys := make([]string, 0, len(cfg.PostRun.Rules)) + for _, r := range cfg.PostRun.Rules { + ruleKeys = append(ruleKeys, r.Key) + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "sensorTimeout": cfg.PostRun.SensorTimeout, + "ruleKeys": strings.Join(ruleKeys, ", "), + "actionHint": "post-run sensor data has not arrived within the expected timeout", + } + if err := publishEvent(ctx, d, string(types.EventPostRunSensorMissing), id, scheduleID, today, + fmt.Sprintf("post-run sensor missing for %s on %s", id, today), alertDetail); err != nil { + d.Logger.Warn("failed to publish post-run sensor missing event", "error", err, "pipeline", id, "schedule", scheduleID, "date", today) + } + + // Write dedup marker to avoid re-alerting on subsequent watchdog runs. + if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ + "alerted": "true", + }); err != nil { + d.Logger.Warn("failed to write post-run dedup marker", "error", err, "pipeline", id, "date", today) + } + + d.Logger.Info("detected missing post-run sensor", + "pipelineId", id, + "schedule", scheduleID, + "date", today, + ) + } + return nil +} + +// resolveCompletionTime extracts the completion timestamp from the latest +// success job event for the given pipeline/schedule/date. The job event SK +// has the format JOB### where timestamp is +// milliseconds since epoch. 
+func resolveCompletionTime(ctx context.Context, d *Deps, pipelineID, scheduleID, date string) (time.Time, error) { + rec, err := d.Store.GetLatestJobEvent(ctx, pipelineID, scheduleID, date) + if err != nil { + return time.Time{}, fmt.Errorf("get latest job event: %w", err) + } + if rec == nil { + return time.Time{}, nil + } + if rec.Event != types.JobEventSuccess { + return time.Time{}, nil + } + + // Extract timestamp from SK: JOB### + parts := strings.Split(rec.SK, "#") + if len(parts) < 4 { + return time.Time{}, fmt.Errorf("unexpected job SK format: %q", rec.SK) + } + tsMillis, err := strconv.ParseInt(parts[len(parts)-1], 10, 64) + if err != nil { + return time.Time{}, fmt.Errorf("parse job timestamp %q: %w", parts[len(parts)-1], err) + } + return time.UnixMilli(tsMillis), nil +} + +// parseSensorTimeout parses a duration string from PostRunConfig.SensorTimeout. +// Returns defaultSensorTimeout (2h) if the string is empty or unparseable. +func parseSensorTimeout(s string) time.Duration { + if s == "" { + return defaultSensorTimeout + } + d, err := time.ParseDuration(s) + if err != nil { + return defaultSensorTimeout + } + return d +} + +// hasPostRunSensorUpdate checks whether any sensor matching a PostRun rule key +// has an updatedAt timestamp newer than the given completion time. +func hasPostRunSensorUpdate(rules []types.ValidationRule, sensors map[string]map[string]interface{}, completionTime time.Time) bool { + completionMillis := completionTime.UnixMilli() + + for _, rule := range rules { + data, ok := sensors[rule.Key] + if !ok { + continue + } + + updatedAt, ok := data["updatedAt"] + if !ok { + continue + } + + var ts int64 + switch v := updatedAt.(type) { + case float64: + ts = int64(v) + case int64: + ts = v + case string: + ts, _ = strconv.ParseInt(v, 10, 64) + default: + continue + } + + if ts > completionMillis { + return true + } + } + return false +} + +// detectRelativeSLABreaches checks pipelines with MaxDuration SLA config for +// breaches. 
This is a defense-in-depth fallback: if the EventBridge Scheduler +// fails to fire the relative SLA breach alert, the watchdog catches it. +// +// Both today and yesterday are checked because stream_router writes the +// first-sensor-arrival key using ResolveExecutionDate(), which for T+1 +// sensor-triggered pipelines produces yesterday's date. Checking both dates +// covers the cross-day boundary. +func detectRelativeSLABreaches(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + datesToCheck := []string{ + now.Format("2006-01-02"), + now.AddDate(0, 0, -1).Format("2006-01-02"), + } + + for id, cfg := range configs { + if cfg.SLA == nil || cfg.SLA.MaxDuration == "" { + continue + } + + // Dry-run pipelines are observation-only — skip relative SLA checks. + if cfg.DryRun { + continue + } + + maxDur, err := time.ParseDuration(cfg.SLA.MaxDuration) + if err != nil { + d.Logger.Warn("invalid maxDuration in SLA config", + "pipelineId", id, "maxDuration", cfg.SLA.MaxDuration, "error", err) + continue + } + + scheduleID := resolveScheduleID(cfg) + + for _, checkDate := range datesToCheck { + checkRelativeSLAForDate(ctx, d, id, cfg, scheduleID, checkDate, maxDur, now) + } + } + return nil +} + +// checkRelativeSLAForDate checks a single date for a relative SLA breach on +// the given pipeline. It looks up the first-sensor-arrival marker, verifies +// the breach window has elapsed, and publishes an alert if needed. 
+func checkRelativeSLAForDate(ctx context.Context, d *Deps, id string, cfg *types.PipelineConfig, scheduleID, checkDate string, maxDur time.Duration, now time.Time) { + arrivalKey := "first-sensor-arrival#" + checkDate + arrivalData, err := d.Store.GetSensorData(ctx, id, arrivalKey) + if err != nil { + d.Logger.Error("first-sensor-arrival lookup failed", + "pipelineId", id, "date", checkDate, "error", err) + return + } + if arrivalData == nil { + return + } + + arrivedAtStr, ok := arrivalData["arrivedAt"].(string) + if !ok || arrivedAtStr == "" { + return + } + arrivedAt, err := time.Parse(time.RFC3339, arrivedAtStr) + if err != nil { + d.Logger.Warn("invalid arrivedAt in first-sensor-arrival", + "pipelineId", id, "arrivedAt", arrivedAtStr, "error", err) + return + } + + // Check if the relative SLA has been breached. + breachAt := arrivedAt.Add(maxDur) + if now.Before(breachAt) { + return + } + + // Skip if pipeline already completed or permanently failed. + tr, err := d.Store.GetTrigger(ctx, id, scheduleID, checkDate) + if err != nil { + d.Logger.Warn("trigger lookup failed in relative SLA check", + "pipelineId", id, "date", checkDate, "error", err) + return + } + if tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal) { + return + } + if isJobTerminal(ctx, d, id, scheduleID, checkDate) { + return + } + + // Check dedup marker to avoid re-alerting on subsequent watchdog runs. + // The dedup key includes checkDate to avoid cross-date collisions. 
+ dedupKey := "relative-sla-breach-check#" + checkDate + dedupData, err := d.Store.GetSensorData(ctx, id, dedupKey) + if err != nil { + d.Logger.Error("dedup marker lookup failed for relative SLA breach", + "pipelineId", id, "date", checkDate, "error", err) + return + } + if dedupData != nil { + return + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "maxDuration": cfg.SLA.MaxDuration, + "sensorArrivalAt": arrivedAtStr, + "breachAt": breachAt.UTC().Format(time.RFC3339), + "actionHint": "relative SLA breached — pipeline has exceeded maxDuration since first sensor arrival", + } + if err := publishEvent(ctx, d, string(types.EventRelativeSLABreach), id, scheduleID, checkDate, + fmt.Sprintf("relative SLA breach for %s on %s", id, checkDate), alertDetail); err != nil { + d.Logger.Warn("failed to publish relative SLA breach event", + "error", err, "pipeline", id, "date", checkDate) + } + + // Write dedup marker. + if err := d.Store.WriteSensor(ctx, id, dedupKey, map[string]interface{}{ + "alerted": "true", + }); err != nil { + d.Logger.Warn("failed to write relative SLA breach dedup marker", + "error", err, "pipeline", id, "date", checkDate) + } + + d.Logger.Info("detected relative SLA breach", + "pipelineId", id, + "schedule", scheduleID, + "date", checkDate, + "sensorArrivalAt", arrivedAtStr, + "breachAt", breachAt.UTC().Format(time.RFC3339), + ) +} diff --git a/internal/lambda/watchdog_sla.go b/internal/lambda/watchdog_sla.go new file mode 100644 index 0000000..7eab64e --- /dev/null +++ b/internal/lambda/watchdog_sla.go @@ -0,0 +1,281 @@ +package lambda + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + "github.com/dwsmith1983/interlock/pkg/types" +) + +// scheduleSLAAlerts proactively creates EventBridge Scheduler entries for all +// pipelines with SLA configs. This ensures warnings/breaches fire even when +// pipelines never trigger (data never arrives, sensor fails, etc.). 
+// Idempotency: deterministic scheduler names; ConflictException = already exists. +func scheduleSLAAlerts(ctx context.Context, d *Deps) error { + if d.Scheduler == nil { + return nil + } + + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + + for id, cfg := range configs { + if cfg.SLA == nil { + continue + } + + // Dry-run pipelines are observation-only — skip SLA scheduling. + if cfg.DryRun { + continue + } + + if isExcluded(cfg, now) { + continue + } + + scheduleID := resolveScheduleID(cfg) + date := resolveWatchdogSLADate(cfg, now) + + // Sensor-triggered daily pipelines run T+1: data for today completes + // tomorrow, so the SLA deadline is relative to tomorrow's date. + // Only slaDate is shifted; the original date is kept for schedule + // naming, trigger lookup, and fire-alert payload so cancellation + // stays consistent with the SFN's view of the pipeline. + slaDate := date + if cfg.Schedule.Cron == "" && !strings.HasPrefix(cfg.SLA.Deadline, ":") { + t, err := time.Parse("2006-01-02", date) + if err == nil { + slaDate = t.AddDate(0, 0, 1).Format("2006-01-02") + } + } + + // Skip if pipeline already completed or permanently failed for this date. 
+ tr, err := d.Store.GetTrigger(ctx, id, scheduleID, date) + switch { + case err != nil: + d.Logger.Warn("trigger lookup failed in SLA scheduling", "pipelineId", id, "error", err) + continue + case tr != nil && (tr.Status == types.TriggerStatusCompleted || tr.Status == types.TriggerStatusFailedFinal): + continue + case isJobTerminal(ctx, d, id, scheduleID, date): + continue + } + + calc, err := handleSLACalculate(SLAMonitorInput{ + Mode: "calculate", + PipelineID: id, + ScheduleID: scheduleID, + Date: slaDate, + Deadline: cfg.SLA.Deadline, + ExpectedDuration: cfg.SLA.ExpectedDuration, + Timezone: cfg.SLA.Timezone, + }, now) + if err != nil { + d.Logger.Error("SLA calculate failed", "pipelineId", id, "error", err) + continue + } + + breachAt, _ := time.Parse(time.RFC3339, calc.BreachAt) + if breachAt.IsZero() || breachAt.After(now) { + // SLA breach is in the future — create schedules. + scheduleErr := false + if err := createSLASchedules(ctx, d, id, scheduleID, date, calc, true); err != nil { + d.Logger.Error("create SLA schedule failed", + "pipelineId", id, "error", err) + scheduleErr = true + } + + if !scheduleErr { + d.Logger.Info("proactive SLA schedules ensured", + "pipelineId", id, + "date", date, + "warningAt", calc.WarningAt, + "breachAt", calc.BreachAt, + ) + } + } + } + return nil +} + +// checkTriggerDeadlines evaluates trigger deadlines independently of SLA +// configuration. Pipelines with a Trigger.Deadline but no SLA config are +// checked here. For each pipeline, if the trigger deadline has passed and +// no trigger exists, the sensor trigger window is closed. 
+func checkTriggerDeadlines(ctx context.Context, d *Deps) error { + configs, err := d.ConfigCache.GetAll(ctx) + if err != nil { + return fmt.Errorf("load configs: %w", err) + } + + now := d.now() + + for id, cfg := range configs { + if cfg.Schedule.Trigger == nil || cfg.Schedule.Trigger.Deadline == "" { + continue + } + + // Dry-run pipelines are observation-only — skip trigger deadline checks. + if cfg.DryRun { + continue + } + + if isExcluded(cfg, now) { + continue + } + + scheduleID := resolveScheduleID(cfg) + triggerDate := resolveTriggerDeadlineDate(cfg, now) + + triggerRec, err := d.Store.GetTrigger(ctx, id, scheduleID, triggerDate) + if err != nil { + d.Logger.Warn("trigger lookup failed in deadline check", "pipelineId", id, "error", err) + continue + } + if triggerRec != nil { + continue + } + + if isJobTerminal(ctx, d, id, scheduleID, triggerDate) { + continue + } + + closeSensorTriggerWindow(ctx, d, id, scheduleID, triggerDate, cfg, now) + } + return nil +} + +// resolveWatchdogSLADate determines the execution date for SLA scheduling. +// - Hourly pipelines (relative deadline like ":30"): previous hour composite +// date, e.g. "2026-03-05T13" when the clock is 14:xx. +// - Daily pipelines (absolute deadline like "02:00"): today's date, +// so handleSLACalculate rolls the deadline forward to the next occurrence. +func resolveWatchdogSLADate(cfg *types.PipelineConfig, now time.Time) string { + if strings.HasPrefix(cfg.SLA.Deadline, ":") { + prev := now.Add(-time.Hour) + return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) + } + return now.Format("2006-01-02") +} + +// resolveTriggerDeadlineDate determines the execution date for trigger +// deadline evaluation. Uses the trigger deadline format (not SLA deadline) +// to decide between hourly composite date and daily date. 
+func resolveTriggerDeadlineDate(cfg *types.PipelineConfig, now time.Time) string { + if strings.HasPrefix(cfg.Schedule.Trigger.Deadline, ":") { + prev := now.Add(-time.Hour) + return prev.Format("2006-01-02") + "T" + fmt.Sprintf("%02d", prev.Hour()) + } + return now.Format("2006-01-02") +} + +// resolveTriggerDeadlineTime computes the absolute time when the trigger +// window closes for the given deadline string and execution date. +// +// For relative (hourly) deadlines like ":45" with composite date "2026-03-09T13": +// - Data for hour 13 is processed in hour 14 +// - The deadline resolves to 2026-03-09T14:45:00 in the configured timezone +// +// For absolute (daily) deadlines like "09:00" with date "2026-03-09": +// - The deadline resolves to 2026-03-09T09:00:00 in the configured timezone +// +// Unlike handleSLACalculate, this does NOT roll forward when the time is past. +// Returns zero time on parse errors. +func resolveTriggerDeadlineTime(deadline, date, timezone string) time.Time { + loc := resolveTimezone(timezone) + + if strings.HasPrefix(deadline, ":") { + // Relative (hourly): ":MM" — deadline is in the NEXT hour after the + // composite date's hour, since data for hour H is processed in hour H+1. + minute, err := strconv.Atoi(strings.TrimPrefix(deadline, ":")) + if err != nil { + return time.Time{} + } + // Parse composite date "YYYY-MM-DDThh". + if len(date) < 13 || date[10] != 'T' { + return time.Time{} + } + t, err := time.ParseInLocation("2006-01-02T15", date, loc) + if err != nil { + return time.Time{} + } + // Add 1 hour for the processing window, then set the minute. + return time.Date(t.Year(), t.Month(), t.Day(), t.Hour()+1, minute, 0, 0, loc) + } + + // Absolute (daily): "HH:MM". 
+ parts := strings.SplitN(deadline, ":", 2) + if len(parts) != 2 { + return time.Time{} + } + hour, err := strconv.Atoi(parts[0]) + if err != nil { + return time.Time{} + } + minute, err := strconv.Atoi(parts[1]) + if err != nil { + return time.Time{} + } + t, err := time.ParseInLocation("2006-01-02", date, loc) + if err != nil { + return time.Time{} + } + return time.Date(t.Year(), t.Month(), t.Day(), hour, minute, 0, 0, loc) +} + +// closeSensorTriggerWindow checks whether the trigger deadline has passed for +// a sensor-triggered pipeline that never started. If expired, it writes a +// FAILED_FINAL trigger record (blocking future auto-triggers) and publishes +// a SENSOR_DEADLINE_EXPIRED event. A human can still restart via RERUN_REQUEST. +func closeSensorTriggerWindow(ctx context.Context, d *Deps, pipelineID, scheduleID, date string, cfg *types.PipelineConfig, now time.Time) { + // Compute the absolute trigger deadline time directly — we do NOT use + // handleSLACalculate here because it rolls daily deadlines forward 24h + // when past, which defeats the purpose of checking for expiry. + tz := cfg.Schedule.Timezone + if tz == "" && cfg.SLA != nil { + tz = cfg.SLA.Timezone + } + triggerDeadline := resolveTriggerDeadlineTime(cfg.Schedule.Trigger.Deadline, date, tz) + if triggerDeadline.IsZero() || triggerDeadline.After(now) { + return + } + + // Use conditional put to avoid overwriting a trigger that was acquired + // between the GetTrigger read and this write (TOCTOU protection). + created, err := d.Store.CreateTriggerIfAbsent(ctx, pipelineID, scheduleID, date, types.TriggerStatusFailedFinal) + if err != nil { + d.Logger.Error("failed to write FAILED_FINAL for expired trigger deadline", + "pipelineId", pipelineID, "schedule", scheduleID, "date", date, "error", err) + return + } + if !created { + // Trigger row appeared since the read — pipeline started, don't interfere. 
+ d.Logger.Info("trigger appeared during deadline check, skipping window close", + "pipelineId", pipelineID, "schedule", scheduleID, "date", date) + return + } + + alertDetail := map[string]interface{}{ + "source": "watchdog", + "triggerDeadline": cfg.Schedule.Trigger.Deadline, + "actionHint": "auto-trigger window closed — use RERUN_REQUEST to restart", + } + if err := publishEvent(ctx, d, string(types.EventSensorDeadlineExpired), pipelineID, scheduleID, date, + fmt.Sprintf("trigger deadline expired for %s/%s/%s", pipelineID, scheduleID, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish sensor deadline expired event", "error", err, "pipeline", pipelineID) + } + + d.Logger.Info("sensor trigger window closed", + "pipelineId", pipelineID, + "schedule", scheduleID, + "date", date, + "triggerDeadline", cfg.Schedule.Trigger.Deadline, + ) +} diff --git a/internal/lambda/watchdog_stale.go b/internal/lambda/watchdog_stale.go new file mode 100644 index 0000000..cebfb57 --- /dev/null +++ b/internal/lambda/watchdog_stale.go @@ -0,0 +1,209 @@ +package lambda + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/dwsmith1983/interlock/internal/validation" + "github.com/dwsmith1983/interlock/pkg/types" +) + +// detectStaleTriggers scans for TRIGGER# rows with status=RUNNING and +// publishes an SFN_TIMEOUT event for any that have exceeded their TTL or the +// staleTriggerThreshold. Stale triggers are moved to FAILED_FINAL status. 
+func detectStaleTriggers(ctx context.Context, d *Deps) error {
+	triggers, err := d.Store.ScanRunningTriggers(ctx)
+	if err != nil {
+		return fmt.Errorf("scan running triggers: %w", err)
+	}
+
+	now := d.now()
+	for _, tr := range triggers {
+		if !isStaleTrigger(tr, now) {
+			continue
+		}
+
+		pipelineID, schedule, date, err := parseTriggerRecord(tr)
+		if err != nil {
+			d.Logger.Warn("skipping unparseable trigger", "pk", tr.PK, "sk", tr.SK, "error", err)
+			continue
+		}
+
+		// Dry-run pipelines should never have TRIGGER# rows, but guard
+		// against stale rows from pre-dry-run migrations or bugs.
+		if cfg, cfgErr := d.ConfigCache.Get(ctx, pipelineID); cfgErr == nil && cfg != nil && cfg.DryRun {
+			continue
+		}
+
+		alertDetail := map[string]interface{}{
+			"source":     "watchdog",
+			"actionHint": "step function exceeded TTL — check SFN execution history",
+		}
+		if tr.TTL > 0 {
+			alertDetail["ttlExpired"] = time.Unix(tr.TTL, 0).UTC().Format(time.RFC3339)
+		}
+		if err := publishEvent(ctx, d, string(types.EventSFNTimeout), pipelineID, schedule, date,
+			fmt.Sprintf("step function timed out for %s/%s/%s", pipelineID, schedule, date), alertDetail); err != nil {
+			d.Logger.Warn("failed to publish SFN timeout event", "error", err, "pipeline", pipelineID, "schedule", schedule, "date", date)
+		}
+
+		if err := d.Store.SetTriggerStatus(ctx, pipelineID, schedule, date, types.TriggerStatusFailedFinal); err != nil {
+			d.Logger.Error("failed to set trigger status to FAILED_FINAL",
+				"pipelineId", pipelineID, "schedule", schedule, "date", date, "error", err)
+			continue
+		}
+
+		d.Logger.Info("detected stale trigger",
+			"pipelineId", pipelineID,
+			"schedule", schedule,
+			"date", date,
+		)
+	}
+	return nil
+}
+
+// isStaleTrigger reports whether the trigger's TTL has expired. Triggers with
+// no TTL set are never considered stale: without a creation timestamp there is
+// nothing to measure runtime against.
+func isStaleTrigger(tr types.ControlRecord, now time.Time) bool {
+	if tr.TTL > 0 {
+		return now.Unix() > tr.TTL
+	}
+	// No TTL set — without a creation timestamp we cannot tell how long the
+	// trigger has been running, so we conservatively never treat it as stale.
+	// Only triggers whose TTL has explicitly expired are flagged.
+	return false
+}
+
+// parseTriggerRecord extracts pipeline ID, schedule, and date from a trigger
+// ControlRecord's PK and SK.
+// PK format: PIPELINE#<pipelineID>
+// SK format: TRIGGER#<schedule>#<date>
+func parseTriggerRecord(tr types.ControlRecord) (pipelineID, schedule, date string, err error) {
+	const pkPrefix = "PIPELINE#"
+	if !strings.HasPrefix(tr.PK, pkPrefix) {
+		return "", "", "", fmt.Errorf("unexpected PK format: %q", tr.PK)
+	}
+	pipelineID = tr.PK[len(pkPrefix):]
+
+	const skPrefix = "TRIGGER#"
+	trimmed := strings.TrimPrefix(tr.SK, skPrefix)
+	if trimmed == tr.SK {
+		return "", "", "", fmt.Errorf("unexpected SK format: %q", tr.SK)
+	}
+	parts := strings.SplitN(trimmed, "#", 2)
+	if len(parts) != 2 {
+		return "", "", "", fmt.Errorf("invalid TRIGGER SK format: %q", tr.SK)
+	}
+	return pipelineID, parts[0], parts[1], nil
+}
+
+// reconcileSensorTriggers re-evaluates trigger conditions for sensor-triggered
+// pipelines. If a sensor meets the trigger condition but no trigger lock exists,
+// the watchdog acquires the lock, starts the SFN, and publishes TRIGGER_RECOVERED.
+// This self-heals missed triggers caused by silent completion-write failures.
+func reconcileSensorTriggers(ctx context.Context, d *Deps) error {
+	configs, err := d.ConfigCache.GetAll(ctx)
+	if err != nil {
+		return fmt.Errorf("load configs: %w", err)
+	}
+
+	now := d.now()
+
+	for id, cfg := range configs {
+		trigger := cfg.Schedule.Trigger
+		if trigger == nil || cfg.Schedule.Cron != "" {
+			continue
+		}
+
+		// Dry-run pipelines are observation-only — skip reconciliation.
+ if cfg.DryRun { + continue + } + + if isExcluded(cfg, now) { + continue + } + + sensors, err := d.Store.GetAllSensors(ctx, id) + if err != nil { + d.Logger.Error("failed to get sensors for reconciliation", + "pipelineId", id, "error", err) + continue + } + + scheduleID := resolveScheduleID(cfg) + + for sensorKey, sensorData := range sensors { + if !strings.HasPrefix(sensorKey, trigger.Key) { + continue + } + + rule := types.ValidationRule{ + Key: trigger.Key, + Check: trigger.Check, + Field: trigger.Field, + Value: trigger.Value, + } + result := validation.EvaluateRule(rule, sensorData, now) + if !result.Passed { + continue + } + + date := ResolveExecutionDate(sensorData, now) + + found, err := d.Store.HasTriggerForDate(ctx, id, scheduleID, date) + if err != nil { + d.Logger.Error("trigger check failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + if found { + continue + } + + // Guard against re-triggering completed pipelines whose trigger + // record was deleted by DynamoDB TTL. Check the joblog for a + // terminal event before acquiring a new lock. 
+ if isJobTerminal(ctx, d, id, scheduleID, date) { + continue + } + + acquired, err := d.Store.AcquireTriggerLock(ctx, id, scheduleID, date, ResolveTriggerLockTTL()) + if err != nil { + d.Logger.Error("lock acquisition failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + if !acquired { + continue + } + + if err := startSFN(ctx, d, cfg, id, scheduleID, date); err != nil { + if relErr := d.Store.ReleaseTriggerLock(ctx, id, scheduleID, date); relErr != nil { + d.Logger.Warn("failed to release lock after SFN start failure during reconciliation", "error", relErr) + } + d.Logger.Error("SFN start failed during reconciliation", + "pipelineId", id, "date", date, "error", err) + continue + } + + alertDetail := map[string]interface{}{ + "source": "reconciliation", + "actionHint": "watchdog recovered missed sensor trigger", + } + if err := publishEvent(ctx, d, string(types.EventTriggerRecovered), id, scheduleID, date, + fmt.Sprintf("trigger recovered for %s/%s/%s", id, scheduleID, date), alertDetail); err != nil { + d.Logger.Warn("failed to publish trigger recovered event", "error", err, "pipeline", id, "schedule", scheduleID, "date", date) + } + + d.Logger.Info("recovered missed trigger", + "pipelineId", id, + "schedule", scheduleID, + "date", date, + ) + } + } + return nil +} diff --git a/internal/lambda/watchdog_test.go b/internal/lambda/watchdog_test.go index baad19c..bc77b82 100644 --- a/internal/lambda/watchdog_test.go +++ b/internal/lambda/watchdog_test.go @@ -1875,9 +1875,9 @@ func TestWatchdog_PostRunSensorMissing(t *testing.T) { // Seed COMPLETED trigger for today. seedTriggerWithStatus(mock, "gold-revenue", today, types.TriggerStatusCompleted) - // Seed baseline (written at completion time). + // Seed baseline (written at completion time, namespaced by rule key). 
seedSensor(mock, "gold-revenue", "postrun-baseline#"+today, map[string]interface{}{ - "sensor_count": float64(100), + "quality-check": map[string]interface{}{"sensor_count": float64(100)}, }) // Seed a success job event with timestamp 3h before now (well past the 1h timeout). @@ -1925,9 +1925,9 @@ func TestWatchdog_PostRunSensorPresent(t *testing.T) { // Seed COMPLETED trigger for today. seedTriggerWithStatus(mock, "gold-revenue", today, types.TriggerStatusCompleted) - // Seed baseline. + // Seed baseline (namespaced by rule key). seedSensor(mock, "gold-revenue", "postrun-baseline#"+today, map[string]interface{}{ - "sensor_count": float64(100), + "quality-check": map[string]interface{}{"sensor_count": float64(100)}, }) // Seed a success job event 3h before now. @@ -3189,3 +3189,165 @@ func TestWatchdog_DryRun_SkipsAllSchedulingAndAlerts(t *testing.T) { "dry-run pipeline must not produce %s events", prohibited) } } + +// TestResolveTriggerDeadlineTime_UsesScheduleTimezone verifies that the trigger +// deadline is resolved in the schedule's timezone, not the SLA timezone. +// BUG-6: closeSensorTriggerWindow previously used cfg.SLA.Timezone exclusively, +// ignoring cfg.Schedule.Timezone. The fix prefers Schedule.Timezone with SLA as +// fallback. 
+func TestResolveTriggerDeadlineTime_UsesScheduleTimezone(t *testing.T) { + tests := []struct { + name string + deadline string + date string + timezone string + wantHour int + wantMin int + wantTZ string + }{ + { + name: "daily deadline in US/Eastern", + deadline: "09:00", + date: "2026-03-09", + timezone: "US/Eastern", + wantHour: 9, + wantMin: 0, + wantTZ: "EDT", + }, + { + name: "daily deadline in Europe/Berlin", + deadline: "09:00", + date: "2026-03-09", + timezone: "Europe/Berlin", + wantHour: 9, + wantMin: 0, + wantTZ: "CET", + }, + { + name: "hourly deadline in Asia/Tokyo", + deadline: ":45", + date: "2026-03-09T13", + timezone: "Asia/Tokyo", + wantHour: 14, // hour+1 for processing window + wantMin: 45, + wantTZ: "JST", + }, + { + name: "empty timezone falls back to UTC", + deadline: "09:00", + date: "2026-03-09", + timezone: "", + wantHour: 9, + wantMin: 0, + wantTZ: "UTC", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := lambda.ResolveTriggerDeadlineTime(tt.deadline, tt.date, tt.timezone) + require.False(t, got.IsZero(), "expected non-zero time") + assert.Equal(t, tt.wantHour, got.Hour(), "hour mismatch") + assert.Equal(t, tt.wantMin, got.Minute(), "minute mismatch") + zoneName, _ := got.Zone() + assert.Equal(t, tt.wantTZ, zoneName, "timezone mismatch") + }) + } +} + +// TestCloseSensorTriggerWindow_PrefersScheduleTimezone is an integration-level +// test verifying that closeSensorTriggerWindow resolves the trigger deadline in +// the schedule timezone rather than the SLA timezone when both are set. +// +// BUG-6: With Schedule.Timezone="US/Eastern" (UTC-4 in March) and +// SLA.Timezone="Asia/Tokyo" (UTC+9), a 09:00 trigger deadline should resolve +// to 09:00 US/Eastern (13:00 UTC), NOT 09:00 Asia/Tokyo (00:00 UTC). 
+func TestCloseSensorTriggerWindow_PrefersScheduleTimezone(t *testing.T) { + mock := newMockDDB() + d, _, ebMock := testDeps(mock) + schedMock := &mockScheduler{} + d.Scheduler = schedMock + d.SLAMonitorARN = "arn:aws:lambda:us-east-1:123:function:sla-monitor" + d.SchedulerRoleARN = "arn:aws:iam::123:role/scheduler-role" + d.SchedulerGroupName = "interlock-sla" + + // Fix time at 13:30 UTC on 2026-03-09. In US/Eastern (EDT, UTC-4), + // this is 09:30 — past the 09:00 trigger deadline. + // In Asia/Tokyo (JST, UTC+9), 09:00 JST = 00:00 UTC on 2026-03-09, + // so 13:30 UTC is also past 09:00 JST. + // + // The critical test: at 12:30 UTC (08:30 Eastern), the deadline should + // NOT have expired in the schedule timezone (US/Eastern), even though it + // would have expired if resolved in Asia/Tokyo. + beforeDeadlineUTC := time.Date(2026, 3, 9, 12, 30, 0, 0, time.UTC) + d.NowFunc = func() time.Time { return beforeDeadlineUTC } + d.StartedAt = beforeDeadlineUTC.Add(-5 * time.Minute) + + cfg := types.PipelineConfig{ + Pipeline: types.PipelineIdentity{ID: "tz-bug6-pipeline"}, + Schedule: types.ScheduleConfig{ + Timezone: "US/Eastern", // EDT = UTC-4 in March + Trigger: &types.TriggerCondition{ + Key: "sensor-data", + Check: "equals", + Field: "ready", + Value: true, + Deadline: "09:00", // 09:00 Eastern = 13:00 UTC + }, + Evaluation: types.EvaluationWindow{Window: "1h", Interval: "5m"}, + }, + SLA: &types.SLAConfig{ + Deadline: "10:00", + Timezone: "Asia/Tokyo", // JST = UTC+9; 09:00 JST = 00:00 UTC + }, + Validation: types.ValidationConfig{Trigger: "ALL"}, + Job: types.JobConfig{Type: "command", Config: map[string]interface{}{"command": "echo hello"}}, + } + seedConfig(mock, cfg) + + err := lambda.HandleWatchdog(context.Background(), d) + require.NoError(t, err) + + // At 12:30 UTC = 08:30 Eastern, the 09:00 Eastern deadline has NOT + // expired. No SENSOR_DEADLINE_EXPIRED event should be published. 
+ // (Under the old buggy code that used SLA.Timezone=Asia/Tokyo, + // 09:00 JST = 00:00 UTC, so it would have considered the deadline + // expired and published the event.) + ebMock.mu.Lock() + for _, ev := range ebMock.events { + assert.NotEqual(t, string(types.EventSensorDeadlineExpired), *ev.Entries[0].DetailType, + "deadline should NOT be expired at 08:30 Eastern (12:30 UTC)") + } + ebMock.mu.Unlock() + + // Now advance to 13:30 UTC = 09:30 Eastern — past the 09:00 Eastern deadline. + afterDeadlineUTC := time.Date(2026, 3, 9, 13, 30, 0, 0, time.UTC) + d.NowFunc = func() time.Time { return afterDeadlineUTC } + d.StartedAt = afterDeadlineUTC.Add(-5 * time.Minute) + + // Reset mock state for fresh run. + mock2 := newMockDDB() + d2, _, ebMock2 := testDeps(mock2) + d2.Scheduler = &mockScheduler{} + d2.SLAMonitorARN = d.SLAMonitorARN + d2.SchedulerRoleARN = d.SchedulerRoleARN + d2.SchedulerGroupName = d.SchedulerGroupName + d2.NowFunc = func() time.Time { return afterDeadlineUTC } + d2.StartedAt = afterDeadlineUTC.Add(-5 * time.Minute) + seedConfig(mock2, cfg) + + err = lambda.HandleWatchdog(context.Background(), d2) + require.NoError(t, err) + + // At 13:30 UTC = 09:30 Eastern, the 09:00 Eastern deadline IS expired. + // SENSOR_DEADLINE_EXPIRED should be published. 
+ ebMock2.mu.Lock() + defer ebMock2.mu.Unlock() + var found bool + for _, ev := range ebMock2.events { + if *ev.Entries[0].DetailType == string(types.EventSensorDeadlineExpired) { + found = true + break + } + } + assert.True(t, found, "expected SENSOR_DEADLINE_EXPIRED at 09:30 Eastern (13:30 UTC)") +} diff --git a/internal/trigger/airflow.go b/internal/trigger/airflow.go index f78708a..589233e 100644 --- a/internal/trigger/airflow.go +++ b/internal/trigger/airflow.go @@ -9,7 +9,6 @@ import ( "net/http" "os" "strings" - "time" "github.com/dwsmith1983/interlock/pkg/types" ) @@ -49,15 +48,7 @@ func ExecuteAirflow(ctx context.Context, cfg *types.AirflowTriggerConfig) (map[s req.Header.Set(k, os.Expand(v, safeEnvLookup)) } - client := defaultHTTPClient - if cfg.Timeout > 0 { - timeout := time.Duration(cfg.Timeout) * time.Second - if timeout != defaultTriggerTimeout { - client = &http.Client{Timeout: timeout} - } - } - - resp, err := client.Do(req) + resp, err := resolveHTTPClient(cfg.Timeout).Do(req) if err != nil { return nil, fmt.Errorf("airflow trigger: request failed: %w", err) } diff --git a/internal/trigger/airflow_test.go b/internal/trigger/airflow_test.go index 35b9cce..69d7651 100644 --- a/internal/trigger/airflow_test.go +++ b/internal/trigger/airflow_test.go @@ -26,6 +26,10 @@ func TestExecuteAirflow_Success(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -49,6 +53,10 @@ func TestExecuteAirflow_AuthHeader(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "test_dag", @@ -67,6 +75,10 @@ func TestExecuteAirflow_ServerError(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = 
srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -91,6 +103,10 @@ func TestCheckAirflowStatus_Success(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-123", nil) require.NoError(t, err) assert.Equal(t, "success", state) @@ -105,6 +121,10 @@ func TestCheckAirflowStatus_Running(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-123", nil) require.NoError(t, err) assert.Equal(t, "running", state) @@ -119,6 +139,10 @@ func TestCheckAirflowStatus_Failed(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-123", nil) require.NoError(t, err) assert.Equal(t, "failed", state) @@ -151,6 +175,10 @@ func TestExecuteAirflow_WithBody(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -183,6 +211,10 @@ func TestExecuteAirflow_MissingDagRunIDInResponse(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -201,6 +233,10 @@ func TestExecuteAirflow_CustomTimeout(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { 
defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "my_dag", @@ -218,6 +254,10 @@ func TestCheckAirflowStatus_ServerError(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + _, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", nil) assert.Error(t, err) assert.Contains(t, err.Error(), "status 500") @@ -232,6 +272,10 @@ func TestCheckAirflowStatus_MissingStateField(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + _, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", nil) assert.Error(t, err) assert.Contains(t, err.Error(), "response missing state field") @@ -255,6 +299,10 @@ func TestExecuteAirflow_EnvExpansionRestricted(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.AirflowTriggerConfig{ URL: srv.URL, DagID: "test_dag", @@ -287,6 +335,10 @@ func TestCheckAirflowStatus_EnvExpansionRestricted(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + headers := map[string]string{"Authorization": "Bearer ${INTERLOCK_TEST_VAR}/${SECRET_VAR}"} state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", headers) require.NoError(t, err) @@ -306,6 +358,10 @@ func TestCheckAirflowStatus_WithHeaders(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + state, err := CheckAirflowStatus(context.Background(), srv.URL, "my_dag", "run-1", map[string]string{ "Authorization": "Bearer test-token", }) diff --git 
a/internal/trigger/runner_test.go b/internal/trigger/runner_test.go index 34b12a4..97c817d 100644 --- a/internal/trigger/runner_test.go +++ b/internal/trigger/runner_test.go @@ -144,6 +144,10 @@ func TestRunner_Execute_HTTPType(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + r := NewRunner() _, err := r.Execute(context.Background(), &types.TriggerConfig{ Type: types.TriggerHTTP, @@ -170,6 +174,10 @@ func TestRunner_Execute_AirflowType(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + r := NewRunner() meta, err := r.Execute(context.Background(), &types.TriggerConfig{ Type: types.TriggerAirflow, diff --git a/internal/trigger/ssrf.go b/internal/trigger/ssrf.go new file mode 100644 index 0000000..164ccb8 --- /dev/null +++ b/internal/trigger/ssrf.go @@ -0,0 +1,49 @@ +package trigger + +import ( + "fmt" + "net" + "net/http" + "syscall" + "time" +) + +// newSSRFSafeTransport clones http.DefaultTransport (preserving HTTP/2, +// keep-alive, idle-conn settings) and replaces the dialer with one whose +// Control hook rejects connections to private/loopback/link-local IPs. 
+func newSSRFSafeTransport() *http.Transport { + base := http.DefaultTransport.(*http.Transport).Clone() + base.DialContext = (&net.Dialer{ + Timeout: base.TLSHandshakeTimeout, // match the original dialer timeout + KeepAlive: 30 * time.Second, + Control: ssrfDialControl, + }).DialContext + return base +} + +func ssrfDialControl(network, address string, _ syscall.RawConn) error { + host, _, err := net.SplitHostPort(address) + if err != nil { + return fmt.Errorf("ssrf: invalid address %q: %w", address, err) + } + ip := net.ParseIP(host) + if ip == nil { + return fmt.Errorf("ssrf: could not parse IP %q", host) + } + if isBlockedIP(ip) { + return fmt.Errorf("ssrf: connection to %s blocked (private/loopback/link-local)", ip) + } + return nil +} + +func isBlockedIP(ip net.IP) bool { + return ip.IsLoopback() || + ip.IsPrivate() || + ip.IsLinkLocalUnicast() || + ip.IsMulticast() || + ip.IsUnspecified() || + // Explicit IMDS/ECS checks — already covered by IsLinkLocalUnicast + // but kept for visibility since these are the primary SSRF targets. 
+ ip.Equal(net.ParseIP("169.254.169.254")) || + ip.Equal(net.ParseIP("169.254.170.2")) +} diff --git a/internal/trigger/ssrf_test.go b/internal/trigger/ssrf_test.go new file mode 100644 index 0000000..3d420b2 --- /dev/null +++ b/internal/trigger/ssrf_test.go @@ -0,0 +1,77 @@ +package trigger + +import ( + "net" + "testing" +) + +func TestIsBlockedIP(t *testing.T) { + blocked := []struct { + name string + ip string + }{ + {"loopback_v4", "127.0.0.1"}, + {"private_10", "10.0.0.1"}, + {"private_172", "172.16.0.1"}, + {"private_192", "192.168.1.1"}, + {"aws_imds", "169.254.169.254"}, + {"ecs_metadata", "169.254.170.2"}, + {"loopback_v6", "::1"}, + {"link_local_v6", "fe80::1"}, + {"unspecified", "0.0.0.0"}, + } + for _, tc := range blocked { + t.Run(tc.name, func(t *testing.T) { + ip := net.ParseIP(tc.ip) + if ip == nil { + t.Fatalf("failed to parse IP %s", tc.ip) + } + if !isBlockedIP(ip) { + t.Errorf("expected %s to be blocked", tc.ip) + } + }) + } + + allowed := []struct { + name string + ip string + }{ + {"google_dns", "8.8.8.8"}, + {"aws_public", "52.94.76.1"}, + {"google_v6", "2607:f8b0:4004:800::200e"}, + } + for _, tc := range allowed { + t.Run(tc.name, func(t *testing.T) { + ip := net.ParseIP(tc.ip) + if ip == nil { + t.Fatalf("failed to parse IP %s", tc.ip) + } + if isBlockedIP(ip) { + t.Errorf("expected %s to be allowed", tc.ip) + } + }) + } +} + +func TestSSRFDialControl(t *testing.T) { + t.Run("blocks_loopback", func(t *testing.T) { + err := ssrfDialControl("tcp", "127.0.0.1:80", nil) + if err == nil { + t.Error("expected error for loopback address") + } + }) + + t.Run("allows_public", func(t *testing.T) { + err := ssrfDialControl("tcp", "8.8.8.8:443", nil) + if err != nil { + t.Errorf("expected no error for public address, got: %v", err) + } + }) + + t.Run("blocks_imds", func(t *testing.T) { + err := ssrfDialControl("tcp", "169.254.169.254:80", nil) + if err == nil { + t.Error("expected error for IMDS address") + } + }) +} diff --git 
a/internal/trigger/trigger.go b/internal/trigger/trigger.go index 545a70d..cefa6fe 100644 --- a/internal/trigger/trigger.go +++ b/internal/trigger/trigger.go @@ -45,7 +45,29 @@ const maxErrorBodyBytes = 512 const defaultTriggerTimeout = 30 * time.Second // defaultHTTPClient is shared across HTTP and Airflow triggers to reuse connections. -var defaultHTTPClient = &http.Client{Timeout: defaultTriggerTimeout} +// It uses an SSRF-safe transport that rejects private, loopback, and link-local addresses. +var defaultHTTPClient = &http.Client{ + Timeout: defaultTriggerTimeout, + Transport: newSSRFSafeTransport(), +} + +// resolveHTTPClient returns a client with the given timeout in seconds. If +// timeoutSec is zero or matches the default, the shared defaultHTTPClient is +// returned to reuse connections. When a custom timeout is required, the returned +// client inherits the transport from defaultHTTPClient so that transport-level +// settings (including SSRF protection and test overrides) are preserved. +func resolveHTTPClient(timeoutSec int) *http.Client { + if timeoutSec > 0 { + timeout := time.Duration(timeoutSec) * time.Second + if timeout != defaultTriggerTimeout { + return &http.Client{ + Timeout: timeout, + Transport: defaultHTTPClient.Transport, + } + } + } + return defaultHTTPClient +} // defaultRunner provides backward-compatible package-level functions. var defaultRunner = NewRunner() @@ -60,13 +82,16 @@ func CheckStatus(ctx context.Context, triggerType types.TriggerType, metadata ma return defaultRunner.CheckStatus(ctx, triggerType, metadata, headers) } -// ExecuteCommand runs a shell command trigger. +// ExecuteCommand runs a command trigger by splitting the command string into +// arguments and executing the binary directly (no shell). This prevents shell +// metacharacter injection. 
func ExecuteCommand(ctx context.Context, command string) error { if command == "" { return fmt.Errorf("trigger command is empty") } - cmd := exec.CommandContext(ctx, "sh", "-c", command) + args := strings.Fields(command) + cmd := exec.CommandContext(ctx, args[0], args[1:]...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr return cmd.Run() @@ -94,14 +119,7 @@ func ExecuteHTTP(ctx context.Context, cfg *types.HTTPTriggerConfig) error { req.Header.Set(k, os.Expand(v, safeEnvLookup)) } - client := defaultHTTPClient - if cfg.Timeout > 0 { - timeout := time.Duration(cfg.Timeout) * time.Second - if timeout != defaultTriggerTimeout { - client = &http.Client{Timeout: timeout} - } - } - resp, err := client.Do(req) + resp, err := resolveHTTPClient(cfg.Timeout).Do(req) if err != nil { return fmt.Errorf("trigger request failed: %w", err) } diff --git a/internal/trigger/trigger_test.go b/internal/trigger/trigger_test.go index 9b1066d..84efb65 100644 --- a/internal/trigger/trigger_test.go +++ b/internal/trigger/trigger_test.go @@ -38,6 +38,10 @@ func TestExecuteHTTP_Success(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "POST", URL: srv.URL}, @@ -82,6 +86,10 @@ func TestExecuteHTTP_ErrorBodyTruncated(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -110,6 +118,10 @@ func TestExecuteHTTP_ErrorBodySanitized(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: 
&types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -154,6 +166,10 @@ func TestExecuteHTTP_EnvExpansionRestricted(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{ @@ -229,6 +245,10 @@ func TestExecuteHTTP_Returns_TriggerError_On4xx(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -249,6 +269,10 @@ func TestExecuteHTTP_Returns_TriggerError_On5xx(t *testing.T) { })) defer srv.Close() + origClient := defaultHTTPClient + defaultHTTPClient = srv.Client() + defer func() { defaultHTTPClient = origClient }() + cfg := &types.TriggerConfig{ Type: types.TriggerHTTP, HTTP: &types.HTTPTriggerConfig{Method: "GET", URL: srv.URL}, @@ -267,3 +291,18 @@ func TestExecuteCommand_EmptyCommand(t *testing.T) { assert.Error(t, err) assert.Contains(t, err.Error(), "command is empty") } + +func TestExecuteCommand_DirectExec(t *testing.T) { + err := ExecuteCommand(context.Background(), "echo hello") + require.NoError(t, err) +} + +func TestExecuteCommand_NoShellMetacharacters(t *testing.T) { + // The semicolon should be passed as a literal argument to echo, not + // interpreted as a shell command separator. With direct exec there is + // no shell to split on ";", so echo receives [";", "ls"] as arguments + // and prints them literally. If a shell were involved, "ls" would + // execute as a separate command. 
+ err := ExecuteCommand(context.Background(), "echo ; ls") + require.NoError(t, err, "echo should succeed even with ; in args") +} diff --git a/internal/validation/engine.go b/internal/validation/engine.go index 0979e1b..b693aca 100644 --- a/internal/validation/engine.go +++ b/internal/validation/engine.go @@ -4,6 +4,7 @@ package validation import ( "fmt" "strconv" + "strings" "time" "github.com/dwsmith1983/interlock/pkg/types" @@ -38,7 +39,7 @@ func EvaluateRules(mode string, rules []types.ValidationRule, sensors map[string } var passed bool - switch mode { + switch strings.ToUpper(mode) { case "ANY": passed = passCount > 0 default: // "ALL" diff --git a/internal/validation/engine_test.go b/internal/validation/engine_test.go index 2d1f7cb..1791321 100644 --- a/internal/validation/engine_test.go +++ b/internal/validation/engine_test.go @@ -448,6 +448,36 @@ func TestToFloat64_Unsupported(t *testing.T) { assert.False(t, ok) } +// --- BUG-8 characterization: lowercase "any" treated as "ALL" --- + +func TestEvaluateRules_LowercaseAny_TreatedAsAll(t *testing.T) { + // BUG-8 FIXED: lowercase "any" now works via strings.ToUpper. + rules := []types.ValidationRule{ + {Key: "s1", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + {Key: "s2", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + } + sensors := map[string]map[string]interface{}{ + "s1": {"count": float64(20)}, // passes + "s2": {"count": float64(5)}, // fails + } + result := EvaluateRules("any", rules, sensors, time.Now()) + assert.True(t, result.Passed, "BUG-8 FIXED: lowercase 'any' now works") +} + +func TestEvaluateRules_MixedCaseAny_TreatedAsAll(t *testing.T) { + // BUG-8 FIXED: mixed-case "Any" now works via strings.ToUpper. 
+ rules := []types.ValidationRule{ + {Key: "s1", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + {Key: "s2", Check: types.CheckGTE, Field: "count", Value: float64(10)}, + } + sensors := map[string]map[string]interface{}{ + "s1": {"count": float64(20)}, + "s2": {"count": float64(5)}, + } + result := EvaluateRules("Any", rules, sensors, time.Now()) + assert.True(t, result.Passed, "BUG-8 FIXED: lowercase 'any' now works") +} + // --- EvaluateRules default mode (not "ALL" or "ANY") --- func TestEvaluateRules_DefaultMode_FallsToALL(t *testing.T) { diff --git a/pkg/types/events.go b/pkg/types/events.go index dd0c2cb..f712ea9 100644 --- a/pkg/types/events.go +++ b/pkg/types/events.go @@ -42,6 +42,10 @@ const ( EventDryRunSLAProjection EventDetailType = "DRY_RUN_SLA_PROJECTION" EventDryRunDrift EventDetailType = "DRY_RUN_DRIFT" EventDryRunCompleted EventDetailType = "DRY_RUN_COMPLETED" + EventDryRunWouldRerun EventDetailType = "DRY_RUN_WOULD_RERUN" + EventDryRunRerunRejected EventDetailType = "DRY_RUN_RERUN_REJECTED" + EventDryRunWouldRetry EventDetailType = "DRY_RUN_WOULD_RETRY" + EventDryRunRetryExhausted EventDetailType = "DRY_RUN_RETRY_EXHAUSTED" ) // EventSource is the EventBridge source for all interlock events.