From 7fe38852dc884fbdc4bbdbfa084addecd0d094ed Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 30 Apr 2026 12:55:20 +0100 Subject: [PATCH 1/4] feat: db backup profiles --- AGENTS.md | 2 + README.md | 1 + infra/live/dev/aws/database/terragrunt.hcl | 2 +- infra/live/global_vars.hcl | 2 + infra/live/prod/aws/database/terragrunt.hcl | 2 +- infra/modules/aws/_shared/database/README.md | 80 +++++- infra/modules/aws/_shared/database/data.tf | 53 ++++ infra/modules/aws/_shared/database/local.tf | 228 +++++++++++++++++- infra/modules/aws/_shared/database/main.tf | 83 ++++++- infra/modules/aws/_shared/database/outputs.tf | 36 +++ .../modules/aws/_shared/database/variables.tf | 37 ++- infra/modules/aws/database/README.md | 5 +- infra/modules/aws/database/main.tf | 3 +- infra/modules/aws/database/variables.tf | 24 +- 14 files changed, 538 insertions(+), 20 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 99d1c13b..8ad4f3d3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -18,6 +18,8 @@ Update documentation in the same change: - keep `.github/docs/README.md` as the source of truth for workflow contracts and CI feasibility checks - prefer Mermaid diagrams in `.github/docs/README.md` that show jobs, `needs`, and reusable-workflow relationships rather than trying to reproduce the exact GitHub Actions UI - when adding a new AWS infra type or service family, check whether the deploy role in `infra/live/global_vars.hcl` needs additional `allowed_role_actions` and update it in the same change if required +- when changing Terraform in a way that introduces any new AWS service surface area or API family, even inside an existing module, review `infra/live/global_vars.hcl` for required `allowed_role_actions` updates in the same change; do not limit this check only to obviously new top-level stack types +- before closing any infra change that adds AWS resources, IAM principals, or orchestration services, explicitly verify whether it introduced new permissions for deploy-time creation 
or mutation and update `infra/live/global_vars.hcl` if needed - when changing the set of deployable Lambda or ECS runtimes, check whether the shared `observability` dashboard still reflects the current runtime surface and update it in the same change if needed ### Documentation Architecture diff --git a/README.md b/README.md index 3a064f1d..861c59e0 100644 --- a/README.md +++ b/README.md @@ -170,6 +170,7 @@ see [infra/README.md](infra/README.md#infra-deployment-use-cases). For Lambda provisioned concurrency patterns and example `provisioned_config` shapes, see [infra/modules/aws/_shared/lambda/README.md](infra/modules/aws/_shared/lambda/README.md). For ECS scaling patterns and `scaling_strategy` examples, see [infra/modules/aws/_shared/service/README.md](infra/modules/aws/_shared/service/README.md). +For Aurora recovery posture presets such as `dev`, `standard`, and `critical`, plus the optional restore-drill Step Functions skeleton, see [infra/modules/aws/_shared/database/README.md](infra/modules/aws/_shared/database/README.md). 
### Deployment Model diff --git a/infra/live/dev/aws/database/terragrunt.hcl b/infra/live/dev/aws/database/terragrunt.hcl index 87094184..4838a486 100644 --- a/infra/live/dev/aws/database/terragrunt.hcl +++ b/infra/live/dev/aws/database/terragrunt.hcl @@ -4,7 +4,7 @@ include "root" { inputs = { database_name = "app" - backup_retention_period = 1 + recovery_class = "dev" rds_min_capacity = 0.5 rds_max_capacity = 1.0 rds_max_reader_count = 0 diff --git a/infra/live/global_vars.hcl b/infra/live/global_vars.hcl index 56cecd78..85a13ed1 100644 --- a/infra/live/global_vars.hcl +++ b/infra/live/global_vars.hcl @@ -11,8 +11,10 @@ locals { "application-autoscaling:*", "cloudwatch:*", "events:*", + "scheduler:*", "sqs:*", "sns:*", + "states:*", "cloudfront:*", "xray:*", "ec2:*", diff --git a/infra/live/prod/aws/database/terragrunt.hcl b/infra/live/prod/aws/database/terragrunt.hcl index 4772e5da..8beca428 100644 --- a/infra/live/prod/aws/database/terragrunt.hcl +++ b/infra/live/prod/aws/database/terragrunt.hcl @@ -4,7 +4,7 @@ include "root" { inputs = { database_name = "app" - backup_retention_period = 7 + recovery_class = "standard" # bump to "critical" for workloads that need the strictest recovery posture rds_min_capacity = 0.5 rds_max_capacity = 2.0 rds_max_reader_count = 1 diff --git a/infra/modules/aws/_shared/database/README.md b/infra/modules/aws/_shared/database/README.md index ac32d7ca..c89f22ec 100644 --- a/infra/modules/aws/_shared/database/README.md +++ b/infra/modules/aws/_shared/database/README.md @@ -29,7 +29,8 @@ Shared Aurora PostgreSQL Serverless v2 module. - `publicly_accessible` - `database_port` - `engine_version` -- `backup_retention_period` +- `recovery_class` +- `restore_drill` - `rds_min_capacity` - `rds_max_capacity` - `rds_max_reader_count` @@ -45,6 +46,15 @@ Shared Aurora PostgreSQL Serverless v2 module. 
- `database_port` - `readonly_endpoint` - `readwrite_endpoint` +- `recovery_class` +- `restore_drill_cadence` +- `target_rpo_minutes` +- `target_rto_minutes` +- `restore_drill_enabled` +- `restore_drill_mode` +- `restore_drill_schedule_expression` +- `restore_drill_state_machine_arn` +- `restore_drill_state_machine_name` This module is intentionally Aurora PostgreSQL Serverless v2 specific. It does not currently support provisioned RDS instances or non-Postgres engines. In this repo the concrete `database` wrapper resolves the VPC and public or private subnet ids, while the shared infra workflow injects `database_security_group_id` from the `security` stack via `TF_VAR_database_security_group_id`. @@ -52,3 +62,71 @@ By default the module tracks the latest matching Aurora PostgreSQL 16.x engine v SSM parameter paths are rooted at `////...` so they do not collide with AWS-reserved `/aws` prefixes. The runtime contract for database credentials is the Aurora-managed master secret exposed from the cluster. Terraform reads the managed secret ARN directly from the cluster resource rather than doing a separate Secrets Manager lookup during the same apply, because AWS may not populate that managed-secret reference early enough for an immediate data read. If you need new scale-out readers to inherit cluster tags, keep that automation in a separate stack such as `rds_reader_tagger` rather than pushing event-driven behavior into this shared database module. + +## Recovery Classes + +The shared module derives backup retention, deletion protection, final-snapshot behavior, minimum reader count, and recovery metadata from a single `recovery_class` input. 
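As a sketch, switching a cluster's whole recovery posture then requires only one input change (the values below are illustrative, mirroring the live `dev`/`prod` stacks in this change):

```hcl
inputs = {
  database_name  = "app"
  recovery_class = "critical" # one of "dev", "standard", "critical"

  # Capacity knobs stay independent of the recovery class.
  rds_min_capacity     = 0.5
  rds_max_capacity     = 2.0
  rds_max_reader_count = 2
}
```

The class then drives retention, deletion protection, final-snapshot behavior, the reader floor, and the drill cadence, as described per class below.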
+ +### `dev` + +- 1 day of automated backup retention +- deletion protection disabled +- no final snapshot on destroy +- no required reader instances +- `restore_drill_cadence = "never"` + +### `standard` + +- 7 days of automated backup retention +- deletion protection enabled +- final snapshot required on destroy +- at least 1 reader instance when multiple subnet AZs are available +- `restore_drill_cadence = "monthly"` + +### `critical` + +- 35 days of automated backup retention +- deletion protection enabled +- final snapshot required on destroy +- at least 2 reader instances when enough subnet AZs are available +- `restore_drill_cadence = "weekly"` + +The module publishes `RecoveryClass`, `RestoreDrillCadence`, `TargetRPOMinutes`, and `TargetRTOMinutes` as cluster tags so operators can see the intended recovery posture directly on the Aurora cluster. + +## Restore Drill + +The shared module can also provision an opt-in restore-drill skeleton inside the same database module. When enabled, it creates: + +- a Step Functions state machine for manual restore-drill execution +- an optional EventBridge Scheduler schedule when the mode includes scheduled runs +- the IAM roles needed for the scheduler to start the state machine and for Step Functions to call RDS APIs + +Example: + +```hcl +recovery_class = "standard" + +restore_drill = { + enabled = true + mode = "manual_and_scheduled" + use_pitr = true + retain_hours = 4 +} +``` + +The schedule expression is derived from `recovery_class`: + +- `dev`: no automatic schedule +- `standard`: `rate(30 days)` +- `critical`: `rate(7 days)` + +The current Step Functions skeleton: + +1. restores a temporary Aurora cluster from PITR +2. waits for the scratch cluster to become available +3. creates one temporary writer instance +4. waits for the instance to become available +5. holds the restored environment for the configured retention window +6. 
deletes the temporary instance and cluster + +This first version does not yet run application-level validation against the restored database. It proves restore orchestration and cleanup only. Add a dedicated validation Lambda or ECS task later once the restore path itself is stable. diff --git a/infra/modules/aws/_shared/database/data.tf b/infra/modules/aws/_shared/database/data.tf index 14f9d510..c46b766a 100644 --- a/infra/modules/aws/_shared/database/data.tf +++ b/infra/modules/aws/_shared/database/data.tf @@ -8,3 +8,56 @@ data "aws_subnet" "selected" { for_each = toset(var.subnet_ids) id = each.value } + +data "aws_iam_policy_document" "restore_drill_sfn_assume" { + statement { + effect = "Allow" + + principals { + type = "Service" + identifiers = ["states.amazonaws.com"] + } + + actions = ["sts:AssumeRole"] + } +} + +data "aws_iam_policy_document" "restore_drill_sfn" { + statement { + sid = "RestoreAndDescribeRds" + effect = "Allow" + actions = [ + "rds:CreateDBInstance", + "rds:DeleteDBCluster", + "rds:DeleteDBInstance", + "rds:DescribeDBClusters", + "rds:DescribeDBInstances", + "rds:RestoreDBClusterToPointInTime", + ] + resources = ["*"] + } +} + +data "aws_iam_policy_document" "restore_drill_scheduler_assume" { + statement { + effect = "Allow" + + principals { + type = "Service" + identifiers = ["scheduler.amazonaws.com"] + } + + actions = ["sts:AssumeRole"] + } +} + +data "aws_iam_policy_document" "restore_drill_scheduler" { + statement { + sid = "StartRestoreDrillExecution" + effect = "Allow" + actions = [ + "states:StartExecution", + ] + resources = aws_sfn_state_machine.restore_drill[*].arn + } +} diff --git a/infra/modules/aws/_shared/database/local.tf b/infra/modules/aws/_shared/database/local.tf index f4bb4931..1ce9dbfe 100644 --- a/infra/modules/aws/_shared/database/local.tf +++ b/infra/modules/aws/_shared/database/local.tf @@ -16,7 +16,233 @@ locals { postgres_instance_class = "db.serverless" postgres_backup_window = "07:00-09:00" + 
recovery_profiles = { + dev = { + backup_retention_period = 1 + deletion_protection = false + skip_final_snapshot = true + final_snapshot_prefix = null + restore_drill_cadence = "never" + restore_drill_schedule = null + target_rpo_minutes = 1440 + target_rto_minutes = 240 + minimum_reader_count = 0 + } + standard = { + backup_retention_period = 7 + deletion_protection = true + skip_final_snapshot = false + final_snapshot_prefix = "final" + restore_drill_cadence = "monthly" + restore_drill_schedule = "rate(30 days)" + target_rpo_minutes = 15 + target_rto_minutes = 60 + minimum_reader_count = 1 + } + critical = { + backup_retention_period = 35 + deletion_protection = true + skip_final_snapshot = false + final_snapshot_prefix = "final" + restore_drill_cadence = "weekly" + restore_drill_schedule = "rate(7 days)" + target_rpo_minutes = 5 + target_rto_minutes = 30 + minimum_reader_count = 2 + } + } + + recovery_profile = local.recovery_profiles[var.recovery_class] + subnet_ids_ordered = tolist(var.subnet_ids) subnet_azs_all = [for id in local.subnet_ids_ordered : data.aws_subnet.selected[id].availability_zone] - subnet_azs = slice(distinct(local.subnet_azs_all), 0, var.rds_max_reader_count) + subnet_azs = distinct(local.subnet_azs_all) + reader_count = min(length(local.subnet_azs), max(var.rds_max_reader_count, local.recovery_profile.minimum_reader_count)) + + final_snapshot_identifier = local.recovery_profile.skip_final_snapshot ? 
null : format( + "%s-%s", + local.cluster_identifier, + local.recovery_profile.final_snapshot_prefix, + ) + + restore_drill = merge( + { + enabled = false + mode = "manual" + use_pitr = true + retain_hours = 4 + }, + { + schedule_expression = local.recovery_profile.restore_drill_schedule + }, + var.restore_drill, + ) + + restore_drill_state_machine_enabled = local.restore_drill.enabled + restore_drill_schedule_enabled = local.restore_drill.enabled && contains( + ["scheduled", "manual_and_scheduled"], + local.restore_drill.mode, + ) && local.restore_drill.schedule_expression != null + restore_drill_identifier_prefix = substr("${local.cluster_identifier}-drill", 0, 30) + restore_drill_instance_class = "db.serverless" + restore_drill_retention_seconds = local.restore_drill.retain_hours * 3600 + + restore_drill_state_machine_definition = jsonencode({ + Comment = "Restore-drill skeleton for ${local.cluster_identifier}" + StartAt = "PrepareContext" + States = { + PrepareContext = { + Type = "Pass" + Parameters = { + "source_cluster_identifier" = aws_rds_cluster.aurora_postgres.cluster_identifier + "db_subnet_group_name" = aws_db_subnet_group.default.name + "vpc_security_group_ids" = [var.database_security_group_id] + "publicly_accessible" = var.publicly_accessible + "scratch_suffix.$" = "States.ArrayGetItem(States.StringSplit(States.UUID(), '-'), 0)" + "use_latest_restorable_time" = local.restore_drill.use_pitr + "retention_seconds" = local.restore_drill_retention_seconds + } + Next = "BuildIdentifiers" + } + BuildIdentifiers = { + Type = "Pass" + Parameters = { + "source_cluster_identifier.$" = "$.source_cluster_identifier" + "db_subnet_group_name.$" = "$.db_subnet_group_name" + "vpc_security_group_ids.$" = "$.vpc_security_group_ids" + "publicly_accessible.$" = "$.publicly_accessible" + "scratch_suffix.$" = "$.scratch_suffix" + "use_latest_restorable_time.$" = "$.use_latest_restorable_time" + "retention_seconds.$" = "$.retention_seconds" + "restore_cluster_identifier.$" 
= format("States.Format('{}-{}', '%s', $.scratch_suffix)", local.restore_drill_identifier_prefix) + "restore_instance_identifier.$" = format("States.Format('{}-{}-writer', '%s', $.scratch_suffix)", local.restore_drill_identifier_prefix) + } + Next = "StartRestore" + } + StartRestore = { + Type = "Task" + Resource = "arn:aws:states:::aws-sdk:rds:restoreDBClusterToPointInTime" + Parameters = { + "SourceDBClusterIdentifier.$" = "$.source_cluster_identifier" + "DBClusterIdentifier.$" = "$.restore_cluster_identifier" + "RestoreType" = "copy-on-write" + "UseLatestRestorableTime.$" = "$.use_latest_restorable_time" + "Engine" = local.postgres_engine + "DBSubnetGroupName.$" = "$.db_subnet_group_name" + "VpcSecurityGroupIds.$" = "$.vpc_security_group_ids" + "DeletionProtection" = false + "Tags" = [ + { + "Key" = "RestoreDrill" + "Value" = "true" + }, + { + "Key" = "SourceCluster" + "Value" = aws_rds_cluster.aurora_postgres.cluster_identifier + }, + ] + } + Next = "WaitForCluster" + } + WaitForCluster = { + Type = "Wait" + Seconds = 60 + Next = "DescribeCluster" + } + DescribeCluster = { + Type = "Task" + Resource = "arn:aws:states:::aws-sdk:rds:describeDBClusters" + Parameters = { + "DBClusterIdentifier.$" = "$.restore_cluster_identifier" + } + ResultPath = "$.cluster_status" + Next = "ClusterReady" + } + ClusterReady = { + Type = "Choice" + Choices = [ + { + Variable = "$.cluster_status.DBClusters[0].Status" + StringEquals = "available" + Next = "CreateScratchInstance" + }, + ] + Default = "WaitForCluster" + } + CreateScratchInstance = { + Type = "Task" + Resource = "arn:aws:states:::aws-sdk:rds:createDBInstance" + Parameters = { + "DBClusterIdentifier.$" = "$.restore_cluster_identifier" + "DBInstanceIdentifier.$" = "$.restore_instance_identifier" + "DBInstanceClass" = local.restore_drill_instance_class + "Engine" = local.postgres_engine + "PubliclyAccessible.$" = "$.publicly_accessible" + } + Next = "WaitForInstance" + } + WaitForInstance = { + Type = "Wait" + Seconds = 60 
+ Next = "DescribeInstance" + } + DescribeInstance = { + Type = "Task" + Resource = "arn:aws:states:::aws-sdk:rds:describeDBInstances" + Parameters = { + "DBInstanceIdentifier.$" = "$.restore_instance_identifier" + } + ResultPath = "$.instance_status" + Next = "InstanceReady" + } + InstanceReady = { + Type = "Choice" + Choices = [ + { + Variable = "$.instance_status.DBInstances[0].DBInstanceStatus" + StringEquals = "available" + Next = "RetentionWindow" + }, + ] + Default = "WaitForInstance" + } + RetentionWindow = { + Type = "Wait" + SecondsPath = "$.retention_seconds" + Next = "DeleteScratchInstance" + } + DeleteScratchInstance = { + Type = "Task" + Resource = "arn:aws:states:::aws-sdk:rds:deleteDBInstance" + Parameters = { + "DBInstanceIdentifier.$" = "$.restore_instance_identifier" + "SkipFinalSnapshot" = true + "DeleteAutomatedBackups" = true + } + Next = "WaitBeforeClusterDelete" + } + WaitBeforeClusterDelete = { + Type = "Wait" + Seconds = 300 + Next = "DeleteScratchCluster" + } + DeleteScratchCluster = { + Type = "Task" + Resource = "arn:aws:states:::aws-sdk:rds:deleteDBCluster" + Parameters = { + "DBClusterIdentifier.$" = "$.restore_cluster_identifier" + "SkipFinalSnapshot" = true + } + Retry = [ + { + ErrorEquals = ["States.ALL"] + IntervalSeconds = 120 + MaxAttempts = 10 + BackoffRate = 1.5 + }, + ] + End = true + } + } + }) } diff --git a/infra/modules/aws/_shared/database/main.tf b/infra/modules/aws/_shared/database/main.tf index 2866f27a..12844986 100644 --- a/infra/modules/aws/_shared/database/main.tf +++ b/infra/modules/aws/_shared/database/main.tf @@ -20,13 +20,22 @@ resource "aws_rds_cluster" "aurora_postgres" { manage_master_user_password = true database_name = local.serverless_database_name - backup_retention_period = var.backup_retention_period + backup_retention_period = local.recovery_profile.backup_retention_period preferred_backup_window = local.postgres_backup_window + deletion_protection = local.recovery_profile.deletion_protection - 
skip_final_snapshot = true - vpc_security_group_ids = [var.database_security_group_id] - db_subnet_group_name = aws_db_subnet_group.default.name - storage_encrypted = true + skip_final_snapshot = local.recovery_profile.skip_final_snapshot + final_snapshot_identifier = local.final_snapshot_identifier + vpc_security_group_ids = [var.database_security_group_id] + db_subnet_group_name = aws_db_subnet_group.default.name + storage_encrypted = true + + tags = { + RecoveryClass = var.recovery_class + RestoreDrillCadence = local.recovery_profile.restore_drill_cadence + TargetRPOMinutes = tostring(local.recovery_profile.target_rpo_minutes) + TargetRTOMinutes = tostring(local.recovery_profile.target_rto_minutes) + } serverlessv2_scaling_configuration { max_capacity = var.rds_max_capacity @@ -49,7 +58,7 @@ resource "aws_rds_cluster_instance" "aurora_postgres_instance" { } resource "aws_rds_cluster_instance" "aurora_postgres_reader_instance" { - count = length(local.subnet_azs) + count = local.reader_count identifier = format("%s-reader-%s", local.cluster_identifier, local.subnet_azs[count.index]) cluster_identifier = aws_rds_cluster.aurora_postgres.id @@ -85,3 +94,65 @@ resource "aws_ssm_parameter" "db_readwrite_endpoint_parameter" { type = "String" value = aws_rds_cluster.aurora_postgres.endpoint } + +resource "aws_iam_role" "restore_drill_sfn" { + count = local.restore_drill_state_machine_enabled ? 1 : 0 + + name = "${local.cluster_identifier}-restore-drill-sfn" + assume_role_policy = data.aws_iam_policy_document.restore_drill_sfn_assume.json +} + +resource "aws_iam_role_policy" "restore_drill_sfn" { + count = local.restore_drill_state_machine_enabled ? 1 : 0 + + name = "${local.cluster_identifier}-restore-drill-sfn" + role = aws_iam_role.restore_drill_sfn[count.index].id + policy = data.aws_iam_policy_document.restore_drill_sfn.json +} + +resource "aws_sfn_state_machine" "restore_drill" { + count = local.restore_drill_state_machine_enabled ? 
1 : 0 + + name = "${local.cluster_identifier}-restore-drill" + role_arn = aws_iam_role.restore_drill_sfn[count.index].arn + + definition = local.restore_drill_state_machine_definition +} + +resource "aws_iam_role" "restore_drill_scheduler" { + count = local.restore_drill_schedule_enabled ? 1 : 0 + + name = "${local.cluster_identifier}-restore-drill-scheduler" + assume_role_policy = data.aws_iam_policy_document.restore_drill_scheduler_assume.json +} + +resource "aws_iam_role_policy" "restore_drill_scheduler" { + count = local.restore_drill_schedule_enabled ? 1 : 0 + + name = "${local.cluster_identifier}-restore-drill-scheduler" + role = aws_iam_role.restore_drill_scheduler[count.index].id + policy = data.aws_iam_policy_document.restore_drill_scheduler.json +} + +resource "aws_scheduler_schedule" "restore_drill" { + count = local.restore_drill_schedule_enabled ? 1 : 0 + + name = "${local.cluster_identifier}-restore-drill" + group_name = "default" + schedule_expression = local.restore_drill.schedule_expression + schedule_expression_timezone = "UTC" + state = "ENABLED" + + flexible_time_window { + mode = "OFF" + } + + target { + arn = aws_sfn_state_machine.restore_drill[count.index].arn + role_arn = aws_iam_role.restore_drill_scheduler[count.index].arn + + input = jsonencode({ + trigger_mode = "scheduled" + }) + } +} diff --git a/infra/modules/aws/_shared/database/outputs.tf b/infra/modules/aws/_shared/database/outputs.tf index 445b2282..5bea1015 100644 --- a/infra/modules/aws/_shared/database/outputs.tf +++ b/infra/modules/aws/_shared/database/outputs.tf @@ -33,3 +33,39 @@ output "readonly_endpoint" { output "readwrite_endpoint" { value = aws_rds_cluster.aurora_postgres.endpoint } + +output "recovery_class" { + value = var.recovery_class +} + +output "restore_drill_cadence" { + value = local.recovery_profile.restore_drill_cadence +} + +output "target_rpo_minutes" { + value = local.recovery_profile.target_rpo_minutes +} + +output "target_rto_minutes" { + value = 
local.recovery_profile.target_rto_minutes +} + +output "restore_drill_enabled" { + value = local.restore_drill.enabled +} + +output "restore_drill_mode" { + value = local.restore_drill.mode +} + +output "restore_drill_schedule_expression" { + value = try(local.restore_drill.schedule_expression, null) +} + +output "restore_drill_state_machine_arn" { + value = try(aws_sfn_state_machine.restore_drill[0].arn, null) +} + +output "restore_drill_state_machine_name" { + value = try(aws_sfn_state_machine.restore_drill[0].name, null) +} diff --git a/infra/modules/aws/_shared/database/variables.tf b/infra/modules/aws/_shared/database/variables.tf index ec45c1ed..5ace2266 100644 --- a/infra/modules/aws/_shared/database/variables.tf +++ b/infra/modules/aws/_shared/database/variables.tf @@ -43,10 +43,39 @@ variable "engine_version" { default = "16" } -variable "backup_retention_period" { - type = number - description = "Days to retain automated backups" - default = 7 +variable "recovery_class" { + type = string + description = "Recovery posture preset for the Aurora cluster." + default = "standard" + + validation { + condition = contains(["dev", "standard", "critical"], var.recovery_class) + error_message = "recovery_class must be one of: dev, standard, critical." + } +} + +variable "restore_drill" { + description = "Optional restore-drill automation for this Aurora cluster." + type = object({ + enabled = optional(bool, false) + mode = optional(string, "manual") + use_pitr = optional(bool, true) + retain_hours = optional(number, 4) + }) + default = {} + + validation { + condition = contains( + ["manual", "scheduled", "manual_and_scheduled"], + coalesce(var.restore_drill.mode, "manual"), + ) + error_message = "restore_drill.mode must be manual, scheduled, or manual_and_scheduled." + } + + validation { + condition = coalesce(var.restore_drill.retain_hours, 4) >= 0 + error_message = "restore_drill.retain_hours must be zero or greater." 
+ } } variable "rds_min_capacity" { diff --git a/infra/modules/aws/database/README.md b/infra/modules/aws/database/README.md index 3f4cf9c0..fc50e0f0 100644 --- a/infra/modules/aws/database/README.md +++ b/infra/modules/aws/database/README.md @@ -20,7 +20,8 @@ Concrete Aurora PostgreSQL wrapper. - `database_security_group_id` - `database_port` - `engine_version` -- `backup_retention_period` +- `recovery_class` +- `restore_drill` - `rds_min_capacity` - `rds_max_capacity` - `rds_max_reader_count` @@ -40,3 +41,5 @@ Concrete Aurora PostgreSQL wrapper. This module keeps repo-specific network lookup logic out of `_shared/database`. It selects public or private subnets by `tag:Name` based on `publicly_accessible` and passes the resulting subnet ids into the shared Aurora module. The database credentials outputs point at the Aurora-managed master secret rather than a repo-created fixed-name secret. Aurora reader instances created later by scale-out can be paired with the separate `rds_reader_tagger` stack so new readers inherit the cluster's non-AWS tags. +Use `recovery_class` as the main resilience input and let the shared module derive retention, final-snapshot, deletion-protection, and reader-minimum defaults from that class. +Use `restore_drill` when you want the shared module to also provision the optional restore-drill Step Functions skeleton and any class-derived schedule. 
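A hedged sketch of a caller opting into both (the module source path and input values here are illustrative, not taken from this repo):

```hcl
module "database" {
  source = "../modules/aws/database" # illustrative path

  database_name  = "app"
  recovery_class = "standard"

  restore_drill = {
    enabled      = true
    mode         = "manual_and_scheduled"
    use_pitr     = true
    retain_hours = 4
  }
}
```

With `recovery_class = "standard"`, the derived drill schedule is `rate(30 days)`; switching to `"critical"` tightens it to `rate(7 days)` with no other input changes.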
diff --git a/infra/modules/aws/database/main.tf b/infra/modules/aws/database/main.tf index 82415730..8c1c4324 100644 --- a/infra/modules/aws/database/main.tf +++ b/infra/modules/aws/database/main.tf @@ -9,7 +9,8 @@ module "database" { database_port = var.database_port engine_version = var.engine_version - backup_retention_period = var.backup_retention_period + recovery_class = var.recovery_class + restore_drill = var.restore_drill rds_min_capacity = var.rds_min_capacity rds_max_capacity = var.rds_max_capacity performance_insights_enabled = var.performance_insights_enabled diff --git a/infra/modules/aws/database/variables.tf b/infra/modules/aws/database/variables.tf index 7bcdeefb..7d5fd0a1 100644 --- a/infra/modules/aws/database/variables.tf +++ b/infra/modules/aws/database/variables.tf @@ -43,10 +43,26 @@ variable "engine_version" { default = "16" } -variable "backup_retention_period" { - type = number - description = "Days to retain automated backups" - default = 7 +variable "recovery_class" { + type = string + description = "Recovery posture preset passed through to the shared Aurora module." + default = "standard" + + validation { + condition = contains(["dev", "standard", "critical"], var.recovery_class) + error_message = "recovery_class must be one of: dev, standard, critical." + } +} + +variable "restore_drill" { + description = "Optional restore-drill automation passed through to the shared Aurora module." 
+ type = object({ + enabled = optional(bool, false) + mode = optional(string, "manual") + use_pitr = optional(bool, true) + retain_hours = optional(number, 4) + }) + default = {} } variable "rds_min_capacity" { From 8b5f691d1e7e8b5ca241e881043590160625635d Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 30 Apr 2026 13:13:07 +0100 Subject: [PATCH 2/4] chore: warning for backup profile changes --- AGENTS.md | 2 ++ infra/modules/aws/_shared/database/README.md | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 8ad4f3d3..934599c7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -21,6 +21,8 @@ Update documentation in the same change: - when changing Terraform in a way that introduces any new AWS service surface area or API family, even inside an existing module, review `infra/live/global_vars.hcl` for required `allowed_role_actions` updates in the same change; do not limit this check only to obviously new top-level stack types - before closing any infra change that adds AWS resources, IAM principals, or orchestration services, explicitly verify whether it introduced new permissions for deploy-time creation or mutation and update `infra/live/global_vars.hcl` if needed - when changing the set of deployable Lambda or ECS runtimes, check whether the shared `observability` dashboard still reflects the current runtime surface and update it in the same change if needed +- when changing `infra/modules/aws/_shared/database/**` recovery behavior, restore-drill behavior, backup retention, reader defaults, or other resilience knobs, include a rough cost comparison in the final response that contrasts `dev`, `standard`, and `critical`; keep it qualitative unless current pricing was explicitly requested, and call out that Aurora scratch restore compute/storage dominates drill cost more than Step Functions or Scheduler +- when changing a live database stack's `recovery_class`, include a short conspicuous ANSI-colored warning block in the final 
response in the form "you have changed from X to Y" followed by a brief note about likely cost direction such as higher backup storage, more required readers, or more frequent restore drills; keep it short and awareness-focused rather than explanatory ### Documentation Architecture diff --git a/infra/modules/aws/_shared/database/README.md b/infra/modules/aws/_shared/database/README.md index c89f22ec..d89ef2b8 100644 --- a/infra/modules/aws/_shared/database/README.md +++ b/infra/modules/aws/_shared/database/README.md @@ -120,6 +120,14 @@ The schedule expression is derived from `recovery_class`: - `standard`: `rate(30 days)` - `critical`: `rate(7 days)` +Rough cost guidance by recovery class: + +- `dev`: lowest ongoing cost; 1-day automated backups, no final snapshot on destroy, no required reader instances, no scheduled drill by default +- `standard`: moderate cost increase; 7-day backups, final snapshot on destroy, at least 1 reader when multiple subnet AZs are available, monthly scheduled drill if enabled +- `critical`: highest ongoing cost; 35-day backups, final snapshot on destroy, at least 2 readers when enough subnet AZs are available, weekly scheduled drill if enabled + +The largest drill-related cost is the temporary restored Aurora cluster and scratch writer instance. Step Functions and EventBridge Scheduler usually contribute negligible cost compared with Aurora compute and storage. + The current Step Functions skeleton: 1. 
restores a temporary Aurora cluster from PITR From 29f231a84c3e44dc88eb94b1ff609fdc1b489164 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 30 Apr 2026 15:52:34 +0100 Subject: [PATCH 3/4] feat: state machine for backups --- infra/modules/aws/_shared/database/README.md | 27 ++++++ infra/modules/aws/_shared/database/data.tf | 10 +++ infra/modules/aws/_shared/database/local.tf | 87 ++++++++++++++++++- infra/modules/aws/_shared/database/main.tf | 13 ++- infra/modules/aws/_shared/database/outputs.tf | 12 +++ .../modules/aws/_shared/database/variables.tf | 8 ++ infra/modules/aws/database/README.md | 2 + infra/modules/aws/database/main.tf | 1 + infra/modules/aws/database/variables.tf | 8 ++ 9 files changed, 163 insertions(+), 5 deletions(-) diff --git a/infra/modules/aws/_shared/database/README.md b/infra/modules/aws/_shared/database/README.md index d89ef2b8..dba8f4bf 100644 --- a/infra/modules/aws/_shared/database/README.md +++ b/infra/modules/aws/_shared/database/README.md @@ -31,6 +31,7 @@ Shared Aurora PostgreSQL Serverless v2 module. - `engine_version` - `recovery_class` - `restore_drill` +- `manual_snapshot` - `rds_min_capacity` - `rds_max_capacity` - `rds_max_reader_count` @@ -55,6 +56,9 @@ Shared Aurora PostgreSQL Serverless v2 module. - `restore_drill_schedule_expression` - `restore_drill_state_machine_arn` - `restore_drill_state_machine_name` +- `manual_snapshot_enabled` +- `manual_snapshot_state_machine_arn` +- `manual_snapshot_state_machine_name` This module is intentionally Aurora PostgreSQL Serverless v2 specific. It does not currently support provisioned RDS instances or non-Postgres engines. In this repo the concrete `database` wrapper resolves the VPC and public or private subnet ids, while the shared infra workflow injects `database_security_group_id` from the `security` stack via `TF_VAR_database_security_group_id`. @@ -138,3 +142,26 @@ The current Step Functions skeleton: 6. 
deletes the temporary instance and cluster This first version does not yet run application-level validation against the restored database. It proves restore orchestration and cleanup only. Add a dedicated validation Lambda or ECS task later once the restore path itself is stable. + +## Manual Snapshot + +The shared module can also provision an opt-in manual snapshot trigger. This is separate from the restore drill: + +- `manual_snapshot` creates a named Aurora cluster snapshot on demand +- `restore_drill` restores a temporary cluster and validates the recovery path + +Example: + +```hcl +manual_snapshot = { + enabled = true +} +``` + +When enabled, the module creates a second Step Functions state machine that: + +1. builds a unique snapshot identifier +2. creates a manual Aurora cluster snapshot +3. waits until the snapshot reaches `available` + +Use the `manual_snapshot_state_machine_arn` or `manual_snapshot_state_machine_name` output to start it manually from the Step Functions console or CLI. 
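The identifier built in step 1 can be sketched outside of ASL. The following is a hypothetical Python mirror of what the state machine's `States.UUID()` and `States.StringSplit` intrinsics produce (the prefix value in the comment is made up for illustration):

```python
import uuid

def build_manual_snapshot_identifier(prefix: str) -> str:
    # Mirrors the ASL intrinsics: take the first UUID segment
    # (8 hex chars) and append it to the Terraform-derived prefix.
    suffix = str(uuid.uuid4()).split("-")[0]
    return f"{prefix}-{suffix}"

# e.g. "myapp-dev-manual" -> "myapp-dev-manual-<8 hex chars>"; suffix varies per run
```

To start the state machine from the CLI, pass the exported ARN to `aws stepfunctions start-execution --state-machine-arn <manual_snapshot_state_machine_arn>`.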
diff --git a/infra/modules/aws/_shared/database/data.tf b/infra/modules/aws/_shared/database/data.tf index c46b766a..a11b3fcd 100644 --- a/infra/modules/aws/_shared/database/data.tf +++ b/infra/modules/aws/_shared/database/data.tf @@ -36,6 +36,16 @@ data "aws_iam_policy_document" "restore_drill_sfn" { ] resources = ["*"] } + + statement { + sid = "CreateAndDescribeManualSnapshots" + effect = "Allow" + actions = [ + "rds:CreateDBClusterSnapshot", + "rds:DescribeDBClusterSnapshots", + ] + resources = ["*"] + } } data "aws_iam_policy_document" "restore_drill_scheduler_assume" { diff --git a/infra/modules/aws/_shared/database/local.tf b/infra/modules/aws/_shared/database/local.tf index 1ce9dbfe..d7a95980 100644 --- a/infra/modules/aws/_shared/database/local.tf +++ b/infra/modules/aws/_shared/database/local.tf @@ -78,14 +78,24 @@ locals { var.restore_drill, ) + manual_snapshot = merge( + { + enabled = false + }, + var.manual_snapshot, + ) + restore_drill_state_machine_enabled = local.restore_drill.enabled restore_drill_schedule_enabled = local.restore_drill.enabled && contains( ["scheduled", "manual_and_scheduled"], local.restore_drill.mode, ) && local.restore_drill.schedule_expression != null - restore_drill_identifier_prefix = substr("${local.cluster_identifier}-drill", 0, 30) - restore_drill_instance_class = "db.serverless" - restore_drill_retention_seconds = local.restore_drill.retain_hours * 3600 + manual_snapshot_state_machine_enabled = local.manual_snapshot.enabled + snapshot_workflow_role_enabled = local.restore_drill_state_machine_enabled || local.manual_snapshot_state_machine_enabled + restore_drill_identifier_prefix = substr("${local.cluster_identifier}-drill", 0, 30) + manual_snapshot_identifier_prefix = substr("${local.cluster_identifier}-manual", 0, 32) + restore_drill_instance_class = "db.serverless" + restore_drill_retention_seconds = local.restore_drill.retain_hours * 3600 restore_drill_state_machine_definition = jsonencode({ Comment = "Restore-drill 
skeleton for ${local.cluster_identifier}"
@@ -245,4 +255,77 @@ locals {
       }
     }
   })
+
+  manual_snapshot_state_machine_definition = jsonencode({
+    Comment = "Manual snapshot trigger for ${local.cluster_identifier}"
+    StartAt = "PrepareSnapshotContext"
+    States = {
+      PrepareSnapshotContext = {
+        Type = "Pass"
+        Parameters = {
+          "source_cluster_identifier" = aws_rds_cluster.aurora_postgres.cluster_identifier
+          "snapshot_suffix.$"         = "States.ArrayGetItem(States.StringSplit(States.UUID(), '-'), 0)"
+        }
+        Next = "BuildSnapshotIdentifier"
+      }
+      BuildSnapshotIdentifier = {
+        Type = "Pass"
+        Parameters = {
+          "source_cluster_identifier.$" = "$.source_cluster_identifier"
+          "snapshot_suffix.$"           = "$.snapshot_suffix"
+          "snapshot_identifier.$"       = format("States.Format('{}-{}', '%s', $.snapshot_suffix)", local.manual_snapshot_identifier_prefix)
+        }
+        Next = "CreateClusterSnapshot"
+      }
+      CreateClusterSnapshot = {
+        Type     = "Task"
+        Resource = "arn:aws:states:::aws-sdk:rds:createDBClusterSnapshot"
+        Parameters = {
+          "DBClusterIdentifier.$"         = "$.source_cluster_identifier"
+          "DBClusterSnapshotIdentifier.$" = "$.snapshot_identifier"
+          "Tags" = [
+            {
+              "Key"   = "ManualSnapshot"
+              "Value" = "true"
+            },
+            {
+              "Key"   = "SourceCluster"
+              "Value" = aws_rds_cluster.aurora_postgres.cluster_identifier
+            },
+          ]
+        }
+        # Keep the state input intact: the default ResultPath of "$" would let the task result overwrite "$.snapshot_identifier", breaking DescribeSnapshot below.
+        ResultPath = "$.created_snapshot"
+        Next       = "WaitForSnapshot"
+      }
+      WaitForSnapshot = {
+        Type    = "Wait"
+        Seconds = 60
+        Next    = "DescribeSnapshot"
+      }
+      DescribeSnapshot = {
+        Type     = "Task"
+        Resource = "arn:aws:states:::aws-sdk:rds:describeDBClusterSnapshots"
+        Parameters = {
+          "DBClusterSnapshotIdentifier.$" = "$.snapshot_identifier"
+        }
+        ResultPath = "$.snapshot_status"
+        Next       = "SnapshotReady"
+      }
+      SnapshotReady = {
+        Type = "Choice"
+        Choices = [
+          {
+            Variable     = "$.snapshot_status.DBClusterSnapshots[0].Status"
+            StringEquals = "available"
+            Next         = "SnapshotCreated"
+          },
+        ]
+        Default = "WaitForSnapshot"
+      }
+      SnapshotCreated = {
+        Type = "Succeed"
+      }
+    }
+  })
 }
diff --git a/infra/modules/aws/_shared/database/main.tf
b/infra/modules/aws/_shared/database/main.tf index 12844986..40dbbda9 100644 --- a/infra/modules/aws/_shared/database/main.tf +++ b/infra/modules/aws/_shared/database/main.tf @@ -96,14 +96,14 @@ resource "aws_ssm_parameter" "db_readwrite_endpoint_parameter" { } resource "aws_iam_role" "restore_drill_sfn" { - count = local.restore_drill_state_machine_enabled ? 1 : 0 + count = local.snapshot_workflow_role_enabled ? 1 : 0 name = "${local.cluster_identifier}-restore-drill-sfn" assume_role_policy = data.aws_iam_policy_document.restore_drill_sfn_assume.json } resource "aws_iam_role_policy" "restore_drill_sfn" { - count = local.restore_drill_state_machine_enabled ? 1 : 0 + count = local.snapshot_workflow_role_enabled ? 1 : 0 name = "${local.cluster_identifier}-restore-drill-sfn" role = aws_iam_role.restore_drill_sfn[count.index].id @@ -119,6 +119,15 @@ resource "aws_sfn_state_machine" "restore_drill" { definition = local.restore_drill_state_machine_definition } +resource "aws_sfn_state_machine" "manual_snapshot" { + count = local.manual_snapshot_state_machine_enabled ? 1 : 0 + + name = "${local.cluster_identifier}-manual-snapshot" + role_arn = aws_iam_role.restore_drill_sfn[0].arn + + definition = local.manual_snapshot_state_machine_definition +} + resource "aws_iam_role" "restore_drill_scheduler" { count = local.restore_drill_schedule_enabled ? 
1 : 0 diff --git a/infra/modules/aws/_shared/database/outputs.tf b/infra/modules/aws/_shared/database/outputs.tf index 5bea1015..b062f722 100644 --- a/infra/modules/aws/_shared/database/outputs.tf +++ b/infra/modules/aws/_shared/database/outputs.tf @@ -69,3 +69,15 @@ output "restore_drill_state_machine_arn" { output "restore_drill_state_machine_name" { value = try(aws_sfn_state_machine.restore_drill[0].name, null) } + +output "manual_snapshot_enabled" { + value = local.manual_snapshot.enabled +} + +output "manual_snapshot_state_machine_arn" { + value = try(aws_sfn_state_machine.manual_snapshot[0].arn, null) +} + +output "manual_snapshot_state_machine_name" { + value = try(aws_sfn_state_machine.manual_snapshot[0].name, null) +} diff --git a/infra/modules/aws/_shared/database/variables.tf b/infra/modules/aws/_shared/database/variables.tf index 5ace2266..f4d5be14 100644 --- a/infra/modules/aws/_shared/database/variables.tf +++ b/infra/modules/aws/_shared/database/variables.tf @@ -78,6 +78,14 @@ variable "restore_drill" { } } +variable "manual_snapshot" { + description = "Optional manual snapshot trigger for this Aurora cluster." + type = object({ + enabled = optional(bool, false) + }) + default = {} +} + variable "rds_min_capacity" { type = number description = "Minimum Aurora Serverless v2 capacity in ACUs" diff --git a/infra/modules/aws/database/README.md b/infra/modules/aws/database/README.md index fc50e0f0..0d8e65d3 100644 --- a/infra/modules/aws/database/README.md +++ b/infra/modules/aws/database/README.md @@ -22,6 +22,7 @@ Concrete Aurora PostgreSQL wrapper. - `engine_version` - `recovery_class` - `restore_drill` +- `manual_snapshot` - `rds_min_capacity` - `rds_max_capacity` - `rds_max_reader_count` @@ -43,3 +44,4 @@ The database credentials outputs point at the Aurora-managed master secret rathe Aurora reader instances created later by scale-out can be paired with the separate `rds_reader_tagger` stack so new readers inherit the cluster's non-AWS tags. 
Use `recovery_class` as the main resilience input and let the shared module derive retention, final-snapshot, deletion-protection, and reader-minimum defaults from that class. Use `restore_drill` when you want the shared module to also provision the optional restore-drill Step Functions skeleton and any class-derived schedule. +Use `manual_snapshot` when you want the shared module to also provision a separate on-demand manual snapshot Step Functions trigger. diff --git a/infra/modules/aws/database/main.tf b/infra/modules/aws/database/main.tf index 8c1c4324..ab7e13a6 100644 --- a/infra/modules/aws/database/main.tf +++ b/infra/modules/aws/database/main.tf @@ -11,6 +11,7 @@ module "database" { recovery_class = var.recovery_class restore_drill = var.restore_drill + manual_snapshot = var.manual_snapshot rds_min_capacity = var.rds_min_capacity rds_max_capacity = var.rds_max_capacity performance_insights_enabled = var.performance_insights_enabled diff --git a/infra/modules/aws/database/variables.tf b/infra/modules/aws/database/variables.tf index 7d5fd0a1..76017e4e 100644 --- a/infra/modules/aws/database/variables.tf +++ b/infra/modules/aws/database/variables.tf @@ -65,6 +65,14 @@ variable "restore_drill" { default = {} } +variable "manual_snapshot" { + description = "Optional manual snapshot trigger passed through to the shared Aurora module." 
+ type = object({ + enabled = optional(bool, false) + }) + default = {} +} + variable "rds_min_capacity" { type = number description = "Minimum Aurora Serverless v2 capacity in ACUs" From aa170252b9331dcfd7507bb09f1d0500d02394b7 Mon Sep 17 00:00:00 2001 From: chrispsheehan Date: Thu, 30 Apr 2026 16:51:48 +0100 Subject: [PATCH 4/4] chore: add destroy step for snapshots --- .github/docs/README.md | 3 +- .github/workflows/destroy.yml | 25 +++++++++++++ infra/modules/aws/_shared/database/README.md | 2 ++ infra/modules/aws/_shared/database/outputs.tf | 4 +++ infra/modules/aws/database/README.md | 1 + infra/modules/aws/database/outputs.tf | 4 +++ justfile.deploy | 35 +++++++++++++++++++ 7 files changed, 73 insertions(+), 1 deletion(-) diff --git a/.github/docs/README.md b/.github/docs/README.md index 59171313..afa8a927 100644 --- a/.github/docs/README.md +++ b/.github/docs/README.md @@ -128,7 +128,7 @@ flowchart LR ### Cleanup And Discovery - `destroy.yml` - Tears down app layers before shared dependencies, including the shared observability dashboard and any environment-owned shared artifact stacks such as the `dev` code bucket. + Tears down app layers before shared dependencies, including the shared observability dashboard and any environment-owned shared artifact stacks such as the `dev` code bucket. In the `database` job, `dev` now runs `tg_action: init` first to read Terraform outputs from the database stack, then passes `cluster_identifier` and `manual_snapshot_identifier_prefix` into `justfile.deploy` so the cleanup recipe deletes only repo-owned manual Aurora cluster snapshots before Terragrunt destroy. `prod` intentionally retains those manual snapshots. - `shared_directories_get.yml` Derives the directory-based matrices used by wrapper workflows and PR action-test discovery. @@ -206,6 +206,7 @@ Run these checks on every CI, workflow, or deploy-contract change. 
- confirm destroy ordering still removes downstream consumers before shared stacks - check required Terraform variables on destroy as well as apply - prefer depending on real downstream consumers rather than serializing unrelated shared stacks +- when a runtime or module creates manual backup artifacts outside Terraform resource ownership, decide explicitly whether destroy should delete or retain them by environment and keep that behavior documented in `destroy.yml` contracts ## Wrapper Workflow Summary diff --git a/.github/workflows/destroy.yml b/.github/workflows/destroy.yml index cf9c7075..92deee1e 100644 --- a/.github/workflows/destroy.yml +++ b/.github/workflows/destroy.yml @@ -206,6 +206,31 @@ jobs: role-to-assume: ${{ env.AWS_OIDC_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} + - name: Get database infra outputs + if: inputs.environment != 'prod' + id: get-database + uses: ./.github/actions/terragrunt + env: + TF_VAR_database_security_group_id: "destroy-placeholder" + with: + tg_directory: infra/live/${{ inputs.environment }}/aws/database + tg_action: init + + - name: Delete dev manual database snapshots + if: inputs.environment != 'prod' + uses: ./.github/actions/just + env: + TG_OUTPUTS: ${{ steps.get-database.outputs.tg_outputs }} + CLUSTER_IDENTIFIER: ${{ fromJson(steps.get-database.outputs.tg_outputs).cluster_identifier.value }} + MANUAL_SNAPSHOT_PREFIX: ${{ fromJson(steps.get-database.outputs.tg_outputs).manual_snapshot_identifier_prefix.value }} + with: + justfile_path: justfile.deploy + just_action: database-delete-manual-snapshots + + - name: Keep prod manual database snapshots + if: inputs.environment == 'prod' + run: echo "Retaining prod manual database snapshots." 
+ - name: Destroy database infra uses: ./.github/actions/terragrunt env: diff --git a/infra/modules/aws/_shared/database/README.md b/infra/modules/aws/_shared/database/README.md index dba8f4bf..3d8a1049 100644 --- a/infra/modules/aws/_shared/database/README.md +++ b/infra/modules/aws/_shared/database/README.md @@ -59,6 +59,7 @@ Shared Aurora PostgreSQL Serverless v2 module. - `manual_snapshot_enabled` - `manual_snapshot_state_machine_arn` - `manual_snapshot_state_machine_name` +- `manual_snapshot_identifier_prefix` This module is intentionally Aurora PostgreSQL Serverless v2 specific. It does not currently support provisioned RDS instances or non-Postgres engines. In this repo the concrete `database` wrapper resolves the VPC and public or private subnet ids, while the shared infra workflow injects `database_security_group_id` from the `security` stack via `TF_VAR_database_security_group_id`. @@ -165,3 +166,4 @@ When enabled, the module creates a second Step Functions state machine that: 3. waits until the snapshot reaches `available` Use the `manual_snapshot_state_machine_arn` or `manual_snapshot_state_machine_name` output to start it manually from the Step Functions console or CLI. +The module also exposes `manual_snapshot_identifier_prefix` so destroy or cleanup paths can delete only the repo-owned manual snapshots without re-deriving the naming contract outside Terraform. 
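The naming contract behind that prefix is small enough to state precisely. A hypothetical Python mirror of the Terraform local `substr("${cluster_identifier}-manual", 0, 32)`, showing why the resulting snapshot ids stay under the RDS identifier length limit:

```python
def manual_snapshot_identifier_prefix(cluster_identifier: str) -> str:
    # Mirrors the Terraform local: substr("${cluster_identifier}-manual", 0, 32)
    return f"{cluster_identifier}-manual"[:32]

# A 32-char prefix plus "-" plus an 8-char UUID segment is at most 41
# characters, well under the 63-character RDS snapshot identifier limit.
```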
diff --git a/infra/modules/aws/_shared/database/outputs.tf b/infra/modules/aws/_shared/database/outputs.tf index b062f722..d97daa9b 100644 --- a/infra/modules/aws/_shared/database/outputs.tf +++ b/infra/modules/aws/_shared/database/outputs.tf @@ -81,3 +81,7 @@ output "manual_snapshot_state_machine_arn" { output "manual_snapshot_state_machine_name" { value = try(aws_sfn_state_machine.manual_snapshot[0].name, null) } + +output "manual_snapshot_identifier_prefix" { + value = local.manual_snapshot_identifier_prefix +} diff --git a/infra/modules/aws/database/README.md b/infra/modules/aws/database/README.md index 0d8e65d3..fa354e59 100644 --- a/infra/modules/aws/database/README.md +++ b/infra/modules/aws/database/README.md @@ -38,6 +38,7 @@ Concrete Aurora PostgreSQL wrapper. - `database_port` - `readonly_endpoint` - `readwrite_endpoint` +- `manual_snapshot_identifier_prefix` This module keeps repo-specific network lookup logic out of `_shared/database`. It selects public or private subnets by `tag:Name` based on `publicly_accessible` and passes the resulting subnet ids into the shared Aurora module. The database credentials outputs point at the Aurora-managed master secret rather than a repo-created fixed-name secret. 
diff --git a/infra/modules/aws/database/outputs.tf b/infra/modules/aws/database/outputs.tf index 93afbf47..4950fa17 100644 --- a/infra/modules/aws/database/outputs.tf +++ b/infra/modules/aws/database/outputs.tf @@ -33,3 +33,7 @@ output "readonly_endpoint" { output "readwrite_endpoint" { value = module.database.readwrite_endpoint } + +output "manual_snapshot_identifier_prefix" { + value = module.database.manual_snapshot_identifier_prefix +} diff --git a/justfile.deploy b/justfile.deploy index 1f989600..092c4d5c 100644 --- a/justfile.deploy +++ b/justfile.deploy @@ -667,3 +667,38 @@ ecs-rolling-deploy: --services "$SERVICE_NAME" echo "✅ ECS rolling deployment completed for $SERVICE_NAME" + + +# Delete repo-owned manual Aurora cluster snapshots for an environment. +database-delete-manual-snapshots: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "${CLUSTER_IDENTIFIER:-}" ]]; then + echo "❌ CLUSTER_IDENTIFIER environment variable is not set." + exit 1 + fi + + if [[ -z "${MANUAL_SNAPSHOT_PREFIX:-}" ]]; then + echo "❌ MANUAL_SNAPSHOT_PREFIX environment variable is not set." + exit 1 + fi + + snapshot_ids="$( + aws rds describe-db-cluster-snapshots \ + --db-cluster-identifier "$CLUSTER_IDENTIFIER" \ + --snapshot-type manual \ + --query "DBClusterSnapshots[?starts_with(DBClusterSnapshotIdentifier, \`${MANUAL_SNAPSHOT_PREFIX}\`)].DBClusterSnapshotIdentifier" \ + --output text || true + )" + + if [[ -z "$snapshot_ids" || "$snapshot_ids" == "None" ]]; then + echo "No repo-owned manual snapshots found for $CLUSTER_IDENTIFIER" + exit 0 + fi + + for snapshot_id in $snapshot_ids; do + echo "Deleting manual snapshot $snapshot_id" + aws rds delete-db-cluster-snapshot \ + --db-cluster-snapshot-identifier "$snapshot_id" + done
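The recipe's JMESPath `starts_with` filter can be sketched in Python against sample data. The snapshot identifiers below are made up, and the real `describe-db-cluster-snapshots` call additionally restricts results server-side with `--snapshot-type manual`:

```python
def select_repo_owned_snapshots(snapshots: list[dict], prefix: str) -> list[str]:
    # Mirrors: DBClusterSnapshots[?starts_with(DBClusterSnapshotIdentifier, `prefix`)]
    return [
        s["DBClusterSnapshotIdentifier"]
        for s in snapshots
        if s["DBClusterSnapshotIdentifier"].startswith(prefix)
    ]

sample = [
    {"DBClusterSnapshotIdentifier": "myapp-dev-manual-3f9c1a2b"},  # repo-owned
    {"DBClusterSnapshotIdentifier": "operator-adhoc-backup"},      # left untouched
]
# select_repo_owned_snapshots(sample, "myapp-dev-manual") keeps only the first id,
# so operator-created snapshots survive the destroy-time cleanup.
```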