From 085964ac38c51559cf583e68469cc20c750d4962 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Mon, 1 Jun 2026 15:28:17 -0300 Subject: [PATCH 01/24] feat: add dynamic assume role support via scope-configurations provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When assume_role.arn is set in the scope-configurations provider, the agent's base credentials (IRSA) are used only to call sts:AssumeRole; all subsequent AWS calls (CLI + Tofu) run under the target role. Falls back to ASSUME_ROLE_ARN_DEFAULT in values.yaml if the provider key is absent. When neither is set, behavior is unchanged — pod credentials (IRSA) are used directly. - New utils/assume_role: sourceable helper that exports temporary credentials - fetch_scope_configuration: reads assume_role.arn from scope-configurations provider and applies the role immediately after config is fetched - diagnose/build_context: explicit assume_role sourcing (only build_context that bypasses fetch_scope_configuration) - values.yaml: documents ASSUME_ROLE_ARN_DEFAULT as fallback config option Co-Authored-By: Claude Sonnet 4.6 --- lambda/diagnose/build_context | 1 + lambda/utils/assume_role | 34 ++++++++++++++++++++++++++ lambda/utils/fetch_scope_configuration | 9 +++++++ lambda/values.yaml | 7 ++++++ 4 files changed, 51 insertions(+) create mode 100755 lambda/utils/assume_role diff --git a/lambda/diagnose/build_context b/lambda/diagnose/build_context index 1ef14b6..f234a67 100755 --- a/lambda/diagnose/build_context +++ b/lambda/diagnose/build_context @@ -15,6 +15,7 @@ if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then fi source "$SERVICE_PATH/utils/lambda_function_name" +source "$SERVICE_PATH/utils/assume_role" lambda_info=$(aws lambda get-function --function-name "$LAMBDA_FUNCTION_NAME" --output json 2>/dev/null || echo "{}") LAMBDA_FUNCTION_ARN=$(echo "$lambda_info" | jq -r '.Configuration.FunctionArn // ""') diff --git a/lambda/utils/assume_role 
b/lambda/utils/assume_role new file mode 100755 index 0000000..66cfe16 --- /dev/null +++ b/lambda/utils/assume_role @@ -0,0 +1,34 @@ +#!/bin/bash +# Sourceable helper — do NOT execute directly. +# Reads ASSUME_ROLE_ARN from environment. If set, calls sts:AssumeRole and exports +# temporary credentials so all subsequent AWS calls (CLI + Tofu) use that role. +# If empty, does nothing — pod's IRSA handles auth directly. +# +# Requires: aws CLI, jq +# Expects: ASSUME_ROLE_ARN (exported by fetch_scope_configuration or values.yaml) +# SCOPE_ID (optional, used for the session name) + +_ar_log() { + if declare -f log > /dev/null 2>&1; then + log "$1" "$2" + else + echo "$2" + fi +} + +if [ -n "${ASSUME_ROLE_ARN:-}" ]; then + _ar_log info " 🔑 Assuming role: $ASSUME_ROLE_ARN" + + ASSUMED_CREDS=$(aws sts assume-role \ + --role-arn "$ASSUME_ROLE_ARN" \ + --role-session-name "np-lambda-${SCOPE_ID:-workflow}" \ + --output json) + + export AWS_ACCESS_KEY_ID=$(echo "$ASSUMED_CREDS" | jq -r '.Credentials.AccessKeyId') + export AWS_SECRET_ACCESS_KEY=$(echo "$ASSUMED_CREDS" | jq -r '.Credentials.SecretAccessKey') + export AWS_SESSION_TOKEN=$(echo "$ASSUMED_CREDS" | jq -r '.Credentials.SessionToken') + + _ar_log info " ✅ Role assumed successfully" +else + _ar_log debug " ✅ assume_role=skipped (using pod credentials)" +fi diff --git a/lambda/utils/fetch_scope_configuration b/lambda/utils/fetch_scope_configuration index 63429b7..ab79244 100755 --- a/lambda/utils/fetch_scope_configuration +++ b/lambda/utils/fetch_scope_configuration @@ -76,6 +76,11 @@ log debug " ✅ placeholder_image_uri=$PLACEHOLDER_IMAGE_URI" NULL_AGENT_LAYER_ARN=$(echo "$SCOPE_CONFIG" | jq -r '.agent.null_agent_layer_arn // empty') log debug " ✅ null_agent_layer_arn=$NULL_AGENT_LAYER_ARN" +# From scope-configurations category (optional — fallback to env var set in values.yaml) +ASSUME_ROLE_ARN=$(echo "$SCOPE_CONFIG" | jq -r '.assume_role.arn // empty') +ASSUME_ROLE_ARN="${ASSUME_ROLE_ARN:-${ASSUME_ROLE_ARN_DEFAULT:-}}" 
+log debug " ✅ assume_role_arn=${ASSUME_ROLE_ARN:-(not set, using pod credentials)}" + export ALB_PUBLIC_LISTENER_ARN export ALB_PRIVATE_LISTENER_ARN export VPC_ID @@ -88,5 +93,9 @@ export HOSTED_PRIVATE_ZONE_ID export TOFU_STATE_BUCKET export PLACEHOLDER_IMAGE_URI export NULL_AGENT_LAYER_ARN +export ASSUME_ROLE_ARN + +# Apply assume role immediately so all subsequent AWS calls run under the target role +source "$SERVICE_PATH/utils/assume_role" log info "✨ Scope configuration fetched successfully" diff --git a/lambda/values.yaml b/lambda/values.yaml index f8891bd..280e930 100644 --- a/lambda/values.yaml +++ b/lambda/values.yaml @@ -35,6 +35,13 @@ configuration: # ── Null Agent ───────────────────────────────────────────────────────────── USE_NULL_AGENT: false + # ── Assume Role ──────────────────────────────────────────────────────────── + # IAM role ARN to assume before any AWS operation. + # Recommended: set via the scope-configurations provider key assume_role.arn + # so it's managed per-account without changing code. + # This value is only used if the provider does not supply assume_role.arn. 
+ ASSUME_ROLE_ARN_DEFAULT: "" + # ── IAM ──────────────────────────────────────────────────────────────────── IAM_PROPAGATION_WAIT_SECONDS: 20 From d9ff61bbf712c2c39ea3479338aef5bf1afb88c7 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Mon, 1 Jun 2026 15:48:56 -0300 Subject: [PATCH 02/24] feat: add requirements module with IAM policies for Lambda scope operations Creates 4 IAM policies covering all AWS operations needed by the lambda scope: - lambda_policy: Lambda CRUD, versions, aliases, concurrency - lambda_iam_policy: execution role management (nullplatform-* and np-lambda-*) - lambda_networking_policy: API Gateway, ALB, Route53 - lambda_storage_policy: ECR, Secrets Manager, CloudWatch, S3 tfstate Co-Authored-By: Claude Sonnet 4.6 --- lambda/requirements/main.tf | 299 +++++++++++++++++++++++++++++++ lambda/requirements/output.tf | 29 +++ lambda/requirements/variables.tf | 22 +++ 3 files changed, 350 insertions(+) create mode 100644 lambda/requirements/main.tf create mode 100644 lambda/requirements/output.tf create mode 100644 lambda/requirements/variables.tf diff --git a/lambda/requirements/main.tf b/lambda/requirements/main.tf new file mode 100644 index 0000000..1939c7e --- /dev/null +++ b/lambda/requirements/main.tf @@ -0,0 +1,299 @@ +################################################################################ +# IAM role (only when create_role = true) +################################################################################ + +resource "aws_iam_role" "nullplatform_lambda_role" { + count = var.create_role ? 
1 : 0 + name = "nullplatform_${var.name}_lambda_role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { AWS = var.trusted_arns } + Action = "sts:AssumeRole" + } + ] + }) +} + +################################################################################ +# Policy attachments +################################################################################ + +locals { + effective_role_name = var.create_role ? aws_iam_role.nullplatform_lambda_role[0].name : var.role_name + attach_policies = var.create_role || var.role_name != null +} + +resource "aws_iam_role_policy_attachment" "lambda" { + count = local.attach_policies ? 1 : 0 + role = local.effective_role_name + policy_arn = aws_iam_policy.nullplatform_lambda_policy.arn +} + +resource "aws_iam_role_policy_attachment" "lambda_iam" { + count = local.attach_policies ? 1 : 0 + role = local.effective_role_name + policy_arn = aws_iam_policy.nullplatform_lambda_iam_policy.arn +} + +resource "aws_iam_role_policy_attachment" "lambda_networking" { + count = local.attach_policies ? 1 : 0 + role = local.effective_role_name + policy_arn = aws_iam_policy.nullplatform_lambda_networking_policy.arn +} + +resource "aws_iam_role_policy_attachment" "lambda_storage" { + count = local.attach_policies ? 1 : 0 + role = local.effective_role_name + policy_arn = aws_iam_policy.nullplatform_lambda_storage_policy.arn +} + +################################################################################ +# Lambda core policy +# Manages Lambda functions, versions, aliases, concurrency, and invocations. 
+################################################################################ + +resource "aws_iam_policy" "nullplatform_lambda_policy" { + name = "nullplatform_${var.name}_lambda_policy" + description = "Policy for managing Lambda functions provisioned by the scopes-lambda provider" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "lambda:CreateFunction", + "lambda:DeleteFunction", + "lambda:GetFunction", + "lambda:GetFunctionConfiguration", + "lambda:GetFunctionConcurrency", + "lambda:UpdateFunctionCode", + "lambda:UpdateFunctionConfiguration", + "lambda:PublishVersion", + "lambda:ListVersionsByFunction", + "lambda:GetAlias", + "lambda:ListAliases", + "lambda:CreateAlias", + "lambda:UpdateAlias", + "lambda:DeleteAlias", + "lambda:InvokeFunction", + "lambda:PutFunctionConcurrency", + "lambda:DeleteFunctionConcurrency", + "lambda:PutProvisionedConcurrencyConfig", + "lambda:DeleteProvisionedConcurrencyConfig", + "lambda:GetProvisionedConcurrencyConfig", + "lambda:GetAccountSettings", + "lambda:AddPermission", + "lambda:RemovePermission", + "lambda:TagResource", + "lambda:UntagResource", + "lambda:ListTags" + ] + Resource = "*" + } + ] + }) +} + +################################################################################ +# IAM management policy +# Creates and manages Lambda execution roles (scoped to nullplatform roles). 
+################################################################################ + +resource "aws_iam_policy" "nullplatform_lambda_iam_policy" { + name = "nullplatform_${var.name}_lambda_iam_policy" + description = "Policy for managing IAM execution roles for Lambda scopes" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "iam:CreateRole", + "iam:GetRole", + "iam:DeleteRole", + "iam:PutRolePolicy", + "iam:GetRolePolicy", + "iam:DeleteRolePolicy", + "iam:ListRolePolicies", + "iam:AttachRolePolicy", + "iam:DetachRolePolicy", + "iam:ListAttachedRolePolicies", + "iam:TagRole", + "iam:UntagRole", + "iam:PassRole" + ] + Resource = [ + "arn:aws:iam::*:role/nullplatform-*", + "arn:aws:iam::*:role/np-lambda-*" + ] + }, + { + Effect = "Allow" + Action = ["sts:GetCallerIdentity"] + Resource = "*" + } + ] + }) +} + +################################################################################ +# Networking policy +# API Gateway (HTTP APIs), ALB (target groups + listener rules), Route53 DNS. 
+################################################################################ + +resource "aws_iam_policy" "nullplatform_lambda_networking_policy" { + name = "nullplatform_${var.name}_lambda_networking_policy" + description = "Policy for managing API Gateway, ALB, and Route53 for Lambda scopes" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "apigateway:GET", + "apigateway:POST", + "apigateway:PUT", + "apigateway:PATCH", + "apigateway:DELETE", + "apigateway:TagResource", + "apigateway:UntagResource" + ] + Resource = "*" + }, + { + Effect = "Allow" + Action = [ + "elasticloadbalancing:CreateTargetGroup", + "elasticloadbalancing:DeleteTargetGroup", + "elasticloadbalancing:ModifyTargetGroup", + "elasticloadbalancing:ModifyTargetGroupAttributes", + "elasticloadbalancing:DescribeTargetGroups", + "elasticloadbalancing:DescribeTargetGroupAttributes", + "elasticloadbalancing:RegisterTargets", + "elasticloadbalancing:DeregisterTargets", + "elasticloadbalancing:DescribeTargetHealth", + "elasticloadbalancing:CreateListenerRule", + "elasticloadbalancing:DeleteListenerRule", + "elasticloadbalancing:ModifyListenerRule", + "elasticloadbalancing:DescribeRules", + "elasticloadbalancing:DescribeListeners", + "elasticloadbalancing:AddTags", + "elasticloadbalancing:RemoveTags" + ] + Resource = "*" + }, + { + Effect = "Allow" + Action = [ + "route53:ChangeResourceRecordSets", + "route53:GetHostedZone", + "route53:ListResourceRecordSets", + "route53:ListHostedZones" + ] + Resource = "*" + } + ] + }) +} + +################################################################################ +# Storage & Observability policy +# ECR (placeholder image), Secrets Manager (deployment parameters), +# CloudWatch Logs & Metrics, S3 (tfstate bucket). 
+################################################################################ + +resource "aws_iam_policy" "nullplatform_lambda_storage_policy" { + name = "nullplatform_${var.name}_lambda_storage_policy" + description = "Policy for ECR, Secrets Manager, CloudWatch, and S3 tfstate for Lambda scopes" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "ECR" + Effect = "Allow" + Action = [ + "ecr:GetAuthorizationToken", + "ecr:CreateRepository", + "ecr:DescribeRepositories", + "ecr:DescribeImages", + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer", + "ecr:InitiateLayerUpload", + "ecr:UploadLayerPart", + "ecr:CompleteLayerUpload", + "ecr:PutImage", + "ecr:BatchCheckLayerAvailability", + "ecr:TagResource" + ] + Resource = "*" + }, + { + Sid = "SecretsManager" + Effect = "Allow" + Action = [ + "secretsmanager:CreateSecret", + "secretsmanager:PutSecretValue", + "secretsmanager:GetSecretValue", + "secretsmanager:DescribeSecret", + "secretsmanager:ListSecrets", + "secretsmanager:DeleteSecret", + "secretsmanager:TagResource" + ] + Resource = "arn:aws:secretsmanager:*:*:secret:nullplatform/*" + }, + { + Sid = "CloudWatchLogs" + Effect = "Allow" + Action = [ + "logs:CreateLogGroup", + "logs:DeleteLogGroup", + "logs:DescribeLogGroups", + "logs:CreateLogStream", + "logs:PutLogEvents", + "logs:FilterLogEvents", + "logs:GetLogEvents", + "logs:PutRetentionPolicy", + "logs:TagLogGroup" + ] + Resource = "*" + }, + { + Sid = "CloudWatchMetrics" + Effect = "Allow" + Action = [ + "cloudwatch:GetMetricStatistics", + "cloudwatch:ListMetrics", + "cloudwatch:GetMetricData" + ] + Resource = "*" + }, + { + Sid = "S3Tfstate" + Effect = "Allow" + Action = [ + "s3:CreateBucket", + "s3:HeadBucket", + "s3:PutBucketVersioning", + "s3:ListBucket", + "s3:ListBucketVersions", + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:DeleteObjectVersion" + ] + Resource = [ + "arn:aws:s3:::nullplatform-lambda-tfstate-*", + 
"arn:aws:s3:::nullplatform-lambda-tfstate-*/*" + ] + } + ] + }) +} diff --git a/lambda/requirements/output.tf b/lambda/requirements/output.tf new file mode 100644 index 0000000..a3570a3 --- /dev/null +++ b/lambda/requirements/output.tf @@ -0,0 +1,29 @@ +output "lambda_policy_arn" { + description = "ARN of the Lambda core management policy" + value = aws_iam_policy.nullplatform_lambda_policy.arn +} + +output "lambda_iam_policy_arn" { + description = "ARN of the IAM execution role management policy" + value = aws_iam_policy.nullplatform_lambda_iam_policy.arn +} + +output "lambda_networking_policy_arn" { + description = "ARN of the networking policy (API GW + ALB + Route53)" + value = aws_iam_policy.nullplatform_lambda_networking_policy.arn +} + +output "lambda_storage_policy_arn" { + description = "ARN of the storage & observability policy (ECR + SM + CW + S3)" + value = aws_iam_policy.nullplatform_lambda_storage_policy.arn +} + +output "role_arn" { + description = "ARN of the IAM role created by this module. Empty string when create_role is false." + value = var.create_role ? aws_iam_role.nullplatform_lambda_role[0].arn : "" +} + +output "role_name" { + description = "Name of the IAM role created by this module. Empty string when create_role is false." + value = var.create_role ? aws_iam_role.nullplatform_lambda_role[0].name : "" +} diff --git a/lambda/requirements/variables.tf b/lambda/requirements/variables.tf new file mode 100644 index 0000000..d8b8298 --- /dev/null +++ b/lambda/requirements/variables.tf @@ -0,0 +1,22 @@ +variable "name" { + description = "Unique identifier for policy naming. Must be unique per AWS account (IAM policy names are account-global). Example: \"prod-us-east-1\"." + type = string +} + +variable "create_role" { + description = "When true, creates a new IAM role and attaches all policies to it. The role will allow the ARNs in trusted_arns to assume it via sts:AssumeRole." 
+ type = bool + default = false +} + +variable "role_name" { + description = "Existing IAM role name to attach the Lambda policies to. Ignored when create_role is true." + type = string + default = null +} + +variable "trusted_arns" { + description = "List of IAM principal ARNs allowed to assume the role. Only used when create_role is true." + type = list(string) + default = [] +} From 52cba87e3ced810003281e3b64b09bd3c265c0d5 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Mon, 1 Jun 2026 15:51:00 -0300 Subject: [PATCH 03/24] chore: set ASSUME_ROLE_ARN_DEFAULT for testing Co-Authored-By: Claude Sonnet 4.6 --- lambda/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambda/values.yaml b/lambda/values.yaml index 280e930..629f30e 100644 --- a/lambda/values.yaml +++ b/lambda/values.yaml @@ -40,7 +40,7 @@ configuration: # Recommended: set via the scope-configurations provider key assume_role.arn # so it's managed per-account without changing code. # This value is only used if the provider does not supply assume_role.arn. 
- ASSUME_ROLE_ARN_DEFAULT: "" + ASSUME_ROLE_ARN_DEFAULT: "arn:aws:iam::235494813897:role/nullplatform_aws-services-cluster_lambda_role" # ── IAM ──────────────────────────────────────────────────────────────────── IAM_PROPAGATION_WAIT_SECONDS: 20 From d49d2b983a2db8bd980a92173a2fd865316ee6a1 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Mon, 1 Jun 2026 15:56:34 -0300 Subject: [PATCH 04/24] fix: correct nullplatform provider version constraint in specs/tofu Co-Authored-By: Claude Sonnet 4.6 --- lambda/specs/tofu/provider.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambda/specs/tofu/provider.tf b/lambda/specs/tofu/provider.tf index ef613db..51d4024 100644 --- a/lambda/specs/tofu/provider.tf +++ b/lambda/specs/tofu/provider.tf @@ -2,7 +2,7 @@ terraform { required_providers { nullplatform = { source = "nullplatform/nullplatform" - version = "0.0.87, < 0.1.0" + version = ">= 0.0.90, < 0.1.0" } http = { source = "hashicorp/http" From 33842ec3194eee25162bddc5669cd0035a18bcd7 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Mon, 1 Jun 2026 16:20:09 -0300 Subject: [PATCH 05/24] fix: surface sts:AssumeRole errors to stdout for visibility in NP logs Co-Authored-By: Claude Sonnet 4.6 --- lambda/utils/assume_role | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lambda/utils/assume_role b/lambda/utils/assume_role index 66cfe16..0c851de 100755 --- a/lambda/utils/assume_role +++ b/lambda/utils/assume_role @@ -19,10 +19,17 @@ _ar_log() { if [ -n "${ASSUME_ROLE_ARN:-}" ]; then _ar_log info " 🔑 Assuming role: $ASSUME_ROLE_ARN" - ASSUMED_CREDS=$(aws sts assume-role \ + _ar_sts_error=$(mktemp) + if ! 
ASSUMED_CREDS=$(aws sts assume-role \ --role-arn "$ASSUME_ROLE_ARN" \ --role-session-name "np-lambda-${SCOPE_ID:-workflow}" \ - --output json) + --output json 2>"$_ar_sts_error"); then + _ar_log info "ERROR: sts:AssumeRole failed for $ASSUME_ROLE_ARN" + _ar_log info "$(cat "$_ar_sts_error")" + rm -f "$_ar_sts_error" + return 1 + fi + rm -f "$_ar_sts_error" export AWS_ACCESS_KEY_ID=$(echo "$ASSUMED_CREDS" | jq -r '.Credentials.AccessKeyId') export AWS_SECRET_ACCESS_KEY=$(echo "$ASSUMED_CREDS" | jq -r '.Credentials.SecretAccessKey') From 14d3ad54b4ee082c44037b4b1f746f0b45c35705 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Mon, 1 Jun 2026 16:43:05 -0300 Subject: [PATCH 06/24] fix: use exact PLACEHOLDER_IMAGE_URI when explicitly set, skip arch suffix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When PLACEHOLDER_IMAGE_URI is set in values.yaml the operator has already chosen the exact tag — no architecture suffix should be appended. Sets the default to :latest (no arch suffix) for this deployment. Co-Authored-By: Claude Sonnet 4.6 --- lambda/scope/scripts/resolve_placeholder_image | 10 ++++++++-- lambda/values.yaml | 5 +++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lambda/scope/scripts/resolve_placeholder_image b/lambda/scope/scripts/resolve_placeholder_image index ca6a1a1..c103869 100755 --- a/lambda/scope/scripts/resolve_placeholder_image +++ b/lambda/scope/scripts/resolve_placeholder_image @@ -35,14 +35,20 @@ fi # ── Image placeholder path ──────────────────────────────────────────────────── log info "🔍 Resolving placeholder image URI..." 
-placeholder_image_base="${PLACEHOLDER_IMAGE_URI:-public.ecr.aws/nullplatform/aws-lambda/nullplatform-lambda-placeholder:latest}" +default_image_base="public.ecr.aws/nullplatform/aws-lambda/nullplatform-lambda-placeholder:latest" +placeholder_image_base="${PLACEHOLDER_IMAGE_URI:-$default_image_base}" architecture="${ARCHITECTURE:-arm64}" # Lambda uses "x86_64" but images are tagged with Docker convention "amd64" arch_tag="${architecture}" [ "$architecture" = "x86_64" ] && arch_tag="amd64" -if [[ "$placeholder_image_base" == *":"* ]]; then +# Only append the architecture suffix when using the default image. +# If PLACEHOLDER_IMAGE_URI is explicitly set, use it as-is — the operator +# already chose the exact tag they want. +if [ -n "${PLACEHOLDER_IMAGE_URI:-}" ]; then + placeholder_image_uri="$placeholder_image_base" +elif [[ "$placeholder_image_base" == *":"* ]]; then placeholder_image_uri="${placeholder_image_base}-${arch_tag}" else placeholder_image_uri="${placeholder_image_base}:latest-${arch_tag}" diff --git a/lambda/values.yaml b/lambda/values.yaml index 629f30e..026184f 100644 --- a/lambda/values.yaml +++ b/lambda/values.yaml @@ -35,6 +35,11 @@ configuration: # ── Null Agent ───────────────────────────────────────────────────────────── USE_NULL_AGENT: false + # ── Placeholder image ────────────────────────────────────────────────────── + # Set to the exact image URI to use as-is (no architecture suffix appended). + # Leave empty to let the script auto-select latest-arm64 / latest-amd64. + PLACEHOLDER_IMAGE_URI: "public.ecr.aws/nullplatform/aws-lambda/nullplatform-lambda-placeholder:latest" + # ── Assume Role ──────────────────────────────────────────────────────────── # IAM role ARN to assume before any AWS operation. 
# Recommended: set via the scope-configurations provider key assume_role.arn From fc8bc760f83316b69a07fb2eb3c32feb91da02c5 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Mon, 1 Jun 2026 16:54:38 -0300 Subject: [PATCH 07/24] fix: remove automatic arch suffix from placeholder image URI The public ECR image only exists as :latest without architecture-specific tags. Remove the -arm64/-amd64 append logic from the default path. Users who publish arch-specific images can set PLACEHOLDER_IMAGE_URI explicitly to the full tag they need. Co-Authored-By: Claude Sonnet 4.6 --- .../scope/scripts/resolve_placeholder_image | 19 +++++++------------ lambda/values.yaml | 5 ----- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/lambda/scope/scripts/resolve_placeholder_image b/lambda/scope/scripts/resolve_placeholder_image index c103869..c196631 100755 --- a/lambda/scope/scripts/resolve_placeholder_image +++ b/lambda/scope/scripts/resolve_placeholder_image @@ -35,23 +35,18 @@ fi # ── Image placeholder path ──────────────────────────────────────────────────── log info "🔍 Resolving placeholder image URI..." -default_image_base="public.ecr.aws/nullplatform/aws-lambda/nullplatform-lambda-placeholder:latest" -placeholder_image_base="${PLACEHOLDER_IMAGE_URI:-$default_image_base}" +placeholder_image_base="${PLACEHOLDER_IMAGE_URI:-public.ecr.aws/nullplatform/aws-lambda/nullplatform-lambda-placeholder:latest}" architecture="${ARCHITECTURE:-arm64}" -# Lambda uses "x86_64" but images are tagged with Docker convention "amd64" -arch_tag="${architecture}" -[ "$architecture" = "x86_64" ] && arch_tag="amd64" +log debug " 📋 architecture=$architecture" -# Only append the architecture suffix when using the default image. -# If PLACEHOLDER_IMAGE_URI is explicitly set, use it as-is — the operator -# already chose the exact tag they want. -if [ -n "${PLACEHOLDER_IMAGE_URI:-}" ]; then +# Use the image URI as-is. 
If PLACEHOLDER_IMAGE_URI is not set, the default +# :latest tag is used without any architecture suffix — publish arch-specific +# tags and set PLACEHOLDER_IMAGE_URI explicitly if needed. +if [[ "$placeholder_image_base" == *":"* ]]; then placeholder_image_uri="$placeholder_image_base" -elif [[ "$placeholder_image_base" == *":"* ]]; then - placeholder_image_uri="${placeholder_image_base}-${arch_tag}" else - placeholder_image_uri="${placeholder_image_base}:latest-${arch_tag}" + placeholder_image_uri="${placeholder_image_base}:latest" fi log debug " 📋 architecture=$architecture" diff --git a/lambda/values.yaml b/lambda/values.yaml index 026184f..629f30e 100644 --- a/lambda/values.yaml +++ b/lambda/values.yaml @@ -35,11 +35,6 @@ configuration: # ── Null Agent ───────────────────────────────────────────────────────────── USE_NULL_AGENT: false - # ── Placeholder image ────────────────────────────────────────────────────── - # Set to the exact image URI to use as-is (no architecture suffix appended). - # Leave empty to let the script auto-select latest-arm64 / latest-amd64. - PLACEHOLDER_IMAGE_URI: "public.ecr.aws/nullplatform/aws-lambda/nullplatform-lambda-placeholder:latest" - # ── Assume Role ──────────────────────────────────────────────────────────── # IAM role ARN to assume before any AWS operation. # Recommended: set via the scope-configurations provider key assume_role.arn From 3f89288f8b0246beb6011658249c6ef65e904d2a Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Mon, 1 Jun 2026 17:42:56 -0300 Subject: [PATCH 08/24] fix: read TOFU_STATE_BUCKET from .provider.aws_state_bucket as fallback The existing scope-configurations provider in this account uses a different schema (.provider.aws_state_bucket) than our Lambda spec (.state.tofu_state_bucket). Add fallback to support both schemas without requiring a new provider instance. 
Co-Authored-By: Claude Sonnet 4.6 --- lambda/utils/fetch_scope_configuration | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambda/utils/fetch_scope_configuration b/lambda/utils/fetch_scope_configuration index ab79244..88f447a 100755 --- a/lambda/utils/fetch_scope_configuration +++ b/lambda/utils/fetch_scope_configuration @@ -67,7 +67,7 @@ HOSTED_PRIVATE_ZONE_ID=$(echo "$CLOUD_PROVIDER_CONFIG" | jq -r '.networking.host log debug " ✅ hosted_private_zone_id=$HOSTED_PRIVATE_ZONE_ID" # From scope-configurations category -TOFU_STATE_BUCKET=$(echo "$SCOPE_CONFIG" | jq -r '.state.tofu_state_bucket // empty') +TOFU_STATE_BUCKET=$(echo "$SCOPE_CONFIG" | jq -r '.state.tofu_state_bucket // .provider.aws_state_bucket // empty') log debug " ✅ tofu_state_bucket=$TOFU_STATE_BUCKET" PLACEHOLDER_IMAGE_URI=$(echo "$SCOPE_CONFIG" | jq -r '.deployment.placeholder_image_uri // empty') From 97121e48ce48a57544fab74f987c213e2b61a1e3 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 2 Jun 2026 11:04:44 -0300 Subject: [PATCH 09/24] fix(iam): prefix lambda execution role with np-lambda- to match requirements policy The scope execution role was named "${function}-role", which didn't match the iam:CreateRole/PassRole Resource constraint (arn:aws:iam::*:role/np-lambda-*) in lambda/requirements, causing AccessDenied at tofu apply. Prefixing aligns the role name with the policy the assumed role already grants. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/scope/tofu/iam/setup | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lambda/scope/tofu/iam/setup b/lambda/scope/tofu/iam/setup index 803f4b9..bbc7a40 100755 --- a/lambda/scope/tofu/iam/setup +++ b/lambda/scope/tofu/iam/setup @@ -4,7 +4,9 @@ source "$SERVICE_PATH/utils/log" log info "🔍 Configuring IAM role for deployment..." 
-iam_role_name="${LAMBDA_FUNCTION_NAME}-role" +# Prefix with "np-lambda-" so the role name matches the iam:CreateRole/PassRole +# Resource constraint in lambda/requirements (arn:aws:iam::*:role/np-lambda-*). +iam_role_name="np-lambda-${LAMBDA_FUNCTION_NAME}-role" iam_role_name="${iam_role_name:0:64}" log debug " 📋 role_name=$iam_role_name" From 684d9f7fd5aab98a2686e55901fa62b27db9a9a1 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 2 Jun 2026 11:16:55 -0300 Subject: [PATCH 10/24] fix(tofu): surface tofu apply stderr to stdout for visibility in NP logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenTofu writes its "Error:" block to stderr, but the NP workflow executor only captures stdout — so the real failure reason (e.g. AWS AccessDenied) never showed in the logs, leaving only a generic "scope creation failed". Redirect stderr to stdout on the apply and stop sending the script's own error message to stderr. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/scope/tofu/do_tofu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lambda/scope/tofu/do_tofu b/lambda/scope/tofu/do_tofu index 30954c3..bb90822 100755 --- a/lambda/scope/tofu/do_tofu +++ b/lambda/scope/tofu/do_tofu @@ -177,13 +177,16 @@ if [ "$TOFU_ACTION" = "apply" ]; then fi # Run tofu action +# Redirect stderr to stdout: OpenTofu writes its "Error:" block to stderr, and the +# NP workflow executor only captures stdout — without this, the actual failure +# reason (e.g. an AWS AccessDenied) never reaches the NP logs. log info "📝 Running tofu $TOFU_ACTION..." tofu_exit_code=0 -tofu -chdir="$TF_WORKING_DIR" "$TOFU_ACTION" -auto-approve -var-file="$TOFU_VAR_FILE" || tofu_exit_code=$? +tofu -chdir="$TF_WORKING_DIR" "$TOFU_ACTION" -auto-approve -var-file="$TOFU_VAR_FILE" 2>&1 || tofu_exit_code=$? 
if [ $tofu_exit_code -ne 0 ]; then echo "" - echo "❌ Tofu $TOFU_ACTION failed with exit code $tofu_exit_code" >&2 - echo "" >&2 + echo "❌ Tofu $TOFU_ACTION failed with exit code $tofu_exit_code" + echo "" return 1 fi From b9e41d3bfdde1586a6f89cbb61e3b76b8a203ac2 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 2 Jun 2026 11:23:53 -0300 Subject: [PATCH 11/24] fix(iam): add modern CloudWatch Logs tagging actions to lambda requirements policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AWS provider (v5) reads log group tags via logs:ListTagsForResource and manages them via logs:TagResource/UntagResource — the generic resource-tagging API — but the policy only granted the deprecated logs:TagLogGroup. Creating a scope's aws_cloudwatch_log_group failed with AccessDenied on ListTagsForResource. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/requirements/main.tf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lambda/requirements/main.tf b/lambda/requirements/main.tf index 1939c7e..d34f63f 100644 --- a/lambda/requirements/main.tf +++ b/lambda/requirements/main.tf @@ -261,7 +261,10 @@ resource "aws_iam_policy" "nullplatform_lambda_storage_policy" { "logs:FilterLogEvents", "logs:GetLogEvents", "logs:PutRetentionPolicy", - "logs:TagLogGroup" + "logs:TagLogGroup", + "logs:ListTagsForResource", + "logs:TagResource", + "logs:UntagResource" ] Resource = "*" }, From bd26af424b97875ca50fbc16da8a0d06753f3ce6 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 2 Jun 2026 15:15:35 -0300 Subject: [PATCH 12/24] feat(placeholder): make placeholder image configurable via PLACEHOLDER_IMAGE_URI_DEFAULT Adds an env-var fallback for the Lambda placeholder image, mirroring the existing ASSUME_ROLE_ARN_DEFAULT pattern. Precedence: scope-config deployment.placeholder_image_uri > PLACEHOLDER_IMAGE_URI_DEFAULT (values.yaml) > script's hardcoded default. 
Lets operators point the placeholder at a private ECR mirror per account without a scope-configuration value or code changes. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/utils/fetch_scope_configuration | 4 +++- lambda/values.yaml | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lambda/utils/fetch_scope_configuration b/lambda/utils/fetch_scope_configuration index 88f447a..f002d7f 100755 --- a/lambda/utils/fetch_scope_configuration +++ b/lambda/utils/fetch_scope_configuration @@ -71,7 +71,9 @@ TOFU_STATE_BUCKET=$(echo "$SCOPE_CONFIG" | jq -r '.state.tofu_state_bucket // .p log debug " ✅ tofu_state_bucket=$TOFU_STATE_BUCKET" PLACEHOLDER_IMAGE_URI=$(echo "$SCOPE_CONFIG" | jq -r '.deployment.placeholder_image_uri // empty') -log debug " ✅ placeholder_image_uri=$PLACEHOLDER_IMAGE_URI" +# Fallback to env var set in values.yaml when the provider does not supply it. +PLACEHOLDER_IMAGE_URI="${PLACEHOLDER_IMAGE_URI:-${PLACEHOLDER_IMAGE_URI_DEFAULT:-}}" +log debug " ✅ placeholder_image_uri=${PLACEHOLDER_IMAGE_URI:-(not set, using script default)}" NULL_AGENT_LAYER_ARN=$(echo "$SCOPE_CONFIG" | jq -r '.agent.null_agent_layer_arn // empty') log debug " ✅ null_agent_layer_arn=$NULL_AGENT_LAYER_ARN" diff --git a/lambda/values.yaml b/lambda/values.yaml index 629f30e..f307de9 100644 --- a/lambda/values.yaml +++ b/lambda/values.yaml @@ -35,6 +35,16 @@ configuration: # ── Null Agent ───────────────────────────────────────────────────────────── USE_NULL_AGENT: false + # ── Placeholder image ────────────────────────────────────────────────────── + # Container image used to bootstrap the Lambda function at scope creation, + # before the first real deployment. MUST live in a private ECR in the same + # account/region (Lambda rejects public.ecr.aws images), and be single-arch + # matching the scope architecture (publish :latest-amd64 / :latest-arm64). 
+ # Recommended: set via the scope-configurations provider key + # deployment.placeholder_image_uri so it's managed per-account without code. + # This value is only used if the provider does not supply it. + PLACEHOLDER_IMAGE_URI_DEFAULT: "235494813897.dkr.ecr.us-east-1.amazonaws.com/aws-lambda/nullplatform-lambda-placeholder:latest-amd64" + # ── Assume Role ──────────────────────────────────────────────────────────── # IAM role ARN to assume before any AWS operation. # Recommended: set via the scope-configurations provider key assume_role.arn From 2dc0a3ef0836d525000998870cf106682ad01784 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 2 Jun 2026 15:32:10 -0300 Subject: [PATCH 13/24] fix(deploy): ensure Lambda pull policy on the image's ECR repo before update Container-image Lambdas require the source ECR repo to grant lambda.amazonaws.com pull access; without it update-function-code fails with "Lambda does not have permission to access the ECR image". update_function_code now sets the standard LambdaECRImageRetrievalPolicy on the image's repo (idempotent, best-effort), and the requirements role gains ecr:Get/SetRepositoryPolicy. Removes the need to set the policy by hand per application repo. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/deployment/scripts/update_function_code | 16 ++++++++++++++++ lambda/requirements/main.tf | 4 +++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lambda/deployment/scripts/update_function_code b/lambda/deployment/scripts/update_function_code index bd91556..27c68e8 100755 --- a/lambda/deployment/scripts/update_function_code +++ b/lambda/deployment/scripts/update_function_code @@ -34,6 +34,22 @@ if [ "$package_type" = "Image" ]; then fi log debug " ✅ image_uri=$IMAGE_URI" + # Ensure the image's ECR repo lets the Lambda service pull it. 
Container-image + # Lambdas require a repository policy granting lambda.amazonaws.com; without it + # update-function-code fails with "Lambda does not have permission to access + # the ECR image". Idempotent and best-effort (cross-account repos may not be + # writable from here — Lambda would then need the policy set on the source side). + if [[ "$IMAGE_URI" == *.dkr.ecr.*.amazonaws.com/* ]]; then + ecr_region=$(echo "${IMAGE_URI%%/*}" | cut -d. -f4) + ecr_repo="${IMAGE_URI#*/}"; ecr_repo="${ecr_repo%%:*}"; ecr_repo="${ecr_repo%%@*}" + lambda_pull_policy='{"Version":"2008-10-17","Statement":[{"Sid":"LambdaECRImageRetrievalPolicy","Effect":"Allow","Principal":{"Service":"lambda.amazonaws.com"},"Action":["ecr:BatchGetImage","ecr:GetDownloadUrlForLayer"]}]}' + if aws ecr set-repository-policy --repository-name "$ecr_repo" --region "$ecr_region" --policy-text "$lambda_pull_policy" >/dev/null 2>&1; then + log debug " ✅ ensured Lambda pull policy on ECR repo $ecr_repo" + else + log warn " ⚠️ could not set Lambda pull policy on ECR repo $ecr_repo (continuing; pull may fail if not already allowed)" + fi + fi + update_output=$(aws lambda update-function-code \ --function-name "$LAMBDA_FUNCTION_NAME" \ --image-uri "$IMAGE_URI" \ diff --git a/lambda/requirements/main.tf b/lambda/requirements/main.tf index d34f63f..ca10521 100644 --- a/lambda/requirements/main.tf +++ b/lambda/requirements/main.tf @@ -231,7 +231,9 @@ resource "aws_iam_policy" "nullplatform_lambda_storage_policy" { "ecr:CompleteLayerUpload", "ecr:PutImage", "ecr:BatchCheckLayerAvailability", - "ecr:TagResource" + "ecr:TagResource", + "ecr:GetRepositoryPolicy", + "ecr:SetRepositoryPolicy" ] Resource = "*" }, From c04e9cc67aaa1f4de47fda2630417d7f51cb87db Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 2 Jun 2026 16:03:55 -0300 Subject: [PATCH 14/24] fix(deploy): add missing diagnose.yaml workflow for diagnose-deployment action The diagnose-deployment action mapped to deployment/workflows/diagnose.yaml, 
which did not exist, so every auto-diagnose after a failed deployment errored with "failed to read workflow file". Adds the workflow mirroring the scope diagnose flow: lean diagnose/build_context + executor over diagnose/checks. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/deployment/workflows/diagnose.yaml | 31 +++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 lambda/deployment/workflows/diagnose.yaml diff --git a/lambda/deployment/workflows/diagnose.yaml b/lambda/deployment/workflows/diagnose.yaml new file mode 100644 index 0000000..e9425ea --- /dev/null +++ b/lambda/deployment/workflows/diagnose.yaml @@ -0,0 +1,31 @@ +include: + - "$SERVICE_PATH/values.yaml" +steps: + - name: build_context + type: script + file: "$SERVICE_PATH/diagnose/build_context" + output: + - name: SCOPE_ID + type: environment + - name: SCOPE_NRN + type: environment + - name: LAMBDA_FUNCTION_NAME + type: environment + - name: LAMBDA_FUNCTION_ARN + type: environment + - name: LAMBDA_ROLE_ARN + type: environment + - name: SCOPE_DOMAIN + type: environment + - name: diagnose + type: executor + before_each: + name: notify_check_running + type: script + file: "$SERVICE_PATH/diagnose/notify_check_running" + after_each: + name: notify_check_results + type: script + file: "$SERVICE_PATH/diagnose/notify_results" + folders: + - "$SERVICE_PATH/diagnose/checks" From bbd31a071ae96aaf14ec0715cbb80056e8a38658 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 2 Jun 2026 17:31:06 -0300 Subject: [PATCH 15/24] chore: remove account-specific defaults from values.yaml ASSUME_ROLE_ARN_DEFAULT and PLACEHOLDER_IMAGE_URI_DEFAULT carried a real AWS account ARN/URI committed for testing. The product repo must stay account-agnostic: both are now documented as account-specific and provided per-installation via the scope-configurations provider or the agent's extra_envs (Helm), not hardcoded here. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/values.yaml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/lambda/values.yaml b/lambda/values.yaml index f307de9..258ad89 100644 --- a/lambda/values.yaml +++ b/lambda/values.yaml @@ -40,17 +40,21 @@ configuration: # before the first real deployment. MUST live in a private ECR in the same # account/region (Lambda rejects public.ecr.aws images), and be single-arch # matching the scope architecture (publish :latest-amd64 / :latest-arm64). - # Recommended: set via the scope-configurations provider key - # deployment.placeholder_image_uri so it's managed per-account without code. - # This value is only used if the provider does not supply it. - PLACEHOLDER_IMAGE_URI_DEFAULT: "235494813897.dkr.ecr.us-east-1.amazonaws.com/aws-lambda/nullplatform-lambda-placeholder:latest-amd64" + # Resolution precedence (see utils/fetch_scope_configuration): + # scope-configurations provider key deployment.placeholder_image_uri + # > PLACEHOLDER_IMAGE_URI_DEFAULT env var (set per-account on the agent) + # > the public default in scope/scripts/resolve_placeholder_image + # Account-specific, so it is NOT set here — provide it via the scope-config + # or the agent's extra_envs (Helm) for your installation. # ── Assume Role ──────────────────────────────────────────────────────────── # IAM role ARN to assume before any AWS operation. - # Recommended: set via the scope-configurations provider key assume_role.arn - # so it's managed per-account without changing code. - # This value is only used if the provider does not supply assume_role.arn. 
- ASSUME_ROLE_ARN_DEFAULT: "arn:aws:iam::235494813897:role/nullplatform_aws-services-cluster_lambda_role" + # Resolution precedence (see utils/fetch_scope_configuration): + # scope-configurations provider key assume_role.arn + # > ASSUME_ROLE_ARN_DEFAULT env var (set per-account on the agent) + # Account-specific, so it is NOT set here — provide it via the scope-config + # or the agent's extra_envs (Helm) for your installation. If unset, the + # agent's own pod credentials are used. # ── IAM ──────────────────────────────────────────────────────────────────── IAM_PROPAGATION_WAIT_SECONDS: 20 From e186fa2cb9c7efaca69064c07af46f8588066158 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Wed, 3 Jun 2026 16:27:06 -0300 Subject: [PATCH 16/24] docs: explain placeholder image config and restore PLACEHOLDER_IMAGE_URI_DEFAULT knob Document why Image-based scopes need a private-ECR placeholder and how the URI is resolved (provider key > PLACEHOLDER_IMAGE_URI_DEFAULT > public default), including how to publish one and a troubleshooting entry. Also re-add PLACEHOLDER_IMAGE_URI_DEFAULT to values.yaml as a commented, account-agnostic template so operators can pick their own image, and normalize a stray real-looking account ID in a publish comment to the dummy 123456789012. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 62 ++++++++++++++++++++++++++++++++ lambda/scope/placeholder/publish | 2 +- lambda/values.yaml | 7 ++-- 3 files changed, 68 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 354c37c..d2d282c 100644 --- a/README.md +++ b/README.md @@ -288,6 +288,67 @@ LOG_RETENTION_DAYS: 30 PARAMETERS_STRATEGY: "env" # or "secretsmanager" ``` +### Placeholder Image (Scope Bootstrap) + +When a scope is created, the Lambda function and its IAM role must exist **before** +the first real deployment — otherwise aliases, networking, and IAM have nothing to +attach to. 
To bootstrap this, `create-scope` provisions a throwaway **placeholder** +function that the first deployment then overwrites with the real code. + +How the placeholder is sourced depends on the scope's **package type**: + +- **Zip** — fully self-contained. A minimal handler ships pre-built and + base64-encoded in the repo (`scope/placeholder/placeholder_lambda.zip.b64`) and is + used automatically. **No configuration needed.** +- **Image** — the placeholder must be a container image, and this is where + `PLACEHOLDER_IMAGE_URI_DEFAULT` comes in. + +#### Why `PLACEHOLDER_IMAGE_URI_DEFAULT` is needed for Image scopes + +A Lambda function with `PackageType=Image` can only pull from a **private ECR +repository in the same account and region** — Lambda rejects `public.ecr.aws` +images at function-creation time. The built-in default in +`scope/scripts/resolve_placeholder_image` points at a public image +(`public.ecr.aws/nullplatform/aws-lambda/nullplatform-lambda-placeholder:latest`), +which is fine to *validate* but cannot actually back a real Lambda function. + +So for Image-based scopes you **must** mirror a placeholder into your own private +ECR and point the scope at it. The image must also be **single-arch matching the +scope architecture** (`-amd64` for `x86_64`, `-arm64` for `arm64`) — Lambda does +not accept multi-arch manifest lists. + +#### Resolution precedence + +The placeholder image URI is resolved in this order (first match wins): + +1. scope-configurations provider key `deployment.placeholder_image_uri` — per-scope, + managed without code +2. `PLACEHOLDER_IMAGE_URI_DEFAULT` env var — the **account-wide** knob, set in + `values.yaml` or via the agent's `extra_envs` (Helm) +3. 
the public default in `scope/scripts/resolve_placeholder_image` (validation-only + fallback; not usable for real Image functions) + +Because the URI is account-specific, `values.yaml` ships it commented out — set it +once per installation and every Image scope in that account uses it, unless a +specific scope overrides it via the provider key. + +#### Publishing a placeholder image + +Use the helper script to build and push the single-arch placeholders to your private +ECR (it creates the repository if it does not exist): + +```bash +export PLACEHOLDER_IMAGE_REPO=123456789012.dkr.ecr.us-east-1.amazonaws.com/aws-lambda/nullplatform-lambda-placeholder +lambda/scope/placeholder/publish # pushes :latest-arm64 and :latest-amd64 +``` + +Then set the URI (matching your scope architecture) in `values.yaml` or the agent's +`extra_envs`: + +```yaml +PLACEHOLDER_IMAGE_URI_DEFAULT: "123456789012.dkr.ecr.us-east-1.amazonaws.com/aws-lambda/nullplatform-lambda-placeholder:latest-arm64" +``` + ### Resource Naming | Resource | Format | Example | @@ -507,6 +568,7 @@ export TOFU_LOCK_TABLE=my-lock-table | Issue | Cause | Solution | |-------|-------|----------| | "Function name too long" | Name exceeds 64 chars | Shorten namespace/application/scope slugs | +| "Placeholder image not found" | Image scope with no private placeholder published | Run `lambda/scope/placeholder/publish` and set `PLACEHOLDER_IMAGE_URI_DEFAULT` (see [Placeholder Image](#placeholder-image-scope-bootstrap)) | | "Provisioned concurrency timeout" | Warmup taking too long | Increase `PROVISIONED_CONCURRENCY_MAX_WAIT_SECONDS` | | "ALB listener rule capacity" | Too many rules on ALB | Increase `ALB_LISTENER_RULE_CAPACITY` in values.yaml | | "Module not composed" | `MODULES_TO_USE` not updated | Verify setup script appends to `MODULES_TO_USE` | diff --git a/lambda/scope/placeholder/publish b/lambda/scope/placeholder/publish index db5f60d..98de7a7 100755 --- a/lambda/scope/placeholder/publish +++ 
b/lambda/scope/placeholder/publish @@ -51,7 +51,7 @@ if ! docker buildx version &>/dev/null; then fi # Extract registry host and region from IMAGE_REPO -ECR_REGISTRY=$(echo "$IMAGE_REPO" | cut -d/ -f1) # 688720756067.dkr.ecr.us-east-1.amazonaws.com +ECR_REGISTRY=$(echo "$IMAGE_REPO" | cut -d/ -f1) # 123456789012.dkr.ecr.us-east-1.amazonaws.com ECR_REGION=$(echo "$ECR_REGISTRY" | cut -d. -f4) # us-east-1 ECR_REPO_NAME=$(echo "$IMAGE_REPO" | cut -d/ -f2-) # aws-lambda/nullplatform-lambda-placeholder diff --git a/lambda/values.yaml b/lambda/values.yaml index 258ad89..3efea6f 100644 --- a/lambda/values.yaml +++ b/lambda/values.yaml @@ -44,8 +44,11 @@ configuration: # scope-configurations provider key deployment.placeholder_image_uri # > PLACEHOLDER_IMAGE_URI_DEFAULT env var (set per-account on the agent) # > the public default in scope/scripts/resolve_placeholder_image - # Account-specific, so it is NOT set here — provide it via the scope-config - # or the agent's extra_envs (Helm) for your installation. + # The URI is account-specific, so no real value is committed here. To choose + # your own placeholder image per installation, uncomment the line below and + # point it at your private ECR (or instead set it via the scope-config or the + # agent's extra_envs in Helm). + # PLACEHOLDER_IMAGE_URI_DEFAULT: "<account-id>.dkr.ecr.<region>.amazonaws.com/aws-lambda/nullplatform-lambda-placeholder:latest-amd64" # ── Assume Role ──────────────────────────────────────────────────────────── # IAM role ARN to assume before any AWS operation.
From 41ed94486beecf3f77f11584a2a62bfdc8cbb381 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Thu, 4 Jun 2026 11:09:41 -0300 Subject: [PATCH 17/24] refactor(setup): consolidate install tofu under lambda/setup and merge requirements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer feedback: the standalone requirements/ folder should not sit at the lambda/ root — all installation-time tofu should live together under a setup module. - Move lambda/specs/tofu/ -> lambda/setup/ (the operator-applied install module). - Merge lambda/requirements/ into lambda/setup/ (requirements.tf + outputs.tf, and its variables folded into setup/variables.tf); remove the requirements/ folder. - A single 'tofu apply' in lambda/setup now registers the scope type AND provisions the IAM policies. The 4 policies are always created; attaching them stays optional via create_role / role_name. - Add the aws provider (~> 5.0) + provider block to setup/provider.tf and a nullable aws_region var (IAM is global). 'name' is now a required setup variable. - Update backend key to lambda/setup/terraform.tfstate. - Refresh references: installation.md (cd path + IAM vars table), prerequisites.md (setup/main.tf), and the iam/setup comment. Verified with 'tofu validate' (Success). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/installation.md | 11 ++++-- lambda/prerequisites.md | 4 +-- lambda/requirements/variables.tf | 22 ------------ lambda/scope/tofu/iam/setup | 2 +- lambda/setup/backend.tf | 5 +++ lambda/{specs/tofu => setup}/main.tf | 0 .../output.tf => setup/outputs.tf} | 0 lambda/{specs/tofu => setup}/provider.tf | 8 +++++ .../main.tf => setup/requirements.tf} | 0 .../tofu => setup}/terraform.tfvars.example | 17 +++++++++ lambda/{specs/tofu => setup}/variables.tf | 35 +++++++++++++++++++ lambda/specs/tofu/backend.tf | 5 --- 12 files changed, 77 insertions(+), 32 deletions(-) delete mode 100644 lambda/requirements/variables.tf create mode 100644 lambda/setup/backend.tf rename lambda/{specs/tofu => setup}/main.tf (100%) rename lambda/{requirements/output.tf => setup/outputs.tf} (100%) rename lambda/{specs/tofu => setup}/provider.tf (78%) rename lambda/{requirements/main.tf => setup/requirements.tf} (100%) rename lambda/{specs/tofu => setup}/terraform.tfvars.example (72%) rename lambda/{specs/tofu => setup}/variables.tf (68%) delete mode 100644 lambda/specs/tofu/backend.tf diff --git a/lambda/installation.md b/lambda/installation.md index d2d3a08..25dd240 100644 --- a/lambda/installation.md +++ b/lambda/installation.md @@ -29,17 +29,24 @@ git clone https://github.com/nullplatform/tofu-modules /root/.np/nullplatform/to ### 2. Configure variables ```bash -cd lambda/tofu +cd lambda/setup cp terraform.tfvars.example terraform.tfvars ``` -Edit `terraform.tfvars` with your values: +This module registers the scope type **and** provisions the IAM policies the +agent needs to operate Lambda scopes (formerly the separate `requirements` +module — now consolidated here). Edit `terraform.tfvars` with your values: | Variable | Required | Description | |---|---|---| | `nrn` | ✅ | Nullplatform Resource Name (`organization:account`) | | `np_api_key` | ✅ | Nullplatform API key | | `tags_selectors` | ✅ | Tags to select the agent (e.g. 
`{ environment = "production" }`) | +| `name` | ✅ | Unique identifier for IAM policy naming (account-global, e.g. `prod-us-east-1`) | +| `aws_region` | — | AWS provider region. IAM is global; leave unset to resolve from the environment | +| `create_role` | — | `true` to create a new IAM role and attach the Lambda policies to it | +| `trusted_arns` | — | Principal ARNs allowed to assume the created role (with `create_role = true`) | +| `role_name` | — | Existing IAM role to attach the Lambda policies to (instead of `create_role`) | | `github_branch` | — | Branch to fetch specs from (default: `main`) | | `repo_path` | — | Path where scopes-lambda is cloned on the agent | | `overrides_enabled` | — | Set `true` to enable config overrides from scopes-networking | diff --git a/lambda/prerequisites.md b/lambda/prerequisites.md index 9e216f4..8663155 100644 --- a/lambda/prerequisites.md +++ b/lambda/prerequisites.md @@ -229,7 +229,7 @@ Agents run in a Kubernetes pod and authenticate to AWS via a **Service Account** The IAM policies above let the agent CREATE Lambda functions and target groups, but the `create-scope` workflow ALSO depends on three runtime artifacts that must exist BEFORE the first scope is created. None are -auto-created by the bundled `install/tofu/main.tf` today — the operator +auto-created by the bundled `setup/main.tf` today — the operator must provision them. ### 1. Placeholder image (private ECR) @@ -383,7 +383,7 @@ This applies to **every** ECR repository that ever stores a Lambda image: 1. The placeholder ECR (created during installation, addressed by - `lambda/tofu/main.tf` if you use the bundled module — the policy is + `lambda/setup/main.tf` if you use the bundled module — the policy is already applied there). 2. 
**The per-application ECR repositories** that `np asset push` creates dynamically when each app does its first build, named diff --git a/lambda/requirements/variables.tf b/lambda/requirements/variables.tf deleted file mode 100644 index d8b8298..0000000 --- a/lambda/requirements/variables.tf +++ /dev/null @@ -1,22 +0,0 @@ -variable "name" { - description = "Unique identifier for policy naming. Must be unique per AWS account (IAM policy names are account-global). Example: \"prod-us-east-1\"." - type = string -} - -variable "create_role" { - description = "When true, creates a new IAM role and attaches all policies to it. The role will allow the ARNs in trusted_arns to assume it via sts:AssumeRole." - type = bool - default = false -} - -variable "role_name" { - description = "Existing IAM role name to attach the Lambda policies to. Ignored when create_role is true." - type = string - default = null -} - -variable "trusted_arns" { - description = "List of IAM principal ARNs allowed to assume the role. Only used when create_role is true." - type = list(string) - default = [] -} diff --git a/lambda/scope/tofu/iam/setup b/lambda/scope/tofu/iam/setup index bbc7a40..09fa234 100755 --- a/lambda/scope/tofu/iam/setup +++ b/lambda/scope/tofu/iam/setup @@ -5,7 +5,7 @@ source "$SERVICE_PATH/utils/log" log info "🔍 Configuring IAM role for deployment..." # Prefix with "np-lambda-" so the role name matches the iam:CreateRole/PassRole -# Resource constraint in lambda/requirements (arn:aws:iam::*:role/np-lambda-*). +# Resource constraint in lambda/setup (arn:aws:iam::*:role/np-lambda-*). 
iam_role_name="np-lambda-${LAMBDA_FUNCTION_NAME}-role" iam_role_name="${iam_role_name:0:64}" diff --git a/lambda/setup/backend.tf b/lambda/setup/backend.tf new file mode 100644 index 0000000..a63cc72 --- /dev/null +++ b/lambda/setup/backend.tf @@ -0,0 +1,5 @@ +terraform { + backend "s3" { + key = "lambda/setup/terraform.tfstate" + } +} diff --git a/lambda/specs/tofu/main.tf b/lambda/setup/main.tf similarity index 100% rename from lambda/specs/tofu/main.tf rename to lambda/setup/main.tf diff --git a/lambda/requirements/output.tf b/lambda/setup/outputs.tf similarity index 100% rename from lambda/requirements/output.tf rename to lambda/setup/outputs.tf diff --git a/lambda/specs/tofu/provider.tf b/lambda/setup/provider.tf similarity index 78% rename from lambda/specs/tofu/provider.tf rename to lambda/setup/provider.tf index 51d4024..4fa3661 100644 --- a/lambda/specs/tofu/provider.tf +++ b/lambda/setup/provider.tf @@ -16,9 +16,17 @@ terraform { source = "hashicorp/null" version = "~> 3.2" } + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } } } provider "nullplatform" { api_key = var.np_api_key } + +provider "aws" { + region = var.aws_region +} diff --git a/lambda/requirements/main.tf b/lambda/setup/requirements.tf similarity index 100% rename from lambda/requirements/main.tf rename to lambda/setup/requirements.tf diff --git a/lambda/specs/tofu/terraform.tfvars.example b/lambda/setup/terraform.tfvars.example similarity index 72% rename from lambda/specs/tofu/terraform.tfvars.example rename to lambda/setup/terraform.tfvars.example index b5951da..7861244 100644 --- a/lambda/specs/tofu/terraform.tfvars.example +++ b/lambda/setup/terraform.tfvars.example @@ -12,6 +12,23 @@ tags_selectors = { environment = "production" } +# Unique identifier for IAM policy naming (policy names are account-global). 
+name = "prod-us-east-1" + +################################################################################ +# IAM permissions (optional) +################################################################################ + +# AWS provider region (IAM is global; leave unset to resolve from the environment). +# aws_region = "us-east-1" + +# Attach the Lambda policies to a brand-new role (and trust the given principals)... +# create_role = true +# trusted_arns = ["arn:aws:iam::123456789012:role/my-agent-role"] + +# ...or attach them to an existing role instead: +# role_name = "my-existing-agent-role" + ################################################################################ # Repository (override if using a fork or private mirror) ################################################################################ diff --git a/lambda/specs/tofu/variables.tf b/lambda/setup/variables.tf similarity index 68% rename from lambda/specs/tofu/variables.tf rename to lambda/setup/variables.tf index 0a1c9ba..52ad529 100644 --- a/lambda/specs/tofu/variables.tf +++ b/lambda/setup/variables.tf @@ -95,3 +95,38 @@ variable "overrides_service_path" { type = string default = null } + +################################################################################ +# IAM permissions (requirements) +# Policies the agent needs to operate Lambda scopes. IAM is global, but the AWS +# provider still needs a region to initialize. +################################################################################ + +variable "aws_region" { + description = "AWS region used to initialize the AWS provider. IAM resources are global; leave null to resolve from the environment (AWS_REGION / profile)." + type = string + default = null +} + +variable "name" { + description = "Unique identifier for policy naming. Must be unique per AWS account (IAM policy names are account-global). Example: \"prod-us-east-1\"." 
+ type = string +} + +variable "create_role" { + description = "When true, creates a new IAM role and attaches all policies to it. The role will allow the ARNs in trusted_arns to assume it via sts:AssumeRole." + type = bool + default = false +} + +variable "role_name" { + description = "Existing IAM role name to attach the Lambda policies to. Ignored when create_role is true." + type = string + default = null +} + +variable "trusted_arns" { + description = "List of IAM principal ARNs allowed to assume the role. Only used when create_role is true." + type = list(string) + default = [] +} diff --git a/lambda/specs/tofu/backend.tf b/lambda/specs/tofu/backend.tf deleted file mode 100644 index 7330a13..0000000 --- a/lambda/specs/tofu/backend.tf +++ /dev/null @@ -1,5 +0,0 @@ -terraform { - backend "s3" { - key = "lambda/install/terraform.tfstate" - } -} From 779eef99afc7b328103b3b3b665c10b64d2efe97 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Mon, 8 Jun 2026 11:59:27 -0300 Subject: [PATCH 18/24] feat(assume-role): resolve role ARN from nullplatform IAM provider by selector The agent resolves the IAM role to assume from the "AWS IAM" provider (category Identity & Access Control, spec aws-iam-configuration) declared in nullplatform, matching its arns list by the "lambda" selector. Precedence: ASSUME_ROLE_ARN env -> IAM provider -> scope-configurations assume_role.arn -> ASSUME_ROLE_ARN_DEFAULT -> pod IRSA. - assume_role_lib (new): pure arn_for_selector_from_json + provider_arn_for_selector (np provider list -> read, since list omits deep attributes). Mirrors the services-s3 mechanism. - fetch_scope_configuration: insert the provider-by-selector lookup as priority 2, deriving the account NRN from the scope NRN (strip :namespace=). - diagnose/build_context: same resolution before sourcing assume_role (it previously sourced assume_role without ever resolving an ARN). - values.yaml: document the precedence and the ASSUME_ROLE_SELECTOR override. 
- tests: BATS unit tests for both lib functions using the mock_np harness. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/diagnose/build_context | 13 ++ .../scope/tests/scripts/assume_role_lib.bats | 116 ++++++++++++++++++ lambda/utils/assume_role_lib | 43 +++++++ lambda/utils/fetch_scope_configuration | 36 +++++- lambda/values.yaml | 16 ++- 5 files changed, 217 insertions(+), 7 deletions(-) create mode 100644 lambda/scope/tests/scripts/assume_role_lib.bats create mode 100644 lambda/utils/assume_role_lib diff --git a/lambda/diagnose/build_context b/lambda/diagnose/build_context index f234a67..d17e244 100755 --- a/lambda/diagnose/build_context +++ b/lambda/diagnose/build_context @@ -15,6 +15,19 @@ if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then fi source "$SERVICE_PATH/utils/lambda_function_name" + +# Resolve the IAM role ARN to assume before any AWS call, mirroring +# fetch_scope_configuration: env override -> "AWS IAM" provider matched by the +# "lambda" selector (account nrn) -> ASSUME_ROLE_ARN_DEFAULT -> IRSA. 
+source "$SERVICE_PATH/utils/assume_role_lib" +ACCOUNT_NRN=$(echo "$SCOPE_NRN" | sed 's/:namespace=.*$//') +ASSUME_ROLE_SELECTOR="${ASSUME_ROLE_SELECTOR:-lambda}" +ASSUME_ROLE_ARN="${ASSUME_ROLE_ARN:-}" +if [ -z "$ASSUME_ROLE_ARN" ] && [ -n "$ACCOUNT_NRN" ] && [ -n "$ASSUME_ROLE_SELECTOR" ]; then + ASSUME_ROLE_ARN=$(provider_arn_for_selector "$ACCOUNT_NRN" "$ASSUME_ROLE_SELECTOR") +fi +export ASSUME_ROLE_ARN="${ASSUME_ROLE_ARN:-${ASSUME_ROLE_ARN_DEFAULT:-}}" + source "$SERVICE_PATH/utils/assume_role" lambda_info=$(aws lambda get-function --function-name "$LAMBDA_FUNCTION_NAME" --output json 2>/dev/null || echo "{}") diff --git a/lambda/scope/tests/scripts/assume_role_lib.bats b/lambda/scope/tests/scripts/assume_role_lib.bats new file mode 100644 index 0000000..8b7da30 --- /dev/null +++ b/lambda/scope/tests/scripts/assume_role_lib.bats @@ -0,0 +1,116 @@ +#!/usr/bin/env bats +# Unit tests for the pure resolution functions in utils/assume_role_lib. +# +# arn_for_selector_from_json is pure jq — exercised directly. +# provider_arn_for_selector orchestrates `np provider list` -> `np provider read`; +# we stub np() branching on its arguments (stateless, so it survives the +# command-substitution subshells the function uses) instead of a sequential mock. + +setup() { + TEST_DIR="$(cd "$(dirname "$BATS_TEST_FILENAME")" && pwd)" + HELPERS_DIR="$TEST_DIR/helpers" + LAMBDA_DIR="$(cd "$TEST_DIR/../../.." && pwd)" + + load "$HELPERS_DIR/test_helper.bash" + + # Stub np branching on args; FAKE_NP_MODE tweaks the `provider list` result. 
+ np() { + local args="$*" + case "$args" in + *"provider list"*) + if [ "${FAKE_NP_MODE:-}" = "no_provider" ]; then + echo '{"results":[]}' + else + echo '{"results":[{"id":"prov-123"}]}' + fi + ;; + *"provider read"*) + echo '{"attributes":{"iam_role_arns":{"arns":[{"selector":"my-scope","arn":"arn:aws:iam::123456789012:role/test-lambda-role"}]}}}' + ;; + *) echo '{}' ;; + esac + } + export -f np + + source "$LAMBDA_DIR/utils/assume_role_lib" +} + +# --- arn_for_selector_from_json (pure) ------------------------------------- + +JSON='{"attributes":{"iam_role_arns":{"arns":[{"selector":"s3","arn":"arn:aws:iam::111:role/s3"},{"selector":"lambda","arn":"arn:aws:iam::111:role/lambda"}]}}}' + +@test "arn_for_selector_from_json: matching selector returns its arn" { + run arn_for_selector_from_json "$JSON" lambda + assert_success + [ "$output" = "arn:aws:iam::111:role/lambda" ] +} + +@test "arn_for_selector_from_json: unknown selector returns empty" { + run arn_for_selector_from_json "$JSON" ecs + assert_success + [ -z "$output" ] +} + +@test "arn_for_selector_from_json: missing arns key returns empty" { + run arn_for_selector_from_json '{"attributes":{}}' s3 + assert_success + [ -z "$output" ] +} + +@test "arn_for_selector_from_json: empty input returns empty" { + run arn_for_selector_from_json '' s3 + assert_success + [ -z "$output" ] +} + +@test "arn_for_selector_from_json: malformed json returns empty" { + run arn_for_selector_from_json 'not json' s3 + assert_success + [ -z "$output" ] +} + +@test "arn_for_selector_from_json: empty selector returns empty" { + run arn_for_selector_from_json "$JSON" '' + assert_success + [ -z "$output" ] +} + +@test "arn_for_selector_from_json: duplicate selector takes first" { + local dup='{"attributes":{"iam_role_arns":{"arns":[{"selector":"s3","arn":"first"},{"selector":"s3","arn":"second"}]}}}' + run arn_for_selector_from_json "$dup" s3 + assert_success + [ "$output" = "first" ] +} + +# --- provider_arn_for_selector (np list -> 
read orchestration) ------------- + +@test "provider_arn_for_selector: resolves arn for matching selector" { + run provider_arn_for_selector "organization=1:account=2" my-scope + assert_success + [ "$output" = "arn:aws:iam::123456789012:role/test-lambda-role" ] +} + +@test "provider_arn_for_selector: no provider instance returns empty" { + export FAKE_NP_MODE=no_provider + run provider_arn_for_selector "organization=1:account=2" my-scope + assert_success + [ -z "$output" ] +} + +@test "provider_arn_for_selector: selector not in provider returns empty" { + run provider_arn_for_selector "organization=1:account=2" does-not-exist + assert_success + [ -z "$output" ] +} + +@test "provider_arn_for_selector: empty nrn returns empty" { + run provider_arn_for_selector "" my-scope + assert_success + [ -z "$output" ] +} + +@test "provider_arn_for_selector: empty selector returns empty" { + run provider_arn_for_selector "organization=1:account=2" "" + assert_success + [ -z "$output" ] +} diff --git a/lambda/utils/assume_role_lib b/lambda/utils/assume_role_lib new file mode 100644 index 0000000..5cd186c --- /dev/null +++ b/lambda/utils/assume_role_lib @@ -0,0 +1,43 @@ +#!/bin/bash +# Sourceable library of PURE helpers for assume-role resolution. +# Defines functions only — NO side effects on source, so it can be unit-tested +# (see scope/tests/scripts/assume_role_lib.bats) and reused by the scripts that +# resolve the role to assume (fetch_scope_configuration, diagnose/build_context). + +# arn_for_selector_from_json +# Given the JSON returned by `np provider read --id --format json` and a +# selector string, echoes the matching IAM role ARN, or empty string if there +# is no match / the input is missing or malformed. First match wins. +arn_for_selector_from_json() { + local json="$1" selector="$2" + [ -n "$json" ] || return 0 + [ -n "$selector" ] || return 0 + printf '%s' "$json" | jq -r --arg sel "$selector" ' + [ .attributes.iam_role_arns.arns[]? 
+ | select(.selector == $sel) + | .arn ] + | first // ""' 2>/dev/null +} + +# provider_arn_for_selector +# Looks up the "AWS IAM" provider (specification aws-iam-configuration, category +# "Identity & Access Control") at the given NRN, reads it, and echoes the ARN matching +# the given selector. Empty string if no provider / no match. Requires np + jq. +# NOTE: `np provider list` does NOT return deep attributes, so we list to get the +# provider id and then `np provider read --id` to obtain the arns (same two-step +# pattern used for account.region resolution in fetch_scope_configuration). +provider_arn_for_selector() { + local nrn="$1" selector="$2" + [ -n "$nrn" ] || return 0 + [ -n "$selector" ] || return 0 + + local pid data + pid=$(np provider list --nrn "$nrn" \ + --specification_slug aws-iam-configuration \ + --format json --limit 100 2>/dev/null \ + | jq -r '[ (.results // [])[] ] | first | .id // ""' 2>/dev/null) + [ -n "$pid" ] && [ "$pid" != "null" ] || return 0 + + data=$(np provider read --id "$pid" --format json 2>/dev/null) + arn_for_selector_from_json "$data" "$selector" +} diff --git a/lambda/utils/fetch_scope_configuration b/lambda/utils/fetch_scope_configuration index f002d7f..9e467db 100755 --- a/lambda/utils/fetch_scope_configuration +++ b/lambda/utils/fetch_scope_configuration @@ -78,8 +78,40 @@ log debug " ✅ placeholder_image_uri=${PLACEHOLDER_IMAGE_URI:-(not set, using NULL_AGENT_LAYER_ARN=$(echo "$SCOPE_CONFIG" | jq -r '.agent.null_agent_layer_arn // empty') log debug " ✅ null_agent_layer_arn=$NULL_AGENT_LAYER_ARN" -# From scope-configurations category (optional — fallback to env var set in values.yaml) -ASSUME_ROLE_ARN=$(echo "$SCOPE_CONFIG" | jq -r '.assume_role.arn // empty') +# --- Resolve the IAM role ARN to assume ------------------------------------- +# Precedence (highest to lowest): +# 1. $ASSUME_ROLE_ARN already in the environment (explicit override). +# 2.
The "AWS IAM" provider (Identity & Access Control, spec +# aws-iam-configuration) at the ACCOUNT nrn, matched by selector. The +# selector defaults to "lambda"; override with the ASSUME_ROLE_SELECTOR +# env var (values.yaml / agent extra_envs) if the provider uses another key. +# 3. scope-configurations provider key assume_role.arn (back-compat). +# 4. $ASSUME_ROLE_ARN_DEFAULT env var (per-account default on the agent). +# 5. None of the above -> empty -> use the pod's IRSA credentials directly. +# shellcheck source=assume_role_lib +source "$SERVICE_PATH/utils/assume_role_lib" + +# The IAM provider lives at the account level; derive the account nrn from the +# scope nrn by stripping everything from :namespace= onward. +ACCOUNT_NRN=$(echo "$NRN" | sed 's/:namespace=.*$//') +ASSUME_ROLE_SELECTOR="${ASSUME_ROLE_SELECTOR:-lambda}" + +ASSUME_ROLE_ARN="${ASSUME_ROLE_ARN:-}" + +# 2. nullplatform IAM provider, by selector. +if [ -z "$ASSUME_ROLE_ARN" ] && [ -n "$ACCOUNT_NRN" ] && [ -n "$ASSUME_ROLE_SELECTOR" ]; then + ASSUME_ROLE_ARN=$(provider_arn_for_selector "$ACCOUNT_NRN" "$ASSUME_ROLE_SELECTOR") + if [ -n "$ASSUME_ROLE_ARN" ]; then + log debug " ✅ assume_role_arn from IAM provider (selector=$ASSUME_ROLE_SELECTOR)" + fi +fi + +# 3. scope-configurations provider key (back-compat). +if [ -z "$ASSUME_ROLE_ARN" ]; then + ASSUME_ROLE_ARN=$(echo "$SCOPE_CONFIG" | jq -r '.assume_role.arn // empty') +fi + +# 4. Per-account default env var. ASSUME_ROLE_ARN="${ASSUME_ROLE_ARN:-${ASSUME_ROLE_ARN_DEFAULT:-}}" log debug " ✅ assume_role_arn=${ASSUME_ROLE_ARN:-(not set, using pod credentials)}" diff --git a/lambda/values.yaml b/lambda/values.yaml index 3efea6f..df1c754 100644 --- a/lambda/values.yaml +++ b/lambda/values.yaml @@ -53,11 +53,17 @@ configuration: # ── Assume Role ──────────────────────────────────────────────────────────── # IAM role ARN to assume before any AWS operation. 
# Resolution precedence (see utils/fetch_scope_configuration): - # scope-configurations provider key assume_role.arn - # > ASSUME_ROLE_ARN_DEFAULT env var (set per-account on the agent) - # Account-specific, so it is NOT set here — provide it via the scope-config - # or the agent's extra_envs (Helm) for your installation. If unset, the - # agent's own pod credentials are used. + # 1. ASSUME_ROLE_ARN env var (explicit override) + # 2. "AWS IAM" provider (Identity & Access Control, spec aws-iam-configuration) + # at the account NRN, matched by selector. The selector defaults to + # "lambda"; override it with ASSUME_ROLE_SELECTOR if the provider's arns + # list uses a different key. + # 3. scope-configurations provider key assume_role.arn (back-compat) + # 4. ASSUME_ROLE_ARN_DEFAULT env var (set per-account on the agent) + # All account-specific, so none are set here — provide them via the IAM + # provider, the scope-config, or the agent's extra_envs (Helm). If nothing + # resolves, the agent's own pod credentials (IRSA) are used. + # ASSUME_ROLE_SELECTOR: "lambda" # ── IAM ──────────────────────────────────────────────────────────────────── IAM_PROPAGATION_WAIT_SECONDS: 20 From 4ff57c40c0877cf36c7b37f609e4b360526f30c2 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Mon, 8 Jun 2026 14:55:44 -0300 Subject: [PATCH 19/24] chore(values): set PLACEHOLDER_IMAGE_URI_DEFAULT for this installation Commit the account's private ECR placeholder image as the default, overridable per scope via the scope-config provider or per agent via Helm extra_envs. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/values.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lambda/values.yaml b/lambda/values.yaml index df1c754..179dd6b 100644 --- a/lambda/values.yaml +++ b/lambda/values.yaml @@ -44,11 +44,11 @@ configuration: # scope-configurations provider key deployment.placeholder_image_uri # > PLACEHOLDER_IMAGE_URI_DEFAULT env var (set per-account on the agent) # > the public default in scope/scripts/resolve_placeholder_image - # The URI is account-specific, so no real value is committed here. To choose - # your own placeholder image per installation, uncomment the line below and - # point it at your private ECR (or instead set it via the scope-config or the - # agent's extra_envs in Helm). - # PLACEHOLDER_IMAGE_URI_DEFAULT: ".dkr.ecr..amazonaws.com/aws-lambda/nullplatform-lambda-placeholder:latest-amd64" + # Default placeholder image for this installation. Account-specific, but + # committed here intentionally. Can still be overridden per scope via the + # scope-config provider key deployment.placeholder_image_uri, or per agent + # via extra_envs in Helm. + PLACEHOLDER_IMAGE_URI_DEFAULT: "235494813897.dkr.ecr.us-east-1.amazonaws.com/aws-lambda/nullplatform-lambda-placeholder:latest" # ── Assume Role ──────────────────────────────────────────────────────────── # IAM role ARN to assume before any AWS operation. From 5109c0ef721f5128580bc8ba545658cb380e3fa8 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 9 Jun 2026 12:19:12 -0300 Subject: [PATCH 20/24] feat(workflows): assume IAM role via dedicated first step in every workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The assumed credentials were never reaching the steps that need them: assume role ran inside build_context, but the workflow engine drops a step's exported vars unless declared as output:environment — and no workflow declared the AWS credentials. 
So tofu/aws steps ran with the pod's IRSA identity and failed on permissions. Fix: a dedicated `assume_role` step runs first in every AWS-touching workflow, resolves the role and assumes it, and exports AWS_ACCESS_KEY_ID/SECRET/ SESSION_TOKEN as output:environment so all later steps inherit them. - utils/assume_role_step (new): resolves NRN from CONTEXT, assumes, exports creds. - utils/assume_role_lib: add resolve_assume_role_arn (env -> IAM provider by selector -> scope-config -> DEFAULT) and scope_config_assume_role_arn. - fetch_scope_configuration, diagnose/build_context: remove the now-centralized assume-role resolution (single source of truth; avoids self-assume). - 18 workflow yamls: prepend the assume_role step with the 3 credential outputs. - assume_role_lib.bats: tests for the precedence chain. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/deployment/workflows/blue_green.yaml | 10 ++++ lambda/deployment/workflows/delete.yaml | 10 ++++ lambda/deployment/workflows/diagnose.yaml | 10 ++++ lambda/deployment/workflows/finalize.yaml | 10 ++++ lambda/deployment/workflows/initial.yaml | 10 ++++ lambda/deployment/workflows/rollback.yaml | 10 ++++ .../deployment/workflows/switch_traffic.yaml | 10 ++++ lambda/diagnose/build_context | 16 +----- lambda/instance/workflows/list.yaml | 10 ++++ lambda/log/workflows/log.yaml | 10 ++++ lambda/metric/workflows/list.yaml | 10 ++++ lambda/metric/workflows/metric.yaml | 10 ++++ .../scope/tests/scripts/assume_role_lib.bats | 56 +++++++++++++++++++ .../adjust_provisioned_concurrency.yaml | 10 ++++ .../adjust_reserved_concurrency.yaml | 10 ++++ lambda/scope/workflows/create.yaml | 10 ++++ lambda/scope/workflows/delete.yaml | 10 ++++ lambda/scope/workflows/diagnose.yaml | 10 ++++ lambda/scope/workflows/invoke.yaml | 10 ++++ lambda/scope/workflows/update.yaml | 10 ++++ lambda/utils/assume_role_lib | 35 ++++++++++++ lambda/utils/assume_role_step | 38 +++++++++++++ lambda/utils/fetch_scope_configuration | 44 ++------------- 
23 files changed, 316 insertions(+), 53 deletions(-) create mode 100755 lambda/utils/assume_role_step diff --git a/lambda/deployment/workflows/blue_green.yaml b/lambda/deployment/workflows/blue_green.yaml index a73b128..c0913e4 100644 --- a/lambda/deployment/workflows/blue_green.yaml +++ b/lambda/deployment/workflows/blue_green.yaml @@ -3,6 +3,16 @@ include: configuration: DEPLOYMENT_STRATEGY: "blue_green" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/deployment/build_context" diff --git a/lambda/deployment/workflows/delete.yaml b/lambda/deployment/workflows/delete.yaml index 548c749..01b0933 100644 --- a/lambda/deployment/workflows/delete.yaml +++ b/lambda/deployment/workflows/delete.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/deployment/build_context" diff --git a/lambda/deployment/workflows/diagnose.yaml b/lambda/deployment/workflows/diagnose.yaml index e9425ea..10d6d39 100644 --- a/lambda/deployment/workflows/diagnose.yaml +++ b/lambda/deployment/workflows/diagnose.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/diagnose/build_context" diff --git 
a/lambda/deployment/workflows/finalize.yaml b/lambda/deployment/workflows/finalize.yaml index 2c49db9..98e32e4 100644 --- a/lambda/deployment/workflows/finalize.yaml +++ b/lambda/deployment/workflows/finalize.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/deployment/build_context" diff --git a/lambda/deployment/workflows/initial.yaml b/lambda/deployment/workflows/initial.yaml index 0b4723d..580642a 100644 --- a/lambda/deployment/workflows/initial.yaml +++ b/lambda/deployment/workflows/initial.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/deployment/build_context" diff --git a/lambda/deployment/workflows/rollback.yaml b/lambda/deployment/workflows/rollback.yaml index 49537a4..59c75db 100644 --- a/lambda/deployment/workflows/rollback.yaml +++ b/lambda/deployment/workflows/rollback.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/deployment/build_context" diff --git a/lambda/deployment/workflows/switch_traffic.yaml b/lambda/deployment/workflows/switch_traffic.yaml index b00f893..95023c5 100644 --- 
a/lambda/deployment/workflows/switch_traffic.yaml +++ b/lambda/deployment/workflows/switch_traffic.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/deployment/build_context" diff --git a/lambda/diagnose/build_context b/lambda/diagnose/build_context index d17e244..0120d32 100755 --- a/lambda/diagnose/build_context +++ b/lambda/diagnose/build_context @@ -16,19 +16,9 @@ fi source "$SERVICE_PATH/utils/lambda_function_name" -# Resolve the IAM role ARN to assume before any AWS call, mirroring -# fetch_scope_configuration: env override -> "AWS IAM" provider matched by the -# "lambda" selector (account nrn) -> ASSUME_ROLE_ARN_DEFAULT -> IRSA. -source "$SERVICE_PATH/utils/assume_role_lib" -ACCOUNT_NRN=$(echo "$SCOPE_NRN" | sed 's/:namespace=.*$//') -ASSUME_ROLE_SELECTOR="${ASSUME_ROLE_SELECTOR:-lambda}" -ASSUME_ROLE_ARN="${ASSUME_ROLE_ARN:-}" -if [ -z "$ASSUME_ROLE_ARN" ] && [ -n "$ACCOUNT_NRN" ] && [ -n "$ASSUME_ROLE_SELECTOR" ]; then - ASSUME_ROLE_ARN=$(provider_arn_for_selector "$ACCOUNT_NRN" "$ASSUME_ROLE_SELECTOR") -fi -export ASSUME_ROLE_ARN="${ASSUME_ROLE_ARN:-${ASSUME_ROLE_ARN_DEFAULT:-}}" - -source "$SERVICE_PATH/utils/assume_role" +# NOTE: The IAM role is assumed by the dedicated `assume_role` step that runs +# first in the workflow (see utils/assume_role_step); credentials are already in +# the environment here. 
lambda_info=$(aws lambda get-function --function-name "$LAMBDA_FUNCTION_NAME" --output json 2>/dev/null || echo "{}") LAMBDA_FUNCTION_ARN=$(echo "$lambda_info" | jq -r '.Configuration.FunctionArn // ""') diff --git a/lambda/instance/workflows/list.yaml b/lambda/instance/workflows/list.yaml index ad29d14..0efef23 100644 --- a/lambda/instance/workflows/list.yaml +++ b/lambda/instance/workflows/list.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/instance/build_context" diff --git a/lambda/log/workflows/log.yaml b/lambda/log/workflows/log.yaml index 391733d..bb48230 100644 --- a/lambda/log/workflows/log.yaml +++ b/lambda/log/workflows/log.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/log/build_context" diff --git a/lambda/metric/workflows/list.yaml b/lambda/metric/workflows/list.yaml index ecdf27e..e8c2bf6 100644 --- a/lambda/metric/workflows/list.yaml +++ b/lambda/metric/workflows/list.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: list_metrics type: script file: "$SERVICE_PATH/metric/list_metrics" diff --git a/lambda/metric/workflows/metric.yaml 
b/lambda/metric/workflows/metric.yaml index a3b3e61..725b76e 100644 --- a/lambda/metric/workflows/metric.yaml +++ b/lambda/metric/workflows/metric.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/metric/build_context" diff --git a/lambda/scope/tests/scripts/assume_role_lib.bats b/lambda/scope/tests/scripts/assume_role_lib.bats index 8b7da30..2428799 100644 --- a/lambda/scope/tests/scripts/assume_role_lib.bats +++ b/lambda/scope/tests/scripts/assume_role_lib.bats @@ -114,3 +114,59 @@ JSON='{"attributes":{"iam_role_arns":{"arns":[{"selector":"s3","arn":"arn:aws:ia assert_success [ -z "$output" ] } + +# --- resolve_assume_role_arn (full precedence chain) ----------------------- +# Each test defines its own stateless np stub (branches on args) so it survives +# the command-substitution subshells the resolver uses. 
+ +@test "resolve_assume_role_arn: env override wins over everything" { + export ASSUME_ROLE_ARN="arn:env" + run resolve_assume_role_arn "organization=1:account=2" lambda + assert_success + [ "$output" = "arn:env" ] +} + +@test "resolve_assume_role_arn: IAM provider when no env override" { + np() { + case "$*" in + *"--specification_slug aws-iam-configuration"*) echo '{"results":[{"id":"iam-1"}]}' ;; + *"provider read"*) echo '{"attributes":{"iam_role_arns":{"arns":[{"selector":"lambda","arn":"arn:provider:lambda"}]}}}' ;; + *) echo '{}' ;; + esac + } + export -f np + run resolve_assume_role_arn "organization=1:account=2" lambda + assert_success + [ "$output" = "arn:provider:lambda" ] +} + +@test "resolve_assume_role_arn: scope-config fallback when provider misses" { + np() { + case "$*" in + *"--specification_slug aws-iam-configuration"*) echo '{"results":[]}' ;; + *"--categories scope-configurations"*) echo '{"results":[{"attributes":{"assume_role":{"arn":"arn:scopecfg:legacy"}}}]}' ;; + *) echo '{}' ;; + esac + } + export -f np + run resolve_assume_role_arn "organization=1:account=2" lambda + assert_success + [ "$output" = "arn:scopecfg:legacy" ] +} + +@test "resolve_assume_role_arn: ASSUME_ROLE_ARN_DEFAULT when nothing else resolves" { + np() { echo '{"results":[]}'; } + export -f np + export ASSUME_ROLE_ARN_DEFAULT="arn:default" + run resolve_assume_role_arn "organization=1:account=2" lambda + assert_success + [ "$output" = "arn:default" ] +} + +@test "resolve_assume_role_arn: empty (IRSA) when nothing resolves and no default" { + np() { echo '{"results":[]}'; } + export -f np + run resolve_assume_role_arn "organization=1:account=2" lambda + assert_success + [ -z "$output" ] +} diff --git a/lambda/scope/workflows/adjust_provisioned_concurrency.yaml b/lambda/scope/workflows/adjust_provisioned_concurrency.yaml index 93b2a53..4728283 100644 --- a/lambda/scope/workflows/adjust_provisioned_concurrency.yaml +++ 
b/lambda/scope/workflows/adjust_provisioned_concurrency.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/scope/build_context" diff --git a/lambda/scope/workflows/adjust_reserved_concurrency.yaml b/lambda/scope/workflows/adjust_reserved_concurrency.yaml index 5fab200..3039133 100644 --- a/lambda/scope/workflows/adjust_reserved_concurrency.yaml +++ b/lambda/scope/workflows/adjust_reserved_concurrency.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/scope/build_context" diff --git a/lambda/scope/workflows/create.yaml b/lambda/scope/workflows/create.yaml index de08b9a..8ec86c3 100644 --- a/lambda/scope/workflows/create.yaml +++ b/lambda/scope/workflows/create.yaml @@ -3,6 +3,16 @@ include: configuration: TOFU_ACTION: "apply" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/scope/build_context" diff --git a/lambda/scope/workflows/delete.yaml b/lambda/scope/workflows/delete.yaml index 2fb8faf..c5bdf87 100644 --- a/lambda/scope/workflows/delete.yaml +++ b/lambda/scope/workflows/delete.yaml @@ -3,6 +3,16 @@ include: configuration: TOFU_ACTION: "destroy" steps: + - name: assume_role + type: script + 
file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/scope/build_context" diff --git a/lambda/scope/workflows/diagnose.yaml b/lambda/scope/workflows/diagnose.yaml index dd990df..f084fa4 100644 --- a/lambda/scope/workflows/diagnose.yaml +++ b/lambda/scope/workflows/diagnose.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/scope/build_context" diff --git a/lambda/scope/workflows/invoke.yaml b/lambda/scope/workflows/invoke.yaml index 58af584..fb9a25a 100644 --- a/lambda/scope/workflows/invoke.yaml +++ b/lambda/scope/workflows/invoke.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: "$SERVICE_PATH/scope/build_context" diff --git a/lambda/scope/workflows/update.yaml b/lambda/scope/workflows/update.yaml index f9a6bc4..9969bc8 100644 --- a/lambda/scope/workflows/update.yaml +++ b/lambda/scope/workflows/update.yaml @@ -1,6 +1,16 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: assume_role + type: script + file: "$SERVICE_PATH/utils/assume_role_step" + output: + - name: AWS_ACCESS_KEY_ID + type: environment + - name: AWS_SECRET_ACCESS_KEY + type: environment + - name: AWS_SESSION_TOKEN + type: environment - name: build_context type: script file: 
"$SERVICE_PATH/scope/build_context" diff --git a/lambda/utils/assume_role_lib b/lambda/utils/assume_role_lib index 5cd186c..7f2b503 100644 --- a/lambda/utils/assume_role_lib +++ b/lambda/utils/assume_role_lib @@ -41,3 +41,38 @@ provider_arn_for_selector() { data=$(np provider read --id "$pid" --format json 2>/dev/null) arn_for_selector_from_json "$data" "$selector" } + +# scope_config_assume_role_arn +# Back-compat fallback: reads the scope-configurations provider(s) at the given NRN and +# echoes the first .attributes.assume_role.arn found, or empty string. This is +# the legacy per-scope override that predates the IAM provider mechanism. +scope_config_assume_role_arn() { + local nrn="$1" + [ -n "$nrn" ] || return 0 + np provider list --nrn "$nrn" --categories scope-configurations \ + --format json --limit 100 2>/dev/null \ + | jq -r '[ (.results // [])[] | .attributes.assume_role.arn? // empty ] | first // ""' 2>/dev/null +} + +# resolve_assume_role_arn +# Full precedence chain, echoes the IAM role ARN to assume (empty = use IRSA): +# 1. $ASSUME_ROLE_ARN env var (explicit override) +# 2. "AWS IAM" provider (aws-iam-configuration) at the given NRN, matched by the given selector +# 3. scope-configurations provider key assume_role.arn (back-compat) +# 4.
$ASSUME_ROLE_ARN_DEFAULT env var (per-account agent default) +resolve_assume_role_arn() { + local nrn="$1" selector="$2" arn="" + + arn="${ASSUME_ROLE_ARN:-}" + + if [ -z "$arn" ] && [ -n "$nrn" ] && [ -n "$selector" ]; then + arn=$(provider_arn_for_selector "$nrn" "$selector") + fi + + if [ -z "$arn" ] && [ -n "$nrn" ]; then + arn=$(scope_config_assume_role_arn "$nrn") + fi + + arn="${arn:-${ASSUME_ROLE_ARN_DEFAULT:-}}" + printf '%s' "$arn" +} diff --git a/lambda/utils/assume_role_step b/lambda/utils/assume_role_step new file mode 100755 index 0000000..ccc86cc --- /dev/null +++ b/lambda/utils/assume_role_step @@ -0,0 +1,38 @@ +#!/bin/bash +# Dedicated workflow step: resolve the target IAM role and assume it, exporting +# temporary credentials so every subsequent step in the workflow inherits them. +# +# Runs FIRST in each AWS-touching workflow. The workflow YAML must declare +# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN as +# output:environment so the nullplatform engine propagates them downstream. +# +# Resolution precedence (see resolve_assume_role_arn in assume_role_lib): +# $ASSUME_ROLE_ARN env -> IAM provider by selector -> scope-config -> DEFAULT -> IRSA +# +# Requires: aws CLI, jq, np. Expects: CONTEXT, SERVICE_PATH. + +# Optional pretty logging (utils/assume_role falls back to echo if `log` absent). +[ -f "$SERVICE_PATH/utils/log" ] && source "$SERVICE_PATH/utils/log" +# shellcheck source=assume_role_lib +source "$SERVICE_PATH/utils/assume_role_lib" + +# Account NRN from CONTEXT (scope / service / generic event), stripping the +# :namespace= segment and everything after it — the IAM provider is account-level. 
+NRN=$(echo "${CONTEXT:-}" | jq -r '.scope.nrn // .service.nrn // .entity_nrn // ""' 2>/dev/null) +ACCOUNT_NRN=$(echo "$NRN" | sed 's/:namespace=.*$//') +ASSUME_ROLE_SELECTOR="${ASSUME_ROLE_SELECTOR:-lambda}" + +ASSUME_ROLE_ARN=$(resolve_assume_role_arn "$ACCOUNT_NRN" "$ASSUME_ROLE_SELECTOR") +export ASSUME_ROLE_ARN + +# utils/assume_role performs sts:AssumeRole and exports AWS_* when an ARN is set, +# or no-ops (leaving pod IRSA in place) when empty. It returns non-zero only when +# sts:AssumeRole itself fails. +if ! source "$SERVICE_PATH/utils/assume_role"; then + echo "❌ assume_role step failed: could not assume $ASSUME_ROLE_ARN" >&2 + echo "💡 Possible causes:" >&2 + echo " - The agent's pod role is not allowed to sts:AssumeRole the target role" >&2 + echo " - The target role's trust policy does not trust the agent role" >&2 + echo " - The resolved ARN is wrong (check the IAM provider selector=$ASSUME_ROLE_SELECTOR)" >&2 + exit 1 +fi diff --git a/lambda/utils/fetch_scope_configuration b/lambda/utils/fetch_scope_configuration index 9e467db..5ab0cc5 100755 --- a/lambda/utils/fetch_scope_configuration +++ b/lambda/utils/fetch_scope_configuration @@ -78,42 +78,10 @@ log debug " ✅ placeholder_image_uri=${PLACEHOLDER_IMAGE_URI:-(not set, using NULL_AGENT_LAYER_ARN=$(echo "$SCOPE_CONFIG" | jq -r '.agent.null_agent_layer_arn // empty') log debug " ✅ null_agent_layer_arn=$NULL_AGENT_LAYER_ARN" -# --- Resolve the IAM role ARN to assume ------------------------------------- -# Precedence (highest to lowest): -# 1. $ASSUME_ROLE_ARN already in the environment (explicit override). -# 2. The "AWS IAM" provider (Identity & Access Control, spec -# aws-iam-configuration) at the ACCOUNT nrn, matched by selector. The -# selector defaults to "lambda"; override with the ASSUME_ROLE_SELECTOR -# env var (values.yaml / agent extra_envs) if the provider uses another key. -# 3. scope-configurations provider key assume_role.arn (back-compat). -# 4. 
$ASSUME_ROLE_ARN_DEFAULT env var (per-account default on the agent). -# 5. None of the above -> empty -> use the pod's IRSA credentials directly. -# shellcheck source=assume_role_lib -source "$SERVICE_PATH/utils/assume_role_lib" - -# The IAM provider lives at the account level; derive the account nrn from the -# scope nrn by stripping everything from :namespace= onward. -ACCOUNT_NRN=$(echo "$NRN" | sed 's/:namespace=.*$//') -ASSUME_ROLE_SELECTOR="${ASSUME_ROLE_SELECTOR:-lambda}" - -ASSUME_ROLE_ARN="${ASSUME_ROLE_ARN:-}" - -# 2. nullplatform IAM provider, by selector. -if [ -z "$ASSUME_ROLE_ARN" ] && [ -n "$ACCOUNT_NRN" ] && [ -n "$ASSUME_ROLE_SELECTOR" ]; then - ASSUME_ROLE_ARN=$(provider_arn_for_selector "$ACCOUNT_NRN" "$ASSUME_ROLE_SELECTOR") - if [ -n "$ASSUME_ROLE_ARN" ]; then - log debug " ✅ assume_role_arn from IAM provider (selector=$ASSUME_ROLE_SELECTOR)" - fi -fi - -# 3. scope-configurations provider key (back-compat). -if [ -z "$ASSUME_ROLE_ARN" ]; then - ASSUME_ROLE_ARN=$(echo "$SCOPE_CONFIG" | jq -r '.assume_role.arn // empty') -fi - -# 4. Per-account default env var. -ASSUME_ROLE_ARN="${ASSUME_ROLE_ARN:-${ASSUME_ROLE_ARN_DEFAULT:-}}" -log debug " ✅ assume_role_arn=${ASSUME_ROLE_ARN:-(not set, using pod credentials)}" +# NOTE: The IAM role is assumed by the dedicated `assume_role` step that runs +# first in each workflow (see utils/assume_role_step). By the time this script +# runs, AWS_ACCESS_KEY_ID/SECRET/SESSION_TOKEN are already in the environment, so +# no assume-role resolution happens here anymore. 
export ALB_PUBLIC_LISTENER_ARN export ALB_PRIVATE_LISTENER_ARN @@ -127,9 +95,5 @@ export HOSTED_PRIVATE_ZONE_ID export TOFU_STATE_BUCKET export PLACEHOLDER_IMAGE_URI export NULL_AGENT_LAYER_ARN -export ASSUME_ROLE_ARN - -# Apply assume role immediately so all subsequent AWS calls run under the target role -source "$SERVICE_PATH/utils/assume_role" log info "✨ Scope configuration fetched successfully" From 23e25154a706e45f6101b23ff5561f7555e97123 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 9 Jun 2026 16:36:27 -0300 Subject: [PATCH 21/24] feat(iam): make Lambda execution-role prefix configurable The execution role name was hardcoded as np-lambda-${LAMBDA_FUNCTION_NAME}-role. Resolve the prefix via get_config_value (scope-config provider lambda.execution_role_prefix > LAMBDA_EXECUTION_ROLE_PREFIX env > default "np-lambda-"), keeping the previous name as the default so existing scopes are unaffected. Warn (non-blocking) when the prefix falls outside the assume role's IAM policy constraint (np-lambda-* / nullplatform-*), since CreateRole/PassRole would otherwise be denied unless that policy is widened. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/scope/tofu/iam/setup | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/lambda/scope/tofu/iam/setup b/lambda/scope/tofu/iam/setup index 09fa234..a113d04 100755 --- a/lambda/scope/tofu/iam/setup +++ b/lambda/scope/tofu/iam/setup @@ -1,14 +1,30 @@ #!/bin/bash source "$SERVICE_PATH/utils/log" +source "$SERVICE_PATH/utils/get_config_value" log info "🔍 Configuring IAM role for deployment..." -# Prefix with "np-lambda-" so the role name matches the iam:CreateRole/PassRole -# Resource constraint in lambda/setup (arn:aws:iam::*:role/np-lambda-*). -iam_role_name="np-lambda-${LAMBDA_FUNCTION_NAME}-role" +# Execution-role name = ${exec_role_prefix}${LAMBDA_FUNCTION_NAME}-role.
The prefix is configurable +# (scope-config provider > LAMBDA_EXECUTION_ROLE_PREFIX env > default), but it +# MUST keep matching the iam:CreateRole/PassRole Resource constraint of the +# assume role's policy in lambda/setup (arn:aws:iam::*:role/np-lambda-* or +# .../nullplatform-*); otherwise CreateRole/PassRole are denied. The default +# preserves the historical "np-lambda-". +exec_role_prefix=$(get_config_value \ + --provider '.providers["scope-configurations"].lambda.execution_role_prefix' \ + --env LAMBDA_EXECUTION_ROLE_PREFIX \ + --default 'np-lambda-') + +iam_role_name="${exec_role_prefix}${LAMBDA_FUNCTION_NAME}-role" iam_role_name="${iam_role_name:0:64}" +# Warn (don't block) if the prefix falls outside the policy's allowed prefixes. +case "$exec_role_prefix" in + np-lambda-*|nullplatform-*) : ;; + *) log warn " ⚠️ execution_role_prefix='$exec_role_prefix' is outside the assume role's IAM policy constraint (np-lambda-* / nullplatform-*); CreateRole/PassRole may be denied unless that policy is updated" ;; +esac + log debug " 📋 role_name=$iam_role_name" role_output=$(aws iam get-role --role-name "$iam_role_name" 2>&1) From 0b684760c3f8623c6ab6bada94bef4939efce99a Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 9 Jun 2026 16:40:21 -0300 Subject: [PATCH 22/24] chore(do_tofu): remove the stderr-redirect explanation comment Drop the comment block above the tofu run; the 2>&1 redirect itself is unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/scope/tofu/do_tofu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lambda/scope/tofu/do_tofu b/lambda/scope/tofu/do_tofu index bb90822..e6d94e5 100755 --- a/lambda/scope/tofu/do_tofu +++ b/lambda/scope/tofu/do_tofu @@ -176,10 +176,6 @@ if [ "$TOFU_ACTION" = "apply" ]; then fi -# Run tofu action -# Redirect stderr to stdout: OpenTofu writes its "Error:" block to stderr, and the -# NP workflow executor only captures stdout — without this, the actual failure -# reason (e.g. 
an AWS AccessDenied) never reaches the NP logs. log info "📝 Running tofu $TOFU_ACTION..." tofu_exit_code=0 tofu -chdir="$TF_WORKING_DIR" "$TOFU_ACTION" -auto-approve -var-file="$TOFU_VAR_FILE" 2>&1 || tofu_exit_code=$? From 51836a5c90f84e0f24e4cfaa803f2440de23c487 Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Tue, 9 Jun 2026 19:01:35 -0300 Subject: [PATCH 23/24] refactor: move install tofu module from lambda/setup to lambda/specs/tofu Relocate the standalone install module (IAM role+policies and the NP scope_definition registration) so the .tf lives next to the .json.tpl specs it consumes. No behavior change; still a standalone root module. - git mv lambda/setup -> lambda/specs/tofu (history preserved) - backend.tf key: lambda/setup/... -> lambda/specs/tofu/... - installation.md / prerequisites.md: update the paths Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/installation.md | 2 +- lambda/prerequisites.md | 4 ++-- lambda/setup/backend.tf | 5 ----- lambda/specs/tofu/backend.tf | 5 +++++ lambda/{setup => specs/tofu}/main.tf | 0 lambda/{setup => specs/tofu}/outputs.tf | 0 lambda/{setup => specs/tofu}/provider.tf | 0 lambda/{setup => specs/tofu}/requirements.tf | 0 lambda/{setup => specs/tofu}/terraform.tfvars.example | 0 lambda/{setup => specs/tofu}/variables.tf | 0 10 files changed, 8 insertions(+), 8 deletions(-) delete mode 100644 lambda/setup/backend.tf create mode 100644 lambda/specs/tofu/backend.tf rename lambda/{setup => specs/tofu}/main.tf (100%) rename lambda/{setup => specs/tofu}/outputs.tf (100%) rename lambda/{setup => specs/tofu}/provider.tf (100%) rename lambda/{setup => specs/tofu}/requirements.tf (100%) rename lambda/{setup => specs/tofu}/terraform.tfvars.example (100%) rename lambda/{setup => specs/tofu}/variables.tf (100%) diff --git a/lambda/installation.md b/lambda/installation.md index 25dd240..f34eb07 100644 --- a/lambda/installation.md +++ b/lambda/installation.md @@ -29,7 +29,7 @@ git clone 
https://github.com/nullplatform/tofu-modules /root/.np/nullplatform/to ### 2. Configure variables ```bash -cd lambda/setup +cd lambda/specs/tofu cp terraform.tfvars.example terraform.tfvars ``` diff --git a/lambda/prerequisites.md b/lambda/prerequisites.md index 8663155..c569754 100644 --- a/lambda/prerequisites.md +++ b/lambda/prerequisites.md @@ -229,7 +229,7 @@ Agents run in a Kubernetes pod and authenticate to AWS via a **Service Account** The IAM policies above let the agent CREATE Lambda functions and target groups, but the `create-scope` workflow ALSO depends on three runtime artifacts that must exist BEFORE the first scope is created. None are -auto-created by the bundled `setup/main.tf` today — the operator +auto-created by the bundled `specs/tofu/main.tf` today — the operator must provision them. ### 1. Placeholder image (private ECR) @@ -383,7 +383,7 @@ This applies to **every** ECR repository that ever stores a Lambda image: 1. The placeholder ECR (created during installation, addressed by - `lambda/setup/main.tf` if you use the bundled module — the policy is + `lambda/specs/tofu/main.tf` if you use the bundled module — the policy is already applied there). 2. 
**The per-application ECR repositories** that `np asset push` creates dynamically when each app does its first build, named diff --git a/lambda/setup/backend.tf b/lambda/setup/backend.tf deleted file mode 100644 index a63cc72..0000000 --- a/lambda/setup/backend.tf +++ /dev/null @@ -1,5 +0,0 @@ -terraform { - backend "s3" { - key = "lambda/setup/terraform.tfstate" - } -} diff --git a/lambda/specs/tofu/backend.tf b/lambda/specs/tofu/backend.tf new file mode 100644 index 0000000..cc6acba --- /dev/null +++ b/lambda/specs/tofu/backend.tf @@ -0,0 +1,5 @@ +terraform { + backend "s3" { + key = "lambda/specs/tofu/terraform.tfstate" + } +} diff --git a/lambda/setup/main.tf b/lambda/specs/tofu/main.tf similarity index 100% rename from lambda/setup/main.tf rename to lambda/specs/tofu/main.tf diff --git a/lambda/setup/outputs.tf b/lambda/specs/tofu/outputs.tf similarity index 100% rename from lambda/setup/outputs.tf rename to lambda/specs/tofu/outputs.tf diff --git a/lambda/setup/provider.tf b/lambda/specs/tofu/provider.tf similarity index 100% rename from lambda/setup/provider.tf rename to lambda/specs/tofu/provider.tf diff --git a/lambda/setup/requirements.tf b/lambda/specs/tofu/requirements.tf similarity index 100% rename from lambda/setup/requirements.tf rename to lambda/specs/tofu/requirements.tf diff --git a/lambda/setup/terraform.tfvars.example b/lambda/specs/tofu/terraform.tfvars.example similarity index 100% rename from lambda/setup/terraform.tfvars.example rename to lambda/specs/tofu/terraform.tfvars.example diff --git a/lambda/setup/variables.tf b/lambda/specs/tofu/variables.tf similarity index 100% rename from lambda/setup/variables.tf rename to lambda/specs/tofu/variables.tf From 6752beefd4ebc313c936efd46c8d3e027b4022fe Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Wed, 10 Jun 2026 12:59:05 -0300 Subject: [PATCH 24/24] fix(specs/tofu): bump aws provider constraint to ~> 6.47.0 The specs/tofu module now also creates the Lambda IAM requirements (aws 
resources), so its provider pin must be compatible with consumers running the AWS provider 6.x line (EKS/agent stack). ~> 5.0 made the provider graph unresolvable when composed with those modules. Co-Authored-By: Claude Opus 4.8 (1M context) --- lambda/specs/tofu/provider.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambda/specs/tofu/provider.tf b/lambda/specs/tofu/provider.tf index 4fa3661..5ea18b8 100644 --- a/lambda/specs/tofu/provider.tf +++ b/lambda/specs/tofu/provider.tf @@ -18,7 +18,7 @@ terraform { } aws = { source = "hashicorp/aws" - version = "~> 5.0" + version = "~> 6.47.0" } } }