From 025de7eb23dea72c1fedaf3169256e116c2ac98f Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 27 May 2026 12:22:34 -0300 Subject: [PATCH 01/14] feat: opt-in ALB autocreation when pool capacity is exhausted --- CHANGELOG.md | 1 + k8s/docs/autocreate-alb.md | 76 +++++ k8s/scope/networking/autocreate_alb | 213 ++++++++++++++ k8s/scope/networking/resolve_balancer | 74 ++++- k8s/scope/templates/ingress-dummy.yaml.tpl | 28 ++ .../tests/networking/autocreate_alb.bats | 262 ++++++++++++++++++ .../tests/networking/resolve_balancer.bats | 213 +++++++++++++- k8s/values.yaml | 6 + 8 files changed, 870 insertions(+), 3 deletions(-) create mode 100644 k8s/docs/autocreate-alb.md create mode 100644 k8s/scope/networking/autocreate_alb create mode 100644 k8s/scope/templates/ingress-dummy.yaml.tpl create mode 100644 k8s/scope/tests/networking/autocreate_alb.bats diff --git a/CHANGELOG.md b/CHANGELOG.md index fb7861d6..acec6fd1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Add opt-in ALB autocreation for k8s scopes: when every declared ALB is at or above `ALB_MAX_CAPACITY`, the platform provisions a new ALB via a dummy Ingress, tags it for future discovery, and uses it for the scope. Disabled by default; enable with `ALB_AUTOCREATE_ENABLED=true`. 
Requires additional AWS permissions: `elasticloadbalancing:AddTags`, `elasticloadbalancing:DescribeTags`, `tag:GetResources` - Public and private scopes now register DNS records in their correct Route53 hosted zone when using `DNS_TYPE=external_dns`, preventing cross-zone record leakage - Add configurable main HTTP port for k8s scopes (default 8080) and HTTP support for additional ports - Improve **wait deployment active** failure logging: consolidate repeated `Unhealthy` probe events per pod into a single human-readable line, emit a progress heartbeat every 10% of timeout, and surface a targeted suggested fix based on the probe failure mode (port not open / HTTP non-2xx / probe timeout) diff --git a/k8s/docs/autocreate-alb.md b/k8s/docs/autocreate-alb.md new file mode 100644 index 00000000..bac8bb3d --- /dev/null +++ b/k8s/docs/autocreate-alb.md @@ -0,0 +1,76 @@ +# ALB Autocreation + +The k8s scope can provision new Application Load Balancers (ALBs) on demand when the declared pool of ALBs is exhausted. The behavior is opt-in and only triggers during scope creation; existing scopes are never moved to autocreated ALBs automatically. + +## When the autocreate path runs + +The flow only triggers when **all** of the following are true: + +- `ALB_AUTOCREATE_ENABLED=true` in `values.yaml` or in the `container-orchestration` provider. +- `DNS_TYPE=route53` (autocreation requires the same load-balancing path used by Route53 scopes). +- Every candidate ALB in the pool (declared base + additional balancers + previously autocreated ALBs discovered by tag) reports a rule count `>= ALB_MAX_CAPACITY`. +- The scope being created does not already have a Route53 record (a scope being recreated reuses its existing ALB and does not trigger autocreation). + +If any candidate is below the threshold, the scope creation uses that candidate and the autocreate path is not taken. 
+ +## Configuration + +| Key | Default | Description | +|---|---|---| +| `ALB_AUTOCREATE_ENABLED` | `false` | Master switch. When `false`, behavior is identical to previous releases. | +| `ALB_AUTOCREATE_NAME_PREFIX` | `nullplatform-auto-` | Prefix for autocreated ALB names. Final name format: `<prefix><visibility>-<6 hex chars>`, where `<visibility>` is `public` or `private`. Total length must not exceed the AWS 32-character ALB name limit. | +| `ALB_AUTOCREATE_TIMEOUT_SECONDS` | `300` | How long the script polls AWS for the new ALB to reach `state=active` before failing the scope creation. The AWS Load Balancer Controller usually takes 2–4 minutes. | + +All three keys are also readable from `providers.container-orchestration.balancer.{autocreate_enabled, autocreate_name_prefix, autocreate_timeout_seconds}`. + +## How it works + +1. `resolve_balancer` evaluates the candidate pool (declared + tag-discovered ALBs) and picks the least-loaded one as today. +2. If that candidate's rule count is at or above `ALB_MAX_CAPACITY` and `ALB_AUTOCREATE_ENABLED=true`, `resolve_balancer` sources `autocreate_alb`. +3. `autocreate_alb` generates a unique ALB name, renders `scope/templates/ingress-dummy.yaml.tpl`, and applies it. The AWS Load Balancer Controller picks up the Ingress and provisions the ALB. +4. The script polls `aws elbv2 describe-load-balancers` every 10 seconds until the ALB reports `State.Code=active` (or `failed`/timeout, in which case the scope creation fails). +5. Once active, the script tags the ALB with: + - `nullplatform:managed-by=autocreate` + - `nullplatform:visibility=internet-facing|internal` + - `nullplatform:created-by-scope-id=<scope-id>` +6. `resolve_balancer` substitutes the new ALB name and the rest of the scope creation proceeds. + +## Discovery of previously autocreated ALBs + +Every scope creation queries `resourcegroupstaggingapi:get-resources` for ALBs tagged `nullplatform:managed-by=autocreate` matching the scope's visibility. 
Discovered ALBs are merged into the candidate pool without any provider configuration change, so a single autocreated ALB serves many subsequent scopes before another autocreation is needed. + +Discovery runs regardless of `ALB_AUTOCREATE_ENABLED`: even if the flag is later turned off, previously autocreated ALBs remain usable. + +## Required AWS permissions + +In addition to the permissions already required for capacity validation, the agent role needs: + +- `elasticloadbalancing:AddTags` — to tag the new ALB so discovery can find it. +- `elasticloadbalancing:DescribeTags` — for the discovery path (covered by capacity validation in most agents, listed here for completeness). +- `tag:GetResources` — for the `resourcegroupstaggingapi` call used by discovery. + +The dummy Ingress requires no new K8s permissions beyond those the agent already has for scope resources. + +## Operational notes + +- Scope creations that trigger autocreation are slower (typically 2–4 minutes extra). This is the expected behavior, not a regression. The platform logs `🔧 All candidate ALBs are at or above capacity (...); triggering autocreate` when it happens. +- The dummy Ingress (`nullplatform-autocreate-<alb-name>`) is created in the scope's namespace. It exposes no traffic and exists only to keep the ALB alive. Deleting it manually will cause the AWS Load Balancer Controller to delete the ALB. +- The ALB is registered through AWS tags rather than through the nullplatform provider configuration. Two consequences: + 1. The nullplatform provider object does not need to be updated by the script; this avoids requiring API credentials inside the scope workflow. + 2. The cloud's IaC (Terraform, OpenTofu, CloudFormation) is **not** updated automatically. If your IaC is the source of truth for ALB inventory, you should reconcile autocreated ALBs into it through your own process. 
+ +## Failure modes + +| Failure | Outcome | +|---|---| +| Dummy Ingress template render fails | Scope creation exits 1 with `Failed to render ingress-dummy template`. | +| `kubectl apply` fails | Scope creation exits 1 with `Failed to apply ingress-dummy` and prints the namespace check hint. | +| ALB never reaches `active` within `ALB_AUTOCREATE_TIMEOUT_SECONDS` | Scope creation exits 1; check controller logs and AWS quota for ALBs in the region. | +| AWS reports the ALB state as `failed` | Scope creation exits 1 immediately. | +| `AddTags` call fails (no IAM permission) | Logged as `⚠️ Could not tag ALB; subsequent discovery may not find it`. The scope creation continues; the next creation will not find this ALB by tag and may autocreate another one. | + +## What is out of scope + +- Migration of existing scopes to autocreated ALBs. Use the `Recreate scope` action if needed. +- Automatic cleanup of unused autocreated ALBs (no scopes referencing them). +- Updating the cloud IaC (Terraform / OpenTofu / CloudFormation) with the new ALB. diff --git a/k8s/scope/networking/autocreate_alb b/k8s/scope/networking/autocreate_alb new file mode 100644 index 00000000..1b6ffacb --- /dev/null +++ b/k8s/scope/networking/autocreate_alb @@ -0,0 +1,213 @@ +#!/bin/bash + +# Creates a new ALB on demand when the existing pool is exhausted. +# +# The script applies a dummy Ingress that the AWS Load Balancer Controller +# uses as the trigger to provision a new ALB. It then polls AWS until the +# ALB reaches the `active` state before returning the new ALB name to the +# caller via the AUTOCREATED_ALB_NAME output variable. +# +# Inputs (env vars): +# CONTEXT - Scope CONTEXT JSON (with .scope, .k8s_namespace, etc.) 
+# INGRESS_VISIBILITY - "internet-facing" or "internal" +# REGION - AWS region +# ALB_AUTOCREATE_NAME_PREFIX - Prefix for the new ALB (default: nullplatform-auto-) +# ALB_AUTOCREATE_TIMEOUT_SECONDS - Max seconds to wait for ALB active (default: 300) +# K8S_NAMESPACE - Namespace to place the dummy Ingress in +# +# Outputs (env vars): +# AUTOCREATED_ALB_NAME - Name of the ALB that was created + +_AUTOCREATE_ALB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if ! type -t log >/dev/null 2>&1; then + source "$_AUTOCREATE_ALB_DIR/../../logging" +fi + +if ! type -t get_config_value >/dev/null 2>&1; then + source "$_AUTOCREATE_ALB_DIR/../../utils/get_config_value" +fi + +# Generates a unique-enough ALB name for autocreation. +# Format: <prefix><visibility-short>-<6-hex> +# Visibility short: "public" for internet-facing, "private" for internal. +# Total length must stay within 32 chars (AWS ALB name limit); not enforced here. +generate_alb_name() { + local prefix="$1" + local visibility="$2" + + local vis_short + if [ "$visibility" = "internet-facing" ]; then + vis_short="public" + else + vis_short="private" + fi + + local suffix + suffix=$(LC_ALL=C tr -dc 'a-f0-9' < /dev/urandom 2>/dev/null | head -c 6) || { + suffix=$(printf '%06x' $((RANDOM * RANDOM % 0xFFFFFF))) + } + + echo "${prefix}${vis_short}-${suffix}" +} + +# Renders the dummy Ingress template and applies it to the cluster. +# Returns 0 on success, non-zero on failure (template render or kubectl apply). +apply_dummy_ingress() { + local alb_name="$1" + local visibility="$2" + local namespace="$3" + + local tmp_context + tmp_context=$(mktemp) || return 1 + local tmp_yaml + tmp_yaml=$(mktemp --suffix=.yaml 2>/dev/null) || tmp_yaml="$(mktemp).yaml" + + echo "$CONTEXT" | jq \ + --arg alb_name "$alb_name" \ + --arg ingress_visibility "$visibility" \ + --arg k8s_namespace "$namespace" \ + '. 
+ {alb_name: $alb_name, ingress_visibility: $ingress_visibility, k8s_namespace: $k8s_namespace}' \ + > "$tmp_context" + + local template_path="${INGRESS_DUMMY_TEMPLATE:-$SERVICE_PATH/scope/templates/ingress-dummy.yaml.tpl}" + + if ! gomplate -c .="$tmp_context" --file "$template_path" --out "$tmp_yaml" 2>/dev/null; then + log error "❌ Failed to render ingress-dummy template" + log error "📋 Template: $template_path" + rm -f "$tmp_context" "$tmp_yaml" + return 1 + fi + + if ! kubectl apply -f "$tmp_yaml" >/dev/null 2>&1; then + log error "❌ Failed to apply ingress-dummy for ALB '$alb_name'" + log error "" + log error "💡 Possible causes:" + log error " The agent may lack permissions on Ingress resources or the namespace may not exist" + log error "" + log error "🔧 How to fix:" + log error " • kubectl apply -f $tmp_yaml" + log error " • Verify namespace '$namespace' exists" + rm -f "$tmp_context" "$tmp_yaml" + return 1 + fi + + rm -f "$tmp_context" "$tmp_yaml" + return 0 +} + +# Polls AWS until the ALB reaches the 'active' state, applying the autocreate +# tags once it is reachable. Returns 0 if active within timeout, 1 otherwise. +wait_for_alb_active() { + local alb_name="$1" + local timeout_seconds="$2" + local scope_id="$3" + local visibility="$4" + + local deadline=$(($(date +%s) + timeout_seconds)) + local poll_interval=10 + local state="" + local alb_arn="" + + log info "⏳ Waiting up to ${timeout_seconds}s for ALB '$alb_name' to become active..." 
+ + while [ "$(date +%s)" -lt "$deadline" ]; do + alb_arn=$(aws elbv2 describe-load-balancers \ + --names "$alb_name" \ + --region "$REGION" \ + --query 'LoadBalancers[0].LoadBalancerArn' \ + --output text 2>/dev/null) || alb_arn="" + + if [ -n "$alb_arn" ] && [ "$alb_arn" != "None" ]; then + state=$(aws elbv2 describe-load-balancers \ + --names "$alb_name" \ + --region "$REGION" \ + --query 'LoadBalancers[0].State.Code' \ + --output text 2>/dev/null) || state="" + + log debug "📋 ALB '$alb_name' state: ${state:-pending}" + + if [ "$state" = "active" ]; then + log info "✅ ALB '$alb_name' is active" + tag_alb "$alb_arn" "$scope_id" "$visibility" || true + return 0 + fi + + if [ "$state" = "failed" ]; then + log error "❌ ALB '$alb_name' reached state 'failed'" + return 1 + fi + fi + + sleep "$poll_interval" + done + + log error "❌ Timed out after ${timeout_seconds}s waiting for ALB '$alb_name' to become active" + log error "" + log error "💡 Possible causes:" + log error " The AWS Load Balancer Controller may be slow, mis-configured, or the AWS account may be hitting an ALB quota" + log error "" + log error "🔧 How to fix:" + log error " • Check controller logs: kubectl -n kube-system logs deploy/aws-load-balancer-controller" + log error " • Verify ALB quota: aws service-quotas get-service-quota --service-code elasticloadbalancing --quota-code L-53DA6B97" + log error " • Inspect the dummy Ingress: kubectl get ingress -A -l alb_name=$alb_name" + return 1 +} + +# Tags the ALB so resolve_balancer can discover it in subsequent scope creations. 
+tag_alb() { + local alb_arn="$1" + local scope_id="$2" + local visibility="$3" + + aws elbv2 add-tags \ + --resource-arns "$alb_arn" \ + --region "$REGION" \ + --tags \ + "Key=nullplatform:managed-by,Value=autocreate" \ + "Key=nullplatform:visibility,Value=$visibility" \ + "Key=nullplatform:created-by-scope-id,Value=$scope_id" \ + >/dev/null 2>&1 || { + log warn "⚠️ Could not tag ALB; subsequent discovery may not find it" + return 1 + } + + log debug "📋 Tagged ALB with nullplatform:managed-by=autocreate" + return 0 +} + +# ============================================================================= +# Main +# ============================================================================= + +NAME_PREFIX=$(get_config_value \ + --env ALB_AUTOCREATE_NAME_PREFIX \ + --provider '.providers["container-orchestration"].balancer.autocreate_name_prefix' \ + --default "nullplatform-auto-" +) + +TIMEOUT_SECONDS=$(get_config_value \ + --env ALB_AUTOCREATE_TIMEOUT_SECONDS \ + --provider '.providers["container-orchestration"].balancer.autocreate_timeout_seconds' \ + --default "300" +) + +if ! [[ "$TIMEOUT_SECONDS" =~ ^[0-9]+$ ]]; then + log error "❌ ALB_AUTOCREATE_TIMEOUT_SECONDS must be a positive integer, got: '$TIMEOUT_SECONDS'" + return 1 +fi + +SCOPE_ID=$(echo "$CONTEXT" | jq -r '.scope.id // "unknown"') + +NEW_ALB_NAME=$(generate_alb_name "$NAME_PREFIX" "$INGRESS_VISIBILITY") +log info "🔧 Autocreating ALB '$NEW_ALB_NAME' (visibility=$INGRESS_VISIBILITY)" + +if ! apply_dummy_ingress "$NEW_ALB_NAME" "$INGRESS_VISIBILITY" "$K8S_NAMESPACE"; then + return 1 +fi + +if ! 
wait_for_alb_active "$NEW_ALB_NAME" "$TIMEOUT_SECONDS" "$SCOPE_ID" "$INGRESS_VISIBILITY"; then + return 1 +fi + +export AUTOCREATED_ALB_NAME="$NEW_ALB_NAME" diff --git a/k8s/scope/networking/resolve_balancer b/k8s/scope/networking/resolve_balancer index fd07a68f..57ddbac3 100755 --- a/k8s/scope/networking/resolve_balancer +++ b/k8s/scope/networking/resolve_balancer @@ -124,6 +124,32 @@ get_alb_from_route53() { echo "$alb_name" } +# Lists ALB names tagged as autocreated for the given visibility. Used to keep +# previously-autocreated ALBs in the candidate pool without requiring them to +# be declared in the provider configuration. +# Returns newline-separated names on stdout (may be empty), non-zero on AWS failure. +get_autocreated_albs() { + local visibility="$1" + + local arns + arns=$(aws resourcegroupstaggingapi get-resources \ + --region "$REGION" \ + --resource-type-filters "elasticloadbalancing:loadbalancer" \ + --tag-filters \ + "Key=nullplatform:managed-by,Values=autocreate" \ + "Key=nullplatform:visibility,Values=$visibility" \ + --query 'ResourceTagMappingList[].ResourceARN' \ + --output text 2>/dev/null) || return 1 + + if [ -z "$arns" ] || [ "$arns" = "None" ]; then + return 0 + fi + + # ALB ARN format: arn:aws:elasticloadbalancing:<region>:<account-id>:loadbalancer/app/<name>/<id> + # Skip non-application LBs (would have loadbalancer/net/... or loadbalancer/gwy/...). + echo "$arns" | tr '\t' '\n' | awk -F/ '$2 == "app" { print $3 }' +} + # ============================================================================= # Main logic # ============================================================================= @@ -174,10 +200,26 @@ if [[ "$DNS_TYPE" == "route53" ]]; then ) fi + # Discovered ALBs: autocreated by previous scope creations and tagged so we + # can include them in the pool without provider config changes. 
+ AUTOCREATED_BALANCERS=$(get_autocreated_albs "$INGRESS_VISIBILITY" 2>/dev/null) || AUTOCREATED_BALANCERS="" + + HAS_ADDITIONAL=0 if [ -n "$ADDITIONAL_BALANCERS" ] && [ "$ADDITIONAL_BALANCERS" != "null" ] && [ "$ADDITIONAL_BALANCERS" != "[]" ]; then - log debug "🔍 Additional balancers configured, resolving least-loaded ALB..." + HAS_ADDITIONAL=1 + fi + + if [ "$HAS_ADDITIONAL" -eq 1 ] || [ -n "$AUTOCREATED_BALANCERS" ]; then + log debug "🔍 Resolving least-loaded ALB across declared and autocreated candidates..." - CANDIDATES=$(echo "$ADDITIONAL_BALANCERS" | jq -r --arg base "$ALB_NAME" '[$base] + . | .[]') + if [ "$HAS_ADDITIONAL" -eq 1 ]; then + DECLARED_LIST=$(echo "$ADDITIONAL_BALANCERS" | jq -r --arg base "$ALB_NAME" '[$base] + . | .[]') + else + DECLARED_LIST="$ALB_NAME" + fi + + # Merge declared + autocreated, deduplicate, drop blanks. + CANDIDATES=$(printf '%s\n%s\n' "$DECLARED_LIST" "$AUTOCREATED_BALANCERS" | tr ' ' '\n' | awk 'NF && !seen[$0]++') log debug "📋 Candidate balancers: $(echo "$CANDIDATES" | paste -sd ',' - | sed 's/,/, /g')" @@ -203,6 +245,34 @@ if [[ "$DNS_TYPE" == "route53" ]]; then fi ALB_NAME="$BEST_ALB" + + # Autocreate fallback: every candidate is at or above the threshold and + # autocreate is enabled. Delegate to autocreate_alb to provision a new + # ALB and replace ALB_NAME with it. 
+ AUTOCREATE_ENABLED=$(get_config_value \ + --env ALB_AUTOCREATE_ENABLED \ + --provider '.providers["container-orchestration"].balancer.autocreate_enabled' \ + --default "false" + ) + + MAX_CAPACITY=$(get_config_value \ + --env ALB_MAX_CAPACITY \ + --provider '.providers["scope-configurations"].networking.alb_max_capacity' \ + --provider '.providers["container-orchestration"].balancer.alb_capacity_threshold' \ + --default "75" + ) + + if [ "$AUTOCREATE_ENABLED" = "true" ] && [ "$MIN_RULES" -ge 0 ] && [ "$MIN_RULES" -ge "$MAX_CAPACITY" ]; then + log info "🔧 All candidate ALBs are at or above capacity ($MIN_RULES/$MAX_CAPACITY); triggering autocreate" + source "$_RESOLVE_BALANCER_DIR/autocreate_alb" || { + log error "❌ Autocreate failed; ALB '$ALB_NAME' will be used and validate_alb_capacity will reject the deployment" + return 1 + } + if [ -n "${AUTOCREATED_ALB_NAME:-}" ]; then + ALB_NAME="$AUTOCREATED_ALB_NAME" + log info "📝 Using autocreated ALB '$ALB_NAME'" + fi + fi fi fi else diff --git a/k8s/scope/templates/ingress-dummy.yaml.tpl b/k8s/scope/templates/ingress-dummy.yaml.tpl new file mode 100644 index 00000000..3d6bd3f8 --- /dev/null +++ b/k8s/scope/templates/ingress-dummy.yaml.tpl @@ -0,0 +1,28 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: nullplatform-autocreate-{{ .alb_name }} + namespace: {{ .k8s_namespace }} + labels: + nullplatform: "true" + nullplatform-autocreate: "true" + alb_name: {{ .alb_name }} + annotations: + alb.ingress.kubernetes.io/group.name: {{ .alb_name }} + alb.ingress.kubernetes.io/listen-ports: '[{"HTTP":80},{"HTTPS":443}]' + alb.ingress.kubernetes.io/load-balancer-name: {{ .alb_name }} + alb.ingress.kubernetes.io/scheme: {{ .ingress_visibility }} + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/tags: nullplatform:managed-by=autocreate,nullplatform:visibility={{ .ingress_visibility }},nullplatform:created-by-scope-id={{ .scope.id }} +spec: + ingressClassName: alb + rules: + - http: + paths: 
+ - path: /__nullplatform_autocreate_placeholder + pathType: Prefix + backend: + service: + name: nullplatform-autocreate-placeholder + port: + number: 80 diff --git a/k8s/scope/tests/networking/autocreate_alb.bats b/k8s/scope/tests/networking/autocreate_alb.bats new file mode 100644 index 00000000..03e1db17 --- /dev/null +++ b/k8s/scope/tests/networking/autocreate_alb.bats @@ -0,0 +1,262 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for networking/autocreate_alb +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + log() { if [ "$1" = "error" ]; then echo "$2" >&2; else echo "$2"; fi; } + export -f log + + source "$PROJECT_ROOT/k8s/utils/get_config_value" + + export SCRIPT="$PROJECT_ROOT/k8s/scope/networking/autocreate_alb" + export REGION="us-east-1" + export INGRESS_VISIBILITY="internet-facing" + export K8S_NAMESPACE="test-ns" + export SERVICE_PATH="$PROJECT_ROOT/k8s" + export ALB_AUTOCREATE_TIMEOUT_SECONDS="2" + + export CONTEXT='{ + "scope": { "id": "scope-1", "slug": "scope-1" }, + "namespace": { "id": "ns-1", "slug": "ns-1" }, + "application": { "id": "app-1", "slug": "app-1" }, + "account": { "id": "acc-1", "slug": "acc-1" }, + "deployment": { "id": "dep-1" }, + "providers": { + "container-orchestration": {} + } + }' + + # Mocks: each test overrides as needed. + gomplate() { return 0; } + export -f gomplate + kubectl() { return 0; } + export -f kubectl + aws() { return 1; } + export -f aws + + # Tracks calls for assertions. + export CALL_LOG_FILE="$(mktemp)" +} + +teardown() { + unset -f log gomplate kubectl aws get_config_value + rm -f "$CALL_LOG_FILE" + unset AUTOCREATED_ALB_NAME +} + +# Records each invocation of a mocked binary into CALL_LOG_FILE so tests can +# assert against the sequence of calls. 
+record_call() { + echo "$@" >> "$CALL_LOG_FILE" +} + +# ============================================================================= +# Name generation +# ============================================================================= +@test "autocreate_alb: generates ALB name with prefix and visibility short form" { + # Mock AWS so describe-load-balancers reports active immediately. + aws() { + case "$*" in + *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; + *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/nullplatform-auto-public-abc123/x"; return 0 ;; + *add-tags*) return 0 ;; + *) return 1 ;; + esac + } + export -f aws + + source "$SCRIPT" + + # Length and prefix + [[ "$AUTOCREATED_ALB_NAME" =~ ^nullplatform-auto-public-[a-f0-9]{6}$ ]] +} + +@test "autocreate_alb: uses private short form for internal visibility" { + export INGRESS_VISIBILITY="internal" + aws() { + case "$*" in + *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; + *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *add-tags*) return 0 ;; + *) return 1 ;; + esac + } + export -f aws + + source "$SCRIPT" + + [[ "$AUTOCREATED_ALB_NAME" =~ ^nullplatform-auto-private-[a-f0-9]{6}$ ]] +} + +@test "autocreate_alb: respects custom name prefix from env" { + export ALB_AUTOCREATE_NAME_PREFIX="custom-prefix-" + aws() { + case "$*" in + *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; + *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *add-tags*) return 0 ;; + *) return 1 ;; + esac + } + export -f aws + + source "$SCRIPT" + + [[ "$AUTOCREATED_ALB_NAME" =~ ^custom-prefix-public- ]] +} + +# ============================================================================= +# Ingress dummy application +# 
============================================================================= +@test "autocreate_alb: renders and applies the dummy ingress before polling" { + gomplate() { record_call "gomplate $*"; return 0; } + export -f gomplate + kubectl() { record_call "kubectl $*"; return 0; } + export -f kubectl + aws() { + case "$*" in + *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; + *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *add-tags*) return 0 ;; + *) return 1 ;; + esac + } + export -f aws + + source "$SCRIPT" + + # Both gomplate and kubectl must have been invoked. + grep -q "gomplate" "$CALL_LOG_FILE" + grep -q "kubectl apply" "$CALL_LOG_FILE" +} + +@test "autocreate_alb: fails if gomplate render fails" { + gomplate() { return 1; } + export -f gomplate + + run bash -c 'source "$SCRIPT"' + + [ "$status" -ne 0 ] + assert_contains "$output" "Failed to render ingress-dummy template" +} + +@test "autocreate_alb: fails if kubectl apply fails" { + kubectl() { return 1; } + export -f kubectl + + run bash -c 'source "$SCRIPT"' + + [ "$status" -ne 0 ] + assert_contains "$output" "Failed to apply ingress-dummy" +} + +# ============================================================================= +# Polling for active state +# ============================================================================= +@test "autocreate_alb: returns success when ALB becomes active within timeout" { + # First describe call returns pending arn, then state=active. 
+ aws() { + case "$*" in + *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; + *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *add-tags*) return 0 ;; + *) return 1 ;; + esac + } + export -f aws + + source "$SCRIPT" + + [ -n "$AUTOCREATED_ALB_NAME" ] +} + +@test "autocreate_alb: exits non-zero when ALB never reaches active state (timeout)" { + export ALB_AUTOCREATE_TIMEOUT_SECONDS="1" + # Always return pending state, never 'active'. + aws() { + case "$*" in + *describe-load-balancers*--query*State.Code*) echo "provisioning"; return 0 ;; + *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *) return 1 ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + [ "$status" -ne 0 ] + assert_contains "$output" "Timed out" +} + +@test "autocreate_alb: exits non-zero when ALB reaches 'failed' state" { + aws() { + case "$*" in + *describe-load-balancers*--query*State.Code*) echo "failed"; return 0 ;; + *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *) return 1 ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + [ "$status" -ne 0 ] + assert_contains "$output" "reached state 'failed'" +} + +# ============================================================================= +# Tagging +# ============================================================================= +@test "autocreate_alb: tags the ALB with managed-by, visibility and scope-id" { + aws() { + case "$*" in + *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; + *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *add-tags*) + record_call "aws $*" + return 0 + ;; + *) return 1 ;; + esac + } + export -f aws + + source "$SCRIPT" + + grep -q "nullplatform:managed-by,Value=autocreate" "$CALL_LOG_FILE" + 
grep -q "nullplatform:visibility,Value=internet-facing" "$CALL_LOG_FILE" + grep -q "nullplatform:created-by-scope-id,Value=scope-1" "$CALL_LOG_FILE" +} + +@test "autocreate_alb: tagging failure does not fail the script (warn only)" { + aws() { + case "$*" in + *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; + *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *add-tags*) return 1 ;; + *) return 1 ;; + esac + } + export -f aws + + source "$SCRIPT" + + # Script still exports the new ALB name even though tagging warned. + [ -n "$AUTOCREATED_ALB_NAME" ] +} + +# ============================================================================= +# Timeout validation +# ============================================================================= +@test "autocreate_alb: rejects non-numeric timeout" { + export ALB_AUTOCREATE_TIMEOUT_SECONDS="abc" + + run bash -c 'source "$SCRIPT"' + + [ "$status" -ne 0 ] + assert_contains "$output" "must be a positive integer" +} diff --git a/k8s/scope/tests/networking/resolve_balancer.bats b/k8s/scope/tests/networking/resolve_balancer.bats index 676d8c67..fa35440a 100644 --- a/k8s/scope/tests/networking/resolve_balancer.bats +++ b/k8s/scope/tests/networking/resolve_balancer.bats @@ -368,7 +368,7 @@ mock_alb_rules() { run bash -c 'export LOG_LEVEL=debug; source "$SCRIPT"' - assert_contains "$output" "🔍 Additional balancers configured, resolving least-loaded ALB..." + assert_contains "$output" "🔍 Resolving least-loaded ALB across declared and autocreated candidates..." 
assert_contains "$output" "📋 Candidate balancers: co-balancer-public, alb-extra-1, alb-extra-2" } @@ -497,3 +497,214 @@ mock_alb_rules() { assert_contains "$output" "DNS type is 'external_dns', skipping Route53 lookup and load balancing" } + +# ============================================================================= +# Discovery of autocreated ALBs via tags +# ============================================================================= + +# Extends mock_alb_rules behavior to also serve a resourcegroupstaggingapi +# response listing autocreated ALBs as candidates. Pass "alb_name count" pairs +# as positional args; pass the discovered ALB names via the +# DISCOVERED_AUTOCREATED env var (space-separated). +mock_alb_rules_with_discovery() { + > "$MOCK_RULES_FILE" + for pair in "$@"; do + echo "$pair" >> "$MOCK_RULES_FILE" + done + local rules_file="$MOCK_RULES_FILE" + local discovered_arns="" + for name in $DISCOVERED_AUTOCREATED; do + discovered_arns="${discovered_arns}arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/${name}/abc"$'\t' + done + discovered_arns="${discovered_arns%$'\t'}" + + eval "aws() { + case \"\$*\" in + *list-resource-record-sets*) echo 'None'; return 0 ;; + *resourcegroupstaggingapi*get-resources*) + echo '${discovered_arns}' + return 0 + ;; + *describe-load-balancers*--names*) + local name='' + local prev='' + for arg in \"\$@\"; do + if [ \"\$prev\" = '--names' ]; then name=\"\$arg\"; fi + prev=\"\$arg\" + done + if ! 
grep -q \"^\${name} \" '${rules_file}' 2>/dev/null; then + return 1 + fi + echo \"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/\${name}/abc\" + return 0 + ;; + *describe-listeners*) + local lb_arn='' + local prev='' + for arg in \"\$@\"; do + if [ \"\$prev\" = '--load-balancer-arn' ]; then lb_arn=\"\$arg\"; fi + prev=\"\$arg\" + done + local alb_name=\$(echo \"\$lb_arn\" | sed 's|.*/app/||;s|/.*||') + echo \"arn:aws:elasticloadbalancing:us-east-1:123:listener/app/\${alb_name}/abc/def\" + return 0 + ;; + *describe-rules*) + local listener_arn='' + local prev='' + for arg in \"\$@\"; do + if [ \"\$prev\" = '--listener-arn' ]; then listener_arn=\"\$arg\"; fi + prev=\"\$arg\" + done + local alb_name=\$(echo \"\$listener_arn\" | sed 's|.*/app/||;s|/.*||') + local count=\$(grep \"^\${alb_name} \" '${rules_file}' | awk '{print \$2}') + if [ -z \"\$count\" ]; then + return 1 + fi + local rules='{\"Rules\": [{\"IsDefault\": true}' + local i=0 + while [ \$i -lt \$count ]; do + rules=\"\${rules}, {\\\"IsDefault\\\": false}\" + i=\$((i + 1)) + done + rules=\"\${rules}]}\" + echo \"\$rules\" + return 0 + ;; + *) return 1 ;; + esac + } + export -f aws" +} + +@test "resolve_balancer: discovers autocreated ALBs from tags and includes them as candidates" { + export INGRESS_VISIBILITY="internet-facing" + export DISCOVERED_AUTOCREATED="nullplatform-auto-public-aaaaaa" + mock_alb_rules_with_discovery "co-balancer-public 50" "nullplatform-auto-public-aaaaaa 5" + + source "$SCRIPT" + + assert_equal "$ALB_NAME" "nullplatform-auto-public-aaaaaa" +} + +@test "resolve_balancer: skips non-application LBs in discovery output" { + export INGRESS_VISIBILITY="internet-facing" + # Inject a network LB ARN that should be filtered out by the awk in get_autocreated_albs + aws() { + case "$*" in + *list-resource-record-sets*) echo "None"; return 0 ;; + *resourcegroupstaggingapi*get-resources*) + printf 'arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/net/some-nlb/abc\n' + return 
0 + ;; + *) return 1 ;; + esac + } + export -f aws + + source "$SCRIPT" + + # No additional candidates → falls back to default + assert_equal "$ALB_NAME" "co-balancer-public" +} + +# ============================================================================= +# Autocreate fallback when all candidates are over capacity +# ============================================================================= +@test "resolve_balancer: triggers autocreate when all candidates over threshold and feature enabled" { + export INGRESS_VISIBILITY="internet-facing" + export ALB_AUTOCREATE_ENABLED="true" + export ALB_MAX_CAPACITY="50" + export ALB_AUTOCREATE_TIMEOUT_SECONDS="2" + export ALB_AUTOCREATE_NAME_PREFIX="auto-" + export K8S_NAMESPACE="test-ns" + export SERVICE_PATH="$PROJECT_ROOT/k8s" + export CONTEXT=$(echo "$CONTEXT" | jq ' + .providers["scope-configurations"].networking.additional_public_balancers = ["alb-extra-1"] | + .scope.slug = "s" | + .scope.id = "scope-1" | + .namespace.slug = "ns" | + .namespace.id = "ns-1" | + .application.slug = "app" | + .application.id = "app-1" | + .account.slug = "acc" | + .account.id = "acc-1" | + .deployment.id = "dep-1" + ') + + # Both declared ALBs are above threshold; aws describe for autocreate + # returns "active" immediately so the new ALB name is exported. 
+ aws() { + case "$*" in + *list-resource-record-sets*) echo "None"; return 0 ;; + *resourcegroupstaggingapi*get-resources*) echo ""; return 0 ;; + *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; + *describe-load-balancers*--names*) + local name='' + local prev='' + for arg in "$@"; do + if [ "$prev" = "--names" ]; then name="$arg"; fi + prev="$arg" + done + echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/${name}/abc" + return 0 + ;; + *describe-listeners*) + local lb_arn='' + local prev='' + for arg in "$@"; do + if [ "$prev" = "--load-balancer-arn" ]; then lb_arn="$arg"; fi + prev="$arg" + done + local alb_name=$(echo "$lb_arn" | sed 's|.*/app/||;s|/.*||') + echo "arn:aws:elasticloadbalancing:us-east-1:123:listener/app/${alb_name}/abc/def" + return 0 + ;; + *describe-rules*) + # Both candidates report 60 rules (above threshold of 50) + echo '{"Rules":[{"IsDefault":true},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault"
:false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false}]}' + return 0 + ;; + *add-tags*) return 0 ;; + *) return 1 ;; + esac + } + export -f aws + gomplate() { return 0; } + export -f gomplate + kubectl() { return 0; } + export -f kubectl + + source "$SCRIPT" + + [[ "$ALB_NAME" =~ ^auto-public-[a-f0-9]{6}$ ]] +} + +@test "resolve_balancer: does not autocreate when feature disabled even if all candidates full" { + export INGRESS_VISIBILITY="internet-facing" + export ALB_AUTOCREATE_ENABLED="false" + export ALB_MAX_CAPACITY="50" + export CONTEXT=$(echo "$CONTEXT" | jq ' + .providers["scope-configurations"].networking.additional_public_balancers = ["alb-extra-1"] + ') + # Both above threshold but autocreate disabled → keeps least-loaded + mock_alb_rules "co-balancer-public 60" "alb-extra-1 55" + + source "$SCRIPT" + + assert_equal "$ALB_NAME" "alb-extra-1" +} + +@test "resolve_balancer: does not autocreate when at least one candidate below threshold" { + export INGRESS_VISIBILITY="internet-facing" + export ALB_AUTOCREATE_ENABLED="true" + export ALB_MAX_CAPACITY="50" + export CONTEXT=$(echo "$CONTEXT" | jq ' + .providers["scope-configurations"].networking.additional_public_balancers = ["alb-extra-1"] + ') + mock_alb_rules "co-balancer-public 60" "alb-extra-1 10" + + source "$SCRIPT" + + assert_equal "$ALB_NAME" "alb-extra-1" +} diff --git a/k8s/values.yaml b/k8s/values.yaml index d053bc0a..5700b427 100644 --- a/k8s/values.yaml +++ b/k8s/values.yaml @@ -19,6 +19,12 @@ configuration: ALB_MAX_LISTENERS: 48 ALB_METRICS_PUBLISH_ENABLED: false # ALB_METRICS_PUBLISH_TARGET: cloudwatch # Available values: cloudwatch | datadog + ALB_AUTOCREATE_ENABLED: false +# When true and every candidate ALB is at or above ALB_MAX_CAPACITY, the scope +# creation provisions a new ALB via a dummy Ingress and waits for it to become +# active. Previously-autocreated ALBs are auto-discovered by AWS tag. 
+ ALB_AUTOCREATE_NAME_PREFIX: nullplatform-auto- + ALB_AUTOCREATE_TIMEOUT_SECONDS: 300 DEPLOYMENT_MAX_WAIT_IN_SECONDS: 600 DEPLOYMENT_TEMPLATE: "$SERVICE_PATH/deployment/templates/deployment.yaml.tpl" SECRET_TEMPLATE: "$SERVICE_PATH/deployment/templates/secret.yaml.tpl" From 59821c36acb6e47fb152211f99d765cf36b298f9 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 27 May 2026 13:53:06 -0300 Subject: [PATCH 02/14] fix: harden autocreate input validation and merge AWS polling call --- k8s/scope/networking/autocreate_alb | 39 ++-- k8s/scope/networking/resolve_balancer | 17 +- .../tests/networking/autocreate_alb.bats | 61 +++-- .../tests/networking/resolve_balancer.bats | 210 ++++++++---------- 4 files changed, 168 insertions(+), 159 deletions(-) diff --git a/k8s/scope/networking/autocreate_alb b/k8s/scope/networking/autocreate_alb index 1b6ffacb..5c7395d0 100644 --- a/k8s/scope/networking/autocreate_alb +++ b/k8s/scope/networking/autocreate_alb @@ -45,7 +45,7 @@ generate_alb_name() { local suffix suffix=$(LC_ALL=C tr -dc 'a-f0-9' < /dev/urandom 2>/dev/null | head -c 6) || { - suffix=$(printf '%06x' $((RANDOM * RANDOM % 0xFFFFFF))) + suffix=$(printf '%06x' $((RANDOM * RANDOM % 16777215))) } echo "${prefix}${vis_short}-${suffix}" @@ -61,7 +61,8 @@ apply_dummy_ingress() { local tmp_context tmp_context=$(mktemp) || return 1 local tmp_yaml - tmp_yaml=$(mktemp --suffix=.yaml 2>/dev/null) || tmp_yaml="$(mktemp).yaml" + tmp_yaml=$(mktemp) || return 1 + trap 'rm -f "$tmp_context" "$tmp_yaml"' RETURN echo "$CONTEXT" | jq \ --arg alb_name "$alb_name" \ @@ -75,24 +76,19 @@ apply_dummy_ingress() { if ! gomplate -c .="$tmp_context" --file "$template_path" --out "$tmp_yaml" 2>/dev/null; then log error "❌ Failed to render ingress-dummy template" log error "📋 Template: $template_path" - rm -f "$tmp_context" "$tmp_yaml" return 1 fi if ! 
kubectl apply -f "$tmp_yaml" >/dev/null 2>&1; then log error "❌ Failed to apply ingress-dummy for ALB '$alb_name'" - log error "" log error "💡 Possible causes:" log error " The agent may lack permissions on Ingress resources or the namespace may not exist" - log error "" log error "🔧 How to fix:" log error " • kubectl apply -f $tmp_yaml" log error " • Verify namespace '$namespace' exists" - rm -f "$tmp_context" "$tmp_yaml" return 1 fi - rm -f "$tmp_context" "$tmp_yaml" return 0 } @@ -112,19 +108,16 @@ wait_for_alb_active() { log info "⏳ Waiting up to ${timeout_seconds}s for ALB '$alb_name' to become active..." while [ "$(date +%s)" -lt "$deadline" ]; do - alb_arn=$(aws elbv2 describe-load-balancers \ + local lb_json + lb_json=$(aws elbv2 describe-load-balancers \ --names "$alb_name" \ --region "$REGION" \ - --query 'LoadBalancers[0].LoadBalancerArn' \ - --output text 2>/dev/null) || alb_arn="" + --output json 2>/dev/null) || lb_json="" - if [ -n "$alb_arn" ] && [ "$alb_arn" != "None" ]; then - state=$(aws elbv2 describe-load-balancers \ - --names "$alb_name" \ - --region "$REGION" \ - --query 'LoadBalancers[0].State.Code' \ - --output text 2>/dev/null) || state="" + alb_arn=$(echo "$lb_json" | jq -r '.LoadBalancers[0].LoadBalancerArn // empty' 2>/dev/null) || alb_arn="" + state=$(echo "$lb_json" | jq -r '.LoadBalancers[0].State.Code // empty' 2>/dev/null) || state="" + if [ -n "$alb_arn" ] && [ "$alb_arn" != "None" ]; then log debug "📋 ALB '$alb_name' state: ${state:-pending}" if [ "$state" = "active" ]; then @@ -192,11 +185,23 @@ TIMEOUT_SECONDS=$(get_config_value \ --default "300" ) -if ! [[ "$TIMEOUT_SECONDS" =~ ^[0-9]+$ ]]; then +if ! [[ "$TIMEOUT_SECONDS" =~ ^[1-9][0-9]*$ ]]; then log error "❌ ALB_AUTOCREATE_TIMEOUT_SECONDS must be a positive integer, got: '$TIMEOUT_SECONDS'" return 1 fi +# Final ALB name is "<prefix><public|private>-<6 hex>", which must satisfy AWS +# ALB naming rules ([a-zA-Z0-9-], ≤32 chars) and the k8s metadata.name rules +# used by the dummy Ingress. 
14 = length of "private-" + 6 hex chars. +if ! [[ "$NAME_PREFIX" =~ ^[a-z0-9-]+$ ]]; then + log error "❌ ALB_AUTOCREATE_NAME_PREFIX must match ^[a-z0-9-]+$, got: '$NAME_PREFIX'" + return 1 +fi +if [ "${#NAME_PREFIX}" -gt 18 ]; then + log error "❌ ALB_AUTOCREATE_NAME_PREFIX must be ≤18 chars (AWS caps ALB names at 32, the visibility+hex suffix uses 14); got ${#NAME_PREFIX}" + return 1 +fi + SCOPE_ID=$(echo "$CONTEXT" | jq -r '.scope.id // "unknown"') NEW_ALB_NAME=$(generate_alb_name "$NAME_PREFIX" "$INGRESS_VISIBILITY") diff --git a/k8s/scope/networking/resolve_balancer b/k8s/scope/networking/resolve_balancer index 57ddbac3..2699f459 100755 --- a/k8s/scope/networking/resolve_balancer +++ b/k8s/scope/networking/resolve_balancer @@ -204,15 +204,15 @@ if [[ "$DNS_TYPE" == "route53" ]]; then # can include them in the pool without provider config changes. AUTOCREATED_BALANCERS=$(get_autocreated_albs "$INGRESS_VISIBILITY" 2>/dev/null) || AUTOCREATED_BALANCERS="" - HAS_ADDITIONAL=0 + HAS_ADDITIONAL=false if [ -n "$ADDITIONAL_BALANCERS" ] && [ "$ADDITIONAL_BALANCERS" != "null" ] && [ "$ADDITIONAL_BALANCERS" != "[]" ]; then - HAS_ADDITIONAL=1 + HAS_ADDITIONAL=true fi - if [ "$HAS_ADDITIONAL" -eq 1 ] || [ -n "$AUTOCREATED_BALANCERS" ]; then + if [ "$HAS_ADDITIONAL" = "true" ] || [ -n "$AUTOCREATED_BALANCERS" ]; then log debug "🔍 Resolving least-loaded ALB across declared and autocreated candidates..." - if [ "$HAS_ADDITIONAL" -eq 1 ]; then + if [ "$HAS_ADDITIONAL" = "true" ]; then DECLARED_LIST=$(echo "$ADDITIONAL_BALANCERS" | jq -r --arg base "$ALB_NAME" '[$base] + . | .[]') else DECLARED_LIST="$ALB_NAME" @@ -262,7 +262,14 @@ if [[ "$DNS_TYPE" == "route53" ]]; then --default "75" ) - if [ "$AUTOCREATE_ENABLED" = "true" ] && [ "$MIN_RULES" -ge 0 ] && [ "$MIN_RULES" -ge "$MAX_CAPACITY" ]; then + # Without this guard a non-numeric MAX_CAPACITY would silently disable the + # autocreate trigger (the `-ge` comparison errors out and evaluates false). + if ! 
[[ "$MAX_CAPACITY" =~ ^[0-9]+$ ]]; then + log warn "⚠️ ALB_MAX_CAPACITY must be numeric, got: '$MAX_CAPACITY' — skipping autocreate evaluation" + MAX_CAPACITY="" + fi + + if [ "$AUTOCREATE_ENABLED" = "true" ] && [ -n "$MAX_CAPACITY" ] && [ "$MIN_RULES" -ge 0 ] && [ "$MIN_RULES" -ge "$MAX_CAPACITY" ]; then log info "🔧 All candidate ALBs are at or above capacity ($MIN_RULES/$MAX_CAPACITY); triggering autocreate" source "$_RESOLVE_BALANCER_DIR/autocreate_alb" || { log error "❌ Autocreate failed; ALB '$ALB_NAME' will be used and validate_alb_capacity will reject the deployment" diff --git a/k8s/scope/tests/networking/autocreate_alb.bats b/k8s/scope/tests/networking/autocreate_alb.bats index 03e1db17..c5c47aa2 100644 --- a/k8s/scope/tests/networking/autocreate_alb.bats +++ b/k8s/scope/tests/networking/autocreate_alb.bats @@ -61,8 +61,7 @@ record_call() { # Mock AWS so describe-load-balancers reports active immediately. aws() { case "$*" in - *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; - *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/nullplatform-auto-public-abc123/x"; return 0 ;; + *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/nullplatform-auto-public-abc123/x","State":{"Code":"active"}}]}'; return 0 ;; *add-tags*) return 0 ;; *) return 1 ;; esac @@ -79,8 +78,7 @@ record_call() { export INGRESS_VISIBILITY="internal" aws() { case "$*" in - *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; - *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; *add-tags*) return 0 ;; *) return 1 ;; esac @@ -96,8 +94,7 @@ record_call() { export 
ALB_AUTOCREATE_NAME_PREFIX="custom-prefix-" aws() { case "$*" in - *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; - *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; *add-tags*) return 0 ;; *) return 1 ;; esac @@ -119,8 +116,7 @@ record_call() { export -f kubectl aws() { case "$*" in - *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; - *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; *add-tags*) return 0 ;; *) return 1 ;; esac @@ -158,11 +154,10 @@ record_call() { # Polling for active state # ============================================================================= @test "autocreate_alb: returns success when ALB becomes active within timeout" { - # First describe call returns pending arn, then state=active. + # describe-load-balancers returns active state immediately. aws() { case "$*" in - *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; - *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; *add-tags*) return 0 ;; *) return 1 ;; esac @@ -176,11 +171,10 @@ record_call() { @test "autocreate_alb: exits non-zero when ALB never reaches active state (timeout)" { export ALB_AUTOCREATE_TIMEOUT_SECONDS="1" - # Always return pending state, never 'active'. 
+ # Always return provisioning state, never 'active'. aws() { case "$*" in - *describe-load-balancers*--query*State.Code*) echo "provisioning"; return 0 ;; - *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"provisioning"}}]}'; return 0 ;; *) return 1 ;; esac } @@ -195,8 +189,7 @@ record_call() { @test "autocreate_alb: exits non-zero when ALB reaches 'failed' state" { aws() { case "$*" in - *describe-load-balancers*--query*State.Code*) echo "failed"; return 0 ;; - *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"failed"}}]}'; return 0 ;; *) return 1 ;; esac } @@ -214,8 +207,7 @@ record_call() { @test "autocreate_alb: tags the ALB with managed-by, visibility and scope-id" { aws() { case "$*" in - *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; - *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; *add-tags*) record_call "aws $*" return 0 @@ -235,8 +227,7 @@ record_call() { @test "autocreate_alb: tagging failure does not fail the script (warn only)" { aws() { case "$*" in - *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; - *describe-load-balancers*) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y"; return 0 ;; + *describe-load-balancers*) echo 
'{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; *add-tags*) return 1 ;; *) return 1 ;; esac @@ -260,3 +251,33 @@ record_call() { [ "$status" -ne 0 ] assert_contains "$output" "must be a positive integer" } + +# ============================================================================= +# Name prefix validation +# ============================================================================= +@test "autocreate_alb: rejects prefix containing uppercase" { + export ALB_AUTOCREATE_NAME_PREFIX="Bad-Prefix-" + + run bash -c 'source "$SCRIPT"' + + [ "$status" -ne 0 ] + assert_contains "$output" "must match" +} + +@test "autocreate_alb: rejects prefix containing colon (YAML injection vector)" { + export ALB_AUTOCREATE_NAME_PREFIX="bad:prefix" + + run bash -c 'source "$SCRIPT"' + + [ "$status" -ne 0 ] + assert_contains "$output" "must match" +} + +@test "autocreate_alb: rejects prefix longer than 18 chars" { + export ALB_AUTOCREATE_NAME_PREFIX="this-prefix-is-way-too-long-" + + run bash -c 'source "$SCRIPT"' + + [ "$status" -ne 0 ] + assert_contains "$output" "18 chars" +} diff --git a/k8s/scope/tests/networking/resolve_balancer.bats b/k8s/scope/tests/networking/resolve_balancer.bats index fa35440a..c37888c1 100644 --- a/k8s/scope/tests/networking/resolve_balancer.bats +++ b/k8s/scope/tests/networking/resolve_balancer.bats @@ -84,75 +84,79 @@ mock_route53_alb() { export -f aws" } +# Handles the three elbv2 describe-* calls used by get_alb_rule_count. +# Reads rule counts from MOCK_RULES_FILE ("alb_name count" lines). +# Returns non-zero when an ALB name is not found in MOCK_RULES_FILE. +_mock_aws_elbv2_rule_count() { + case "$*" in + *describe-load-balancers*--names*) + local name="" prev="" + for arg in "$@"; do + if [ "$prev" = "--names" ]; then name="$arg"; fi + prev="$arg" + done + if ! 
grep -q "^${name} " "$MOCK_RULES_FILE" 2>/dev/null; then + return 1 + fi + echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/${name}/abc" + ;; + *describe-listeners*) + local lb_arn="" prev="" + for arg in "$@"; do + if [ "$prev" = "--load-balancer-arn" ]; then lb_arn="$arg"; fi + prev="$arg" + done + local alb_name + alb_name=$(echo "$lb_arn" | sed 's|.*/app/||;s|/.*||') + echo "arn:aws:elasticloadbalancing:us-east-1:123:listener/app/${alb_name}/abc/def" + ;; + *describe-rules*) + local listener_arn="" prev="" + for arg in "$@"; do + if [ "$prev" = "--listener-arn" ]; then listener_arn="$arg"; fi + prev="$arg" + done + local alb_name + alb_name=$(echo "$listener_arn" | sed 's|.*/app/||;s|/.*||') + local count + count=$(grep "^${alb_name} " "$MOCK_RULES_FILE" | awk '{print $2}') + if [ -z "$count" ]; then + return 1 + fi + local rules='{"Rules": [{"IsDefault": true}' + local i=0 + while [ "$i" -lt "$count" ]; do + rules="${rules}, {\"IsDefault\": false}" + i=$((i + 1)) + done + echo "${rules}]}" + ;; + *) + return 1 + ;; + esac +} +export -f _mock_aws_elbv2_rule_count + # Sets up aws mock with no Route53 record but with rule counts for ALBs. -# Write rule counts to MOCK_RULES_FILE as "alb_name count" lines. -# The mock returns the ALB ARN with the name embedded so describe-rules -# can look up the correct rule count. +# Write rule counts to MOCK_RULES_FILE as "alb_name count" lines before calling. +# The mock returns ARNs with the name embedded so describe-rules can look up counts. mock_alb_rules() { > "$MOCK_RULES_FILE" for pair in "$@"; do echo "$pair" >> "$MOCK_RULES_FILE" done - local rules_file="$MOCK_RULES_FILE" - eval "aws() { - case \"\$*\" in - *list-resource-record-sets*) - echo 'None' - return 0 - ;; - *describe-load-balancers*--names*) - local name='' - local prev='' - for arg in \"\$@\"; do - if [ \"\$prev\" = '--names' ]; then name=\"\$arg\"; fi - prev=\"\$arg\" - done - if ! 
grep -q \"^\${name} \" '${rules_file}' 2>/dev/null; then - return 1 - fi - echo \"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/\${name}/abc\" - return 0 - ;; - *describe-listeners*) - local lb_arn='' - local prev='' - for arg in \"\$@\"; do - if [ \"\$prev\" = '--load-balancer-arn' ]; then lb_arn=\"\$arg\"; fi - prev=\"\$arg\" - done - local alb_name=\$(echo \"\$lb_arn\" | sed 's|.*/app/||;s|/.*||') - echo \"arn:aws:elasticloadbalancing:us-east-1:123:listener/app/\${alb_name}/abc/def\" - return 0 - ;; - *describe-rules*) - local listener_arn='' - local prev='' - for arg in \"\$@\"; do - if [ \"\$prev\" = '--listener-arn' ]; then listener_arn=\"\$arg\"; fi - prev=\"\$arg\" - done - local alb_name=\$(echo \"\$listener_arn\" | sed 's|.*/app/||;s|/.*||') - local count=\$(grep \"^\${alb_name} \" '${rules_file}' | awk '{print \$2}') - if [ -z \"\$count\" ]; then - return 1 - fi - local rules='{\"Rules\": [{\"IsDefault\": true}' - local i=0 - while [ \$i -lt \$count ]; do - rules=\"\${rules}, {\\\"IsDefault\\\": false}\" - i=\$((i + 1)) - done - rules=\"\${rules}]}\" - echo \"\$rules\" - return 0 - ;; - *) - return 1 + aws() { + case "$*" in + *list-resource-record-sets*) echo "None" ;; + *describe-load-balancers*--names* | *describe-listeners* | *describe-rules*) + _mock_aws_elbv2_rule_count "$@" ;; + *) return 1 ;; esac } - export -f aws" + export -f aws } # ============================================================================= @@ -502,16 +506,13 @@ mock_alb_rules() { # Discovery of autocreated ALBs via tags # ============================================================================= -# Extends mock_alb_rules behavior to also serve a resourcegroupstaggingapi -# response listing autocreated ALBs as candidates. Pass "alb_name count" pairs -# as positional args; pass the discovered ALB names via the -# DISCOVERED_AUTOCREATED env var (space-separated). 
+# Extends mock_alb_rules to also serve a resourcegroupstaggingapi response +# listing autocreated ALBs as candidates. Pass "alb_name count" pairs as +# positional args; set DISCOVERED_AUTOCREATED (space-separated names) before +# calling to control which ALBs the tag-discovery API returns. mock_alb_rules_with_discovery() { - > "$MOCK_RULES_FILE" - for pair in "$@"; do - echo "$pair" >> "$MOCK_RULES_FILE" - done - local rules_file="$MOCK_RULES_FILE" + mock_alb_rules "$@" + local discovered_arns="" for name in $DISCOVERED_AUTOCREATED; do discovered_arns="${discovered_arns}arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/${name}/abc"$'\t' @@ -520,56 +521,13 @@ mock_alb_rules_with_discovery() { eval "aws() { case \"\$*\" in - *list-resource-record-sets*) echo 'None'; return 0 ;; *resourcegroupstaggingapi*get-resources*) echo '${discovered_arns}' return 0 ;; - *describe-load-balancers*--names*) - local name='' - local prev='' - for arg in \"\$@\"; do - if [ \"\$prev\" = '--names' ]; then name=\"\$arg\"; fi - prev=\"\$arg\" - done - if ! 
grep -q \"^\${name} \" '${rules_file}' 2>/dev/null; then - return 1 - fi - echo \"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/\${name}/abc\" - return 0 - ;; - *describe-listeners*) - local lb_arn='' - local prev='' - for arg in \"\$@\"; do - if [ \"\$prev\" = '--load-balancer-arn' ]; then lb_arn=\"\$arg\"; fi - prev=\"\$arg\" - done - local alb_name=\$(echo \"\$lb_arn\" | sed 's|.*/app/||;s|/.*||') - echo \"arn:aws:elasticloadbalancing:us-east-1:123:listener/app/\${alb_name}/abc/def\" - return 0 - ;; - *describe-rules*) - local listener_arn='' - local prev='' - for arg in \"\$@\"; do - if [ \"\$prev\" = '--listener-arn' ]; then listener_arn=\"\$arg\"; fi - prev=\"\$arg\" - done - local alb_name=\$(echo \"\$listener_arn\" | sed 's|.*/app/||;s|/.*||') - local count=\$(grep \"^\${alb_name} \" '${rules_file}' | awk '{print \$2}') - if [ -z \"\$count\" ]; then - return 1 - fi - local rules='{\"Rules\": [{\"IsDefault\": true}' - local i=0 - while [ \$i -lt \$count ]; do - rules=\"\${rules}, {\\\"IsDefault\\\": false}\" - i=\$((i + 1)) - done - rules=\"\${rules}]}\" - echo \"\$rules\" - return 0 + *list-resource-record-sets*) echo 'None' ;; + *describe-load-balancers*--names* | *describe-listeners* | *describe-rules*) + _mock_aws_elbv2_rule_count \"\$@\" ;; *) return 1 ;; esac @@ -638,7 +596,6 @@ mock_alb_rules_with_discovery() { case "$*" in *list-resource-record-sets*) echo "None"; return 0 ;; *resourcegroupstaggingapi*get-resources*) echo ""; return 0 ;; - *describe-load-balancers*--query*State.Code*) echo "active"; return 0 ;; *describe-load-balancers*--names*) local name='' local prev='' @@ -646,8 +603,12 @@ mock_alb_rules_with_discovery() { if [ "$prev" = "--names" ]; then name="$arg"; fi prev="$arg" done - echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/${name}/abc" - return 0 + # Return JSON when called with --output json (autocreate polling), + # or a plain ARN when called with --output text (rule count lookup). 
+ case "$*" in + *--output*json*) echo "{\"LoadBalancers\":[{\"LoadBalancerArn\":\"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/${name}/abc\",\"State\":{\"Code\":\"active\"}}]}"; return 0 ;; + *) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/${name}/abc"; return 0 ;; + esac ;; *describe-listeners*) local lb_arn='' @@ -708,3 +669,18 @@ mock_alb_rules_with_discovery() { assert_equal "$ALB_NAME" "alb-extra-1" } + +@test "resolve_balancer: warns and skips autocreate when ALB_MAX_CAPACITY is non-numeric" { + export INGRESS_VISIBILITY="internet-facing" + export ALB_AUTOCREATE_ENABLED="true" + export ALB_MAX_CAPACITY="not-a-number" + export CONTEXT=$(echo "$CONTEXT" | jq ' + .providers["scope-configurations"].networking.additional_public_balancers = ["alb-extra-1"] + ') + mock_alb_rules "co-balancer-public 60" "alb-extra-1 55" + + run bash -c 'source "$SCRIPT"' + + [ "$status" -eq 0 ] + assert_contains "$output" "ALB_MAX_CAPACITY must be numeric" +} From 56152e50b9162052f640fe3cddc9cb1c5b2d9ca7 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Mon, 8 Jun 2026 14:32:04 -0300 Subject: [PATCH 03/14] refactor: address PR review feedback on autocreate flow --- k8s/scope/build_context | 15 +- k8s/scope/networking/autocreate_alb | 254 ++++++-------- k8s/scope/networking/resolve_balancer | 97 ++---- k8s/scope/networking/wait_for_alb | 86 +++++ k8s/scope/templates/ingress-dummy.yaml.tpl | 14 +- .../tests/networking/autocreate_alb.bats | 316 ++++++++---------- .../tests/networking/resolve_balancer.bats | 162 +++------ k8s/scope/tests/networking/wait_for_alb.bats | 140 ++++++++ k8s/scope/workflows/create.yaml | 14 + 9 files changed, 586 insertions(+), 512 deletions(-) create mode 100644 k8s/scope/networking/wait_for_alb create mode 100644 k8s/scope/tests/networking/wait_for_alb.bats diff --git a/k8s/scope/build_context b/k8s/scope/build_context index 8328eab6..548ca319 100755 --- a/k8s/scope/build_context +++ b/k8s/scope/build_context @@ -208,6 
+208,13 @@ K8S_MODIFIERS=$(get_config_value \ ) K8S_MODIFIERS=$(echo "$K8S_MODIFIERS" | jq .) +OUTPUT_DIR="$SERVICE_PATH/output/$SCOPE_ID" +if [ -n "${NP_OUTPUT_DIR:-}" ]; then + OUTPUT_DIR="$NP_OUTPUT_DIR/output/$SCOPE_ID" +fi +mkdir -p "$OUTPUT_DIR" +export OUTPUT_DIR + source "$SCRIPT_DIR/networking/resolve_balancer" NAMESPACE_SLUG=$(echo "$CONTEXT" | jq -r .namespace.slug) @@ -227,15 +234,7 @@ CONTEXT=$(echo "$CONTEXT" | jq \ --argjson modifiers "$K8S_MODIFIERS" \ '. + {ingress_visibility: $ingress_visibility, k8s_namespace: $k8s_namespace, gateway_name: $gateway_name, region: $region, k8s_modifiers: $modifiers, alb_name: $alb_name, component: $component}') -OUTPUT_DIR="$SERVICE_PATH/output/$SCOPE_ID" -if [ -n "${NP_OUTPUT_DIR:-}" ]; then - OUTPUT_DIR="$NP_OUTPUT_DIR/output/$SCOPE_ID" -fi - -export OUTPUT_DIR export CONTEXT export REGION -mkdir -p "$OUTPUT_DIR" - log info "✅ Scope context built successfully" diff --git a/k8s/scope/networking/autocreate_alb b/k8s/scope/networking/autocreate_alb index 5c7395d0..aea35f6d 100644 --- a/k8s/scope/networking/autocreate_alb +++ b/k8s/scope/networking/autocreate_alb @@ -1,37 +1,32 @@ #!/bin/bash -# Creates a new ALB on demand when the existing pool is exhausted. +# Provisions a new ALB on demand when the existing pool is exhausted. # -# The script applies a dummy Ingress that the AWS Load Balancer Controller -# uses as the trigger to provision a new ALB. It then polls AWS until the -# ALB reaches the `active` state before returning the new ALB name to the -# caller via the AUTOCREATED_ALB_NAME output variable. +# Workflow contract: +# 1. Generates a unique ALB name. +# 2. Patches the container-orchestration provider via `np provider patch` so +# the new ALB is recorded as an additional balancer. Subsequent scope +# creations that read the provider see the new ALB and re-use it instead +# of triggering another autocreate. +# 3. Renders the dummy-ingress template to $OUTPUT_DIR. 
The next workflow +# step (apply_templates) applies it; that is what triggers the AWS Load +# Balancer Controller to actually provision the ALB. +# 4. Exports ALB_NAME (the new name) and ALB_AUTOCREATED=true so the wait +# step downstream knows to poll for active state and tag the ALB. # # Inputs (env vars): -# CONTEXT - Scope CONTEXT JSON (with .scope, .k8s_namespace, etc.) +# CONTEXT - Scope CONTEXT JSON # INGRESS_VISIBILITY - "internet-facing" or "internal" -# REGION - AWS region -# ALB_AUTOCREATE_NAME_PREFIX - Prefix for the new ALB (default: nullplatform-auto-) -# ALB_AUTOCREATE_TIMEOUT_SECONDS - Max seconds to wait for ALB active (default: 300) -# K8S_NAMESPACE - Namespace to place the dummy Ingress in +# K8S_NAMESPACE - Namespace for the dummy Ingress +# OUTPUT_DIR - Where the rendered YAML is written +# ALB_AUTOCREATE_NAME_PREFIX - Optional prefix for the new ALB # # Outputs (env vars): -# AUTOCREATED_ALB_NAME - Name of the ALB that was created +# ALB_NAME - Replaced with the new ALB name +# ALB_AUTOCREATED - Set to "true" _AUTOCREATE_ALB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -if ! type -t log >/dev/null 2>&1; then - source "$_AUTOCREATE_ALB_DIR/../../logging" -fi - -if ! type -t get_config_value >/dev/null 2>&1; then - source "$_AUTOCREATE_ALB_DIR/../../utils/get_config_value" -fi - -# Generates a unique-enough ALB name for autocreation. -# Format: -<6-hex> -# Visibility short: "public" for internet-facing, "private" for internal. -# Total length kept under 32 chars (AWS ALB name limit). generate_alb_name() { local prefix="$1" local visibility="$2" @@ -51,122 +46,103 @@ generate_alb_name() { echo "${prefix}${vis_short}-${suffix}" } -# Renders the dummy Ingress template and applies it to the cluster. -# Returns 0 on success, non-zero on failure (template render or kubectl apply). 
-apply_dummy_ingress() { +# Reads the container-orchestration provider id and current additional balancer +# list for the requested visibility, then patches the provider to append the new +# ALB name. Surfaces the existing balancer.* attributes so the patch is a +# merge of the full balancer object, not a destructive replacement. +register_alb_in_provider() { + local new_alb_name="$1" + local visibility="$2" + local nrn + nrn=$(echo "$CONTEXT" | jq -r '.scope.nrn // empty') + if [ -z "$nrn" ]; then + log error "❌ Could not read scope NRN from CONTEXT — cannot patch provider" + exit 1 + fi + + local provider_json + provider_json=$(np provider list --categories container-orchestration --nrn "$nrn" --format json 2>/dev/null) || { + log error "❌ Failed to list container-orchestration provider for NRN '$nrn'" + exit 1 + } + + local provider_id + provider_id=$(echo "$provider_json" | jq -r '.results[0].id // empty') + if [ -z "$provider_id" ]; then + log error "❌ No container-orchestration provider found for NRN '$nrn'" + exit 1 + fi + + local field + if [ "$visibility" = "internet-facing" ]; then + field="additional_public_names" + else + field="additional_private_names" + fi + + # Merge: current balancer.* + appended ALB in the right field. + local patch_body + patch_body=$(echo "$provider_json" | jq -c \ + --arg field "$field" \ + --arg new_alb "$new_alb_name" \ + '{ + attributes: { + balancer: ( + (.results[0].attributes.balancer // {}) as $bal | + $bal + { ($field): (($bal[$field] // []) + [$new_alb] | unique) } + ) + } + }') + + log info "📝 Registering ALB '$new_alb_name' in container-orchestration provider ($field)" + if ! 
np provider patch --id "$provider_id" --body "$patch_body" --no-output 2>/dev/null; then + log error "❌ Failed to patch container-orchestration provider with new ALB" + log error "💡 Possible causes: agent lacks write permission on the provider, or NP_TOKEN/NULLPLATFORM_API_KEY is missing" + exit 1 + fi +} + +render_dummy_ingress() { local alb_name="$1" local visibility="$2" local namespace="$3" + if [ -z "${OUTPUT_DIR:-}" ]; then + log error "❌ OUTPUT_DIR is not set — autocreate_alb must run after OUTPUT_DIR is exported" + exit 1 + fi + mkdir -p "$OUTPUT_DIR" + local tmp_context - tmp_context=$(mktemp) || return 1 - local tmp_yaml - tmp_yaml=$(mktemp) || return 1 - trap 'rm -f "$tmp_context" "$tmp_yaml"' RETURN + tmp_context=$(mktemp) + trap 'rm -f "$tmp_context"' RETURN + + local base_domain + base_domain=$(get_config_value \ + --env DOMAIN \ + --provider '.providers["scope-configurations"].networking.base_domain' \ + --provider '.providers["container-orchestration"].networking.base_domain' \ + --default "nullapps.io" + ) echo "$CONTEXT" | jq \ --arg alb_name "$alb_name" \ --arg ingress_visibility "$visibility" \ --arg k8s_namespace "$namespace" \ - '. + {alb_name: $alb_name, ingress_visibility: $ingress_visibility, k8s_namespace: $k8s_namespace}' \ + --arg base_domain "$base_domain" \ + '. + {alb_name: $alb_name, ingress_visibility: $ingress_visibility, k8s_namespace: $k8s_namespace, base_domain: $base_domain}' \ > "$tmp_context" local template_path="${INGRESS_DUMMY_TEMPLATE:-$SERVICE_PATH/scope/templates/ingress-dummy.yaml.tpl}" + local out_path="$OUTPUT_DIR/ingress-dummy-${alb_name}.yaml" - if ! gomplate -c .="$tmp_context" --file "$template_path" --out "$tmp_yaml" 2>/dev/null; then + if ! gomplate -c .="$tmp_context" --file "$template_path" --out "$out_path"; then log error "❌ Failed to render ingress-dummy template" log error "📋 Template: $template_path" - return 1 - fi - - if ! 
kubectl apply -f "$tmp_yaml" >/dev/null 2>&1; then - log error "❌ Failed to apply ingress-dummy for ALB '$alb_name'" - log error "💡 Possible causes:" - log error " The agent may lack permissions on Ingress resources or the namespace may not exist" - log error "🔧 How to fix:" - log error " • kubectl apply -f $tmp_yaml" - log error " • Verify namespace '$namespace' exists" - return 1 + exit 1 fi - return 0 -} - -# Polls AWS until the ALB reaches the 'active' state, applying the autocreate -# tags once it is reachable. Returns 0 if active within timeout, 1 otherwise. -wait_for_alb_active() { - local alb_name="$1" - local timeout_seconds="$2" - local scope_id="$3" - local visibility="$4" - - local deadline=$(($(date +%s) + timeout_seconds)) - local poll_interval=10 - local state="" - local alb_arn="" - - log info "⏳ Waiting up to ${timeout_seconds}s for ALB '$alb_name' to become active..." - - while [ "$(date +%s)" -lt "$deadline" ]; do - local lb_json - lb_json=$(aws elbv2 describe-load-balancers \ - --names "$alb_name" \ - --region "$REGION" \ - --output json 2>/dev/null) || lb_json="" - - alb_arn=$(echo "$lb_json" | jq -r '.LoadBalancers[0].LoadBalancerArn // empty' 2>/dev/null) || alb_arn="" - state=$(echo "$lb_json" | jq -r '.LoadBalancers[0].State.Code // empty' 2>/dev/null) || state="" - - if [ -n "$alb_arn" ] && [ "$alb_arn" != "None" ]; then - log debug "📋 ALB '$alb_name' state: ${state:-pending}" - - if [ "$state" = "active" ]; then - log info "✅ ALB '$alb_name' is active" - tag_alb "$alb_arn" "$scope_id" "$visibility" || true - return 0 - fi - - if [ "$state" = "failed" ]; then - log error "❌ ALB '$alb_name' reached state 'failed'" - return 1 - fi - fi - - sleep "$poll_interval" - done - - log error "❌ Timed out after ${timeout_seconds}s waiting for ALB '$alb_name' to become active" - log error "" - log error "💡 Possible causes:" - log error " The AWS Load Balancer Controller may be slow, mis-configured, or the AWS account may be hitting an ALB quota" - log 
error "" - log error "🔧 How to fix:" - log error " • Check controller logs: kubectl -n kube-system logs deploy/aws-load-balancer-controller" - log error " • Verify ALB quota: aws service-quotas get-service-quota --service-code elasticloadbalancing --quota-code L-53DA6B97" - log error " • Inspect the dummy Ingress: kubectl get ingress -A -l alb_name=$alb_name" - return 1 -} - -# Tags the ALB so resolve_balancer can discover it in subsequent scope creations. -tag_alb() { - local alb_arn="$1" - local scope_id="$2" - local visibility="$3" - - aws elbv2 add-tags \ - --resource-arns "$alb_arn" \ - --region "$REGION" \ - --tags \ - "Key=nullplatform:managed-by,Value=autocreate" \ - "Key=nullplatform:visibility,Value=$visibility" \ - "Key=nullplatform:created-by-scope-id,Value=$scope_id" \ - >/dev/null 2>&1 || { - log warn "⚠️ Could not tag ALB; subsequent discovery may not find it" - return 1 - } - - log debug "📋 Tagged ALB with nullplatform:managed-by=autocreate" - return 0 + log debug "📝 Rendered dummy ingress to $out_path" } # ============================================================================= @@ -179,40 +155,12 @@ NAME_PREFIX=$(get_config_value \ --default "nullplatform-auto-" ) -TIMEOUT_SECONDS=$(get_config_value \ - --env ALB_AUTOCREATE_TIMEOUT_SECONDS \ - --provider '.providers["container-orchestration"].balancer.autocreate_timeout_seconds' \ - --default "300" -) - -if ! [[ "$TIMEOUT_SECONDS" =~ ^[1-9][0-9]*$ ]]; then - log error "❌ ALB_AUTOCREATE_TIMEOUT_SECONDS must be a positive integer, got: '$TIMEOUT_SECONDS'" - return 1 -fi - -# Final ALB name is "-<6 hex>", which must satisfy AWS -# ALB naming rules ([a-zA-Z0-9-], ≤32 chars) and the k8s metadata.name rules -# used by the dummy Ingress. 14 = length of "private-" + 6 hex chars. -if ! 
[[ "$NAME_PREFIX" =~ ^[a-z0-9-]+$ ]]; then - log error "❌ ALB_AUTOCREATE_NAME_PREFIX must match ^[a-z0-9-]+$, got: '$NAME_PREFIX'" - return 1 -fi -if [ "${#NAME_PREFIX}" -gt 18 ]; then - log error "❌ ALB_AUTOCREATE_NAME_PREFIX must be ≤18 chars (AWS caps ALB names at 32, the visibility+hex suffix uses 14); got ${#NAME_PREFIX}" - return 1 -fi - -SCOPE_ID=$(echo "$CONTEXT" | jq -r '.scope.id // "unknown"') - NEW_ALB_NAME=$(generate_alb_name "$NAME_PREFIX" "$INGRESS_VISIBILITY") log info "🔧 Autocreating ALB '$NEW_ALB_NAME' (visibility=$INGRESS_VISIBILITY)" -if ! apply_dummy_ingress "$NEW_ALB_NAME" "$INGRESS_VISIBILITY" "$K8S_NAMESPACE"; then - return 1 -fi +register_alb_in_provider "$NEW_ALB_NAME" "$INGRESS_VISIBILITY" -if ! wait_for_alb_active "$NEW_ALB_NAME" "$TIMEOUT_SECONDS" "$SCOPE_ID" "$INGRESS_VISIBILITY"; then - return 1 -fi +render_dummy_ingress "$NEW_ALB_NAME" "$INGRESS_VISIBILITY" "$K8S_NAMESPACE" -export AUTOCREATED_ALB_NAME="$NEW_ALB_NAME" +export ALB_NAME="$NEW_ALB_NAME" +export ALB_AUTOCREATED="true" diff --git a/k8s/scope/networking/resolve_balancer b/k8s/scope/networking/resolve_balancer index 2699f459..86278922 100755 --- a/k8s/scope/networking/resolve_balancer +++ b/k8s/scope/networking/resolve_balancer @@ -23,27 +23,48 @@ _RESOLVE_BALANCER_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -if ! type -t log >/dev/null 2>&1; then - source "$_RESOLVE_BALANCER_DIR/../../logging" -fi - -if ! type -t get_config_value >/dev/null 2>&1; then - source "$_RESOLVE_BALANCER_DIR/../../utils/get_config_value" -fi - # Queries AWS ELBv2 to count listener rules on an ALB's HTTPS (443) listener. # The default rule is excluded since it always exists. +# +# Special case for autocreate race: an ALB just registered in the provider by +# a concurrent scope creation may not yet be visible to AWS APIs while the +# Load Balancer Controller is still provisioning it. 
Treat +# LoadBalancerNotFound as "0 rules", so the in-flight ALB is selected as the +# least-loaded candidate and no second autocreate is triggered. +# # Usage: get_alb_rule_count # Returns: integer rule count on stdout, non-zero exit on failure get_alb_rule_count() { local alb_name="$1" - local alb_arn + local alb_arn aws_stderr aws_exit + aws_stderr=$(aws elbv2 describe-load-balancers \ + --names "$alb_name" \ + --region "$REGION" \ + --query 'LoadBalancers[0].LoadBalancerArn' \ + --output text 2>&1 >/dev/null) + aws_exit=$? + + if [ "$aws_exit" -ne 0 ]; then + case "$aws_stderr" in + *LoadBalancerNotFound*) + # Concurrent autocreate race: the ALB was just registered in the + # provider by another scope but is not yet visible to AWS APIs. Treat + # as 0 rules so the in-flight ALB wins selection and we don't trigger + # another autocreate. + log debug "📋 ALB '$alb_name' not yet visible in AWS (likely being provisioned); treating as 0 rules" >&2 + echo 0 + return 0 + ;; + esac + return 1 + fi + alb_arn=$(aws elbv2 describe-load-balancers \ --names "$alb_name" \ --region "$REGION" \ --query 'LoadBalancers[0].LoadBalancerArn' \ - --output text 2>/dev/null) || return 1 + --output text 2>/dev/null) if [ -z "$alb_arn" ] || [ "$alb_arn" = "None" ]; then return 1 @@ -124,32 +145,6 @@ get_alb_from_route53() { echo "$alb_name" } -# Lists ALB names tagged as autocreated for the given visibility. Used to keep -# previously-autocreated ALBs in the candidate pool without requiring them to -# be declared in the provider configuration. -# Returns space-separated names on stdout (may be empty), non-zero on AWS failure. 
-get_autocreated_albs() { - local visibility="$1" - - local arns - arns=$(aws resourcegroupstaggingapi get-resources \ - --region "$REGION" \ - --resource-type-filters "elasticloadbalancing:loadbalancer" \ - --tag-filters \ - "Key=nullplatform:managed-by,Values=autocreate" \ - "Key=nullplatform:visibility,Values=$visibility" \ - --query 'ResourceTagMappingList[].ResourceARN' \ - --output text 2>/dev/null) || return 1 - - if [ -z "$arns" ] || [ "$arns" = "None" ]; then - return 0 - fi - - # ALB ARN format: arn:aws:elasticloadbalancing:::loadbalancer/app// - # Skip non-application LBs (would have loadbalancer/net/... or loadbalancer/gwy/...). - echo "$arns" | tr '\t' '\n' | awk -F/ '$2 == "app" { print $3 }' -} - # ============================================================================= # Main logic # ============================================================================= @@ -200,26 +195,10 @@ if [[ "$DNS_TYPE" == "route53" ]]; then ) fi - # Discovered ALBs: autocreated by previous scope creations and tagged so we - # can include them in the pool without provider config changes. - AUTOCREATED_BALANCERS=$(get_autocreated_albs "$INGRESS_VISIBILITY" 2>/dev/null) || AUTOCREATED_BALANCERS="" - - HAS_ADDITIONAL=false if [ -n "$ADDITIONAL_BALANCERS" ] && [ "$ADDITIONAL_BALANCERS" != "null" ] && [ "$ADDITIONAL_BALANCERS" != "[]" ]; then - HAS_ADDITIONAL=true - fi + log debug "🔍 Additional balancers configured, resolving least-loaded ALB..." - if [ "$HAS_ADDITIONAL" = "true" ] || [ -n "$AUTOCREATED_BALANCERS" ]; then - log debug "🔍 Resolving least-loaded ALB across declared and autocreated candidates..." - - if [ "$HAS_ADDITIONAL" = "true" ]; then - DECLARED_LIST=$(echo "$ADDITIONAL_BALANCERS" | jq -r --arg base "$ALB_NAME" '[$base] + . | .[]') - else - DECLARED_LIST="$ALB_NAME" - fi - - # Merge declared + autocreated, deduplicate, drop blanks. 
- CANDIDATES=$(printf '%s\n%s\n' "$DECLARED_LIST" "$AUTOCREATED_BALANCERS" | tr ' ' '\n' | awk 'NF && !seen[$0]++') + CANDIDATES=$(echo "$ADDITIONAL_BALANCERS" | jq -r --arg base "$ALB_NAME" '[$base] + . | .[]') log debug "📋 Candidate balancers: $(echo "$CANDIDATES" | paste -sd ',' - | sed 's/,/, /g')" @@ -271,14 +250,8 @@ if [[ "$DNS_TYPE" == "route53" ]]; then if [ "$AUTOCREATE_ENABLED" = "true" ] && [ -n "$MAX_CAPACITY" ] && [ "$MIN_RULES" -ge 0 ] && [ "$MIN_RULES" -ge "$MAX_CAPACITY" ]; then log info "🔧 All candidate ALBs are at or above capacity ($MIN_RULES/$MAX_CAPACITY); triggering autocreate" - source "$_RESOLVE_BALANCER_DIR/autocreate_alb" || { - log error "❌ Autocreate failed; ALB '$ALB_NAME' will be used and validate_alb_capacity will reject the deployment" - return 1 - } - if [ -n "${AUTOCREATED_ALB_NAME:-}" ]; then - ALB_NAME="$AUTOCREATED_ALB_NAME" - log info "📝 Using autocreated ALB '$ALB_NAME'" - fi + # autocreate_alb exports ALB_NAME with the new name (or exits on failure) + source "$_RESOLVE_BALANCER_DIR/autocreate_alb" fi fi fi diff --git a/k8s/scope/networking/wait_for_alb b/k8s/scope/networking/wait_for_alb new file mode 100644 index 00000000..5af5c561 --- /dev/null +++ b/k8s/scope/networking/wait_for_alb @@ -0,0 +1,86 @@ +#!/bin/bash + +# Waits for the resolved ALB to reach `active` state in AWS. Runs as the post +# step of the apply_templates that lands the dummy Ingress (the trigger for +# the AWS Load Balancer Controller to provision the ALB). +# +# When ALB_AUTOCREATED is set, also tags the ALB so the cloud has a record of +# what is managed by the platform. The tag is documentation only — the +# authoritative registration is in the container-orchestration provider, done +# by autocreate_alb. 
+# +# Inputs (env vars): +# ALB_NAME - Resolved ALB name +# REGION - AWS region +# INGRESS_VISIBILITY - For the visibility tag value +# ALB_AUTOCREATED - "true" when this scope triggered autocreate +# ALB_AUTOCREATE_TIMEOUT_SECONDS - Max seconds to wait (default 300) +# CONTEXT - For scope-id in the tag value + +TIMEOUT_SECONDS=$(get_config_value \ + --env ALB_AUTOCREATE_TIMEOUT_SECONDS \ + --provider '.providers["container-orchestration"].balancer.autocreate_timeout_seconds' \ + --default "300" +) + +deadline=$(($(date +%s) + TIMEOUT_SECONDS)) +poll_interval=10 + +log info "⏳ Waiting up to ${TIMEOUT_SECONDS}s for ALB '$ALB_NAME' to become active..." + +state="" +alb_arn="" +while [ "$(date +%s)" -lt "$deadline" ]; do + lb_json=$(aws elbv2 describe-load-balancers \ + --names "$ALB_NAME" \ + --region "$REGION" \ + --output json 2>/dev/null) || lb_json="" + + alb_arn=$(echo "$lb_json" | jq -r '.LoadBalancers[0].LoadBalancerArn // empty' 2>/dev/null) || alb_arn="" + state=$(echo "$lb_json" | jq -r '.LoadBalancers[0].State.Code // empty' 2>/dev/null) || state="" + + if [ -n "$alb_arn" ]; then + log debug "📋 ALB '$ALB_NAME' state: ${state:-pending}" + + if [ "$state" = "active" ]; then + log info "✅ ALB '$ALB_NAME' is active" + break + fi + + if [ "$state" = "failed" ]; then + log error "❌ ALB '$ALB_NAME' reached state 'failed'" + exit 1 + fi + fi + + sleep "$poll_interval" +done + +if [ "$state" != "active" ]; then + log error "❌ Timed out after ${TIMEOUT_SECONDS}s waiting for ALB '$ALB_NAME' to become active" + log error "💡 Possible causes:" + log error " The AWS Load Balancer Controller may be slow, mis-configured, or the AWS account may be hitting an ALB quota" + log error "🔧 How to fix:" + log error " • Check controller logs: kubectl -n kube-system logs deploy/aws-load-balancer-controller" + log error " • Verify ALB quota: aws service-quotas get-service-quota --service-code elasticloadbalancing --quota-code L-53DA6B97" + exit 1 +fi + +# Audit tags — only on the 
scope that triggered the autocreate, so the cloud +# carries the lineage of which scope created which ALB. Discovery does not +# depend on these tags. +if [ "${ALB_AUTOCREATED:-false}" = "true" ]; then + scope_id=$(echo "$CONTEXT" | jq -r '.scope.id // "unknown"') + if ! aws elbv2 add-tags \ + --resource-arns "$alb_arn" \ + --region "$REGION" \ + --tags \ + "Key=nullplatform:managed-by,Value=autocreate" \ + "Key=nullplatform:visibility,Value=$INGRESS_VISIBILITY" \ + "Key=nullplatform:created-by-scope-id,Value=$scope_id" \ + >/dev/null 2>&1; then + log warn "⚠️ Could not tag ALB '$ALB_NAME' (audit only — provider registration already succeeded)" + else + log debug "📋 Tagged ALB '$ALB_NAME' with managed-by=autocreate" + fi +fi diff --git a/k8s/scope/templates/ingress-dummy.yaml.tpl b/k8s/scope/templates/ingress-dummy.yaml.tpl index 3d6bd3f8..2ded1e3c 100644 --- a/k8s/scope/templates/ingress-dummy.yaml.tpl +++ b/k8s/scope/templates/ingress-dummy.yaml.tpl @@ -8,21 +8,25 @@ metadata: nullplatform-autocreate: "true" alb_name: {{ .alb_name }} annotations: + alb.ingress.kubernetes.io/actions.response-404: >- + {"type":"fixed-response","fixedResponseConfig":{"contentType":"text/plain","statusCode":"404","messageBody":"404 + scope not found or has not been deployed yet"}} alb.ingress.kubernetes.io/group.name: {{ .alb_name }} alb.ingress.kubernetes.io/listen-ports: '[{"HTTP":80},{"HTTPS":443}]' alb.ingress.kubernetes.io/load-balancer-name: {{ .alb_name }} alb.ingress.kubernetes.io/scheme: {{ .ingress_visibility }} + alb.ingress.kubernetes.io/ssl-redirect: '443' alb.ingress.kubernetes.io/target-type: ip - alb.ingress.kubernetes.io/tags: nullplatform:managed-by=autocreate,nullplatform:visibility={{ .ingress_visibility }},nullplatform:created-by-scope-id={{ .scope.id }} spec: ingressClassName: alb rules: - - http: + - host: {{ .alb_name }}.{{ .base_domain }} + http: paths: - - path: /__nullplatform_autocreate_placeholder + - path: / pathType: Prefix backend: service: - name: 
nullplatform-autocreate-placeholder + name: response-404 port: - number: 80 + name: use-annotation diff --git a/k8s/scope/tests/networking/autocreate_alb.bats b/k8s/scope/tests/networking/autocreate_alb.bats index c5c47aa2..8ad6c257 100644 --- a/k8s/scope/tests/networking/autocreate_alb.bats +++ b/k8s/scope/tests/networking/autocreate_alb.bats @@ -17,10 +17,14 @@ setup() { export INGRESS_VISIBILITY="internet-facing" export K8S_NAMESPACE="test-ns" export SERVICE_PATH="$PROJECT_ROOT/k8s" - export ALB_AUTOCREATE_TIMEOUT_SECONDS="2" + export OUTPUT_DIR="$(mktemp -d)" export CONTEXT='{ - "scope": { "id": "scope-1", "slug": "scope-1" }, + "scope": { + "id": "scope-1", + "slug": "scope-1", + "nrn": "organization=1:account=2:namespace=3:application=4:scope=5" + }, "namespace": { "id": "ns-1", "slug": "ns-1" }, "application": { "id": "app-1", "slug": "app-1" }, "account": { "id": "acc-1", "slug": "acc-1" }, @@ -30,254 +34,222 @@ setup() { } }' - # Mocks: each test overrides as needed. - gomplate() { return 0; } - export -f gomplate - kubectl() { return 0; } - export -f kubectl - aws() { return 1; } - export -f aws - - # Tracks calls for assertions. export CALL_LOG_FILE="$(mktemp)" -} -teardown() { - unset -f log gomplate kubectl aws get_config_value - rm -f "$CALL_LOG_FILE" - unset AUTOCREATED_ALB_NAME + # Default mocks — each test overrides as needed. 
+ gomplate() { + local prev="" + for arg in "$@"; do + if [ "$prev" = "--out" ]; then echo "rendered" > "$arg"; fi + prev="$arg" + done + return 0 + } + export -f gomplate + np() { + echo "np $*" >> "$CALL_LOG_FILE" + if [ "$1" = "provider" ] && [ "$2" = "list" ]; then + echo '{"results":[{"id":"prov-1","attributes":{"balancer":{}}}]}' + return 0 + fi + if [ "$1" = "provider" ] && [ "$2" = "patch" ]; then + local prev="" + for arg in "$@"; do + if [ "$prev" = "--body" ]; then echo "$arg" > "$OUTPUT_DIR/_patch_body"; fi + prev="$arg" + done + return 0 + fi + return 1 + } + export -f np } -# Records each invocation of a mocked binary into CALL_LOG_FILE so tests can -# assert against the sequence of calls. -record_call() { - echo "$@" >> "$CALL_LOG_FILE" +teardown() { + unset -f log gomplate np get_config_value + rm -rf "$OUTPUT_DIR" "$CALL_LOG_FILE" + unset ALB_NAME ALB_AUTOCREATED } # ============================================================================= # Name generation # ============================================================================= -@test "autocreate_alb: generates ALB name with prefix and visibility short form" { - # Mock AWS so describe-load-balancers reports active immediately. 
- aws() { - case "$*" in - *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/nullplatform-auto-public-abc123/x","State":{"Code":"active"}}]}'; return 0 ;; - *add-tags*) return 0 ;; - *) return 1 ;; - esac - } - export -f aws - +@test "autocreate_alb: generates name with default prefix and public short form" { source "$SCRIPT" - # Length and prefix - [[ "$AUTOCREATED_ALB_NAME" =~ ^nullplatform-auto-public-[a-f0-9]{6}$ ]] + [[ "$ALB_NAME" =~ ^nullplatform-auto-public-[a-f0-9]{6}$ ]] } -@test "autocreate_alb: uses private short form for internal visibility" { +@test "autocreate_alb: generates name with private short form for internal visibility" { export INGRESS_VISIBILITY="internal" - aws() { - case "$*" in - *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; - *add-tags*) return 0 ;; - *) return 1 ;; - esac - } - export -f aws source "$SCRIPT" - [[ "$AUTOCREATED_ALB_NAME" =~ ^nullplatform-auto-private-[a-f0-9]{6}$ ]] + [[ "$ALB_NAME" =~ ^nullplatform-auto-private-[a-f0-9]{6}$ ]] } -@test "autocreate_alb: respects custom name prefix from env" { - export ALB_AUTOCREATE_NAME_PREFIX="custom-prefix-" - aws() { - case "$*" in - *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; - *add-tags*) return 0 ;; - *) return 1 ;; - esac - } - export -f aws +@test "autocreate_alb: respects custom name prefix" { + export ALB_AUTOCREATE_NAME_PREFIX="custom-" source "$SCRIPT" - [[ "$AUTOCREATED_ALB_NAME" =~ ^custom-prefix-public- ]] + [[ "$ALB_NAME" =~ ^custom-public-[a-f0-9]{6}$ ]] } -# ============================================================================= -# Ingress dummy application -# 
============================================================================= -@test "autocreate_alb: renders and applies the dummy ingress before polling" { - gomplate() { record_call "gomplate $*"; return 0; } - export -f gomplate - kubectl() { record_call "kubectl $*"; return 0; } - export -f kubectl - aws() { - case "$*" in - *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; - *add-tags*) return 0 ;; - *) return 1 ;; - esac - } - export -f aws - +@test "autocreate_alb: exports ALB_AUTOCREATED=true" { source "$SCRIPT" - # Both gomplate and kubectl must have been invoked. - grep -q "gomplate" "$CALL_LOG_FILE" - grep -q "kubectl apply" "$CALL_LOG_FILE" + [ "$ALB_AUTOCREATED" = "true" ] } -@test "autocreate_alb: fails if gomplate render fails" { - gomplate() { return 1; } - export -f gomplate +# ============================================================================= +# Provider patching +# ============================================================================= +@test "autocreate_alb: calls np provider list with the scope NRN" { + source "$SCRIPT" - run bash -c 'source "$SCRIPT"' + grep -q "provider list" "$CALL_LOG_FILE" + grep -q -- "--nrn organization=1:account=2:namespace=3:application=4:scope=5" "$CALL_LOG_FILE" +} + +@test "autocreate_alb: patches additional_public_names for internet-facing visibility" { + np() { + echo "np $*" >> "$CALL_LOG_FILE" + if [ "$1" = "provider" ] && [ "$2" = "list" ]; then + echo '{"results":[{"id":"prov-1","attributes":{"balancer":{"additional_public_names":["existing-1"]}}}]}' + return 0 + fi + if [ "$1" = "provider" ] && [ "$2" = "patch" ]; then + local prev="" + for arg in "$@"; do + if [ "$prev" = "--body" ]; then echo "$arg" > "$OUTPUT_DIR/_patch_body"; fi + prev="$arg" + done + return 0 + fi + return 1 + } + export -f np - [ "$status" -ne 0 ] - assert_contains "$output" "Failed to render 
ingress-dummy template" -} + source "$SCRIPT" -@test "autocreate_alb: fails if kubectl apply fails" { - kubectl() { return 1; } - export -f kubectl + local body + body=$(cat "$OUTPUT_DIR/_patch_body") + echo "$body" | jq -e '.attributes.balancer.additional_public_names | length == 2' + echo "$body" | jq -e '.attributes.balancer.additional_public_names[0] == "existing-1"' + echo "$body" | jq -e ".attributes.balancer.additional_public_names[1] == \"$ALB_NAME\"" +} - run bash -c 'source "$SCRIPT"' +@test "autocreate_alb: patches additional_private_names for internal visibility" { + export INGRESS_VISIBILITY="internal" - [ "$status" -ne 0 ] - assert_contains "$output" "Failed to apply ingress-dummy" -} + source "$SCRIPT" -# ============================================================================= -# Polling for active state -# ============================================================================= -@test "autocreate_alb: returns success when ALB becomes active within timeout" { - # describe-load-balancers returns active state immediately. 
- aws() { - case "$*" in - *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; - *add-tags*) return 0 ;; - *) return 1 ;; - esac + cat "$OUTPUT_DIR/_patch_body" | jq -e '.attributes.balancer.additional_private_names | length == 1' +} + +@test "autocreate_alb: deduplicates when name already in list (defense in depth)" { + np() { + echo "np $*" >> "$CALL_LOG_FILE" + if [ "$1" = "provider" ] && [ "$2" = "list" ]; then + # Inject a duplicate scenario: pretend existing list already contains the same name + # (impossible in practice given random suffix, but the jq pipeline must still be safe) + echo '{"results":[{"id":"prov-1","attributes":{"balancer":{"additional_public_names":["a","b"]}}}]}' + return 0 + fi + if [ "$1" = "provider" ] && [ "$2" = "patch" ]; then + local prev="" + for arg in "$@"; do + if [ "$prev" = "--body" ]; then echo "$arg" > "$OUTPUT_DIR/_patch_body"; fi + prev="$arg" + done + return 0 + fi + return 1 } - export -f aws + export -f np source "$SCRIPT" - [ -n "$AUTOCREATED_ALB_NAME" ] + cat "$OUTPUT_DIR/_patch_body" | jq -e '.attributes.balancer.additional_public_names | length == 3' } -@test "autocreate_alb: exits non-zero when ALB never reaches active state (timeout)" { - export ALB_AUTOCREATE_TIMEOUT_SECONDS="1" - # Always return provisioning state, never 'active'. 
- aws() { - case "$*" in - *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"provisioning"}}]}'; return 0 ;; - *) return 1 ;; - esac +@test "autocreate_alb: exits when provider list returns no results" { + np() { + if [ "$1" = "provider" ] && [ "$2" = "list" ]; then echo '{"results":[]}'; return 0; fi + return 1 } - export -f aws + export -f np run bash -c 'source "$SCRIPT"' [ "$status" -ne 0 ] - assert_contains "$output" "Timed out" + assert_contains "$output" "No container-orchestration provider found" } -@test "autocreate_alb: exits non-zero when ALB reaches 'failed' state" { - aws() { - case "$*" in - *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"failed"}}]}'; return 0 ;; - *) return 1 ;; - esac +@test "autocreate_alb: exits when np provider list fails" { + np() { + if [ "$1" = "provider" ] && [ "$2" = "list" ]; then return 2; fi + return 1 } - export -f aws + export -f np run bash -c 'source "$SCRIPT"' [ "$status" -ne 0 ] - assert_contains "$output" "reached state 'failed'" -} - -# ============================================================================= -# Tagging -# ============================================================================= -@test "autocreate_alb: tags the ALB with managed-by, visibility and scope-id" { - aws() { - case "$*" in - *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; - *add-tags*) - record_call "aws $*" - return 0 - ;; - *) return 1 ;; - esac - } - export -f aws - - source "$SCRIPT" - - grep -q "nullplatform:managed-by,Value=autocreate" "$CALL_LOG_FILE" - grep -q "nullplatform:visibility,Value=internet-facing" "$CALL_LOG_FILE" - grep -q "nullplatform:created-by-scope-id,Value=scope-1" 
"$CALL_LOG_FILE" + assert_contains "$output" "Failed to list container-orchestration provider" } -@test "autocreate_alb: tagging failure does not fail the script (warn only)" { - aws() { - case "$*" in - *describe-load-balancers*) echo '{"LoadBalancers":[{"LoadBalancerArn":"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/x/y","State":{"Code":"active"}}]}'; return 0 ;; - *add-tags*) return 1 ;; - *) return 1 ;; - esac +@test "autocreate_alb: exits when np provider patch fails" { + np() { + if [ "$1" = "provider" ] && [ "$2" = "list" ]; then + echo '{"results":[{"id":"prov-1","attributes":{"balancer":{}}}]}' + return 0 + fi + if [ "$1" = "provider" ] && [ "$2" = "patch" ]; then return 5; fi + return 1 } - export -f aws + export -f np - source "$SCRIPT" + run bash -c 'source "$SCRIPT"' - # Script still exports the new ALB name even though tagging warned. - [ -n "$AUTOCREATED_ALB_NAME" ] + [ "$status" -ne 0 ] + assert_contains "$output" "Failed to patch container-orchestration provider" } -# ============================================================================= -# Timeout validation -# ============================================================================= -@test "autocreate_alb: rejects non-numeric timeout" { - export ALB_AUTOCREATE_TIMEOUT_SECONDS="abc" +@test "autocreate_alb: exits when CONTEXT has no scope.nrn" { + export CONTEXT=$(echo "$CONTEXT" | jq 'del(.scope.nrn)') run bash -c 'source "$SCRIPT"' [ "$status" -ne 0 ] - assert_contains "$output" "must be a positive integer" + assert_contains "$output" "Could not read scope NRN" } # ============================================================================= -# Name prefix validation +# Dummy ingress rendering # ============================================================================= -@test "autocreate_alb: rejects prefix containing uppercase" { - export ALB_AUTOCREATE_NAME_PREFIX="Bad-Prefix-" - - run bash -c 'source "$SCRIPT"' +@test "autocreate_alb: renders the dummy ingress yaml 
to OUTPUT_DIR" { + source "$SCRIPT" - [ "$status" -ne 0 ] - assert_contains "$output" "must match" + [ -f "$OUTPUT_DIR/ingress-dummy-${ALB_NAME}.yaml" ] } -@test "autocreate_alb: rejects prefix containing colon (YAML injection vector)" { - export ALB_AUTOCREATE_NAME_PREFIX="bad:prefix" +@test "autocreate_alb: exits when gomplate fails" { + gomplate() { return 1; } + export -f gomplate run bash -c 'source "$SCRIPT"' [ "$status" -ne 0 ] - assert_contains "$output" "must match" + assert_contains "$output" "Failed to render ingress-dummy template" } -@test "autocreate_alb: rejects prefix longer than 18 chars" { - export ALB_AUTOCREATE_NAME_PREFIX="this-prefix-is-way-too-long-" +@test "autocreate_alb: exits when OUTPUT_DIR is not set" { + unset OUTPUT_DIR run bash -c 'source "$SCRIPT"' [ "$status" -ne 0 ] - assert_contains "$output" "18 chars" + assert_contains "$output" "OUTPUT_DIR is not set" } diff --git a/k8s/scope/tests/networking/resolve_balancer.bats b/k8s/scope/tests/networking/resolve_balancer.bats index c37888c1..19b29a2f 100644 --- a/k8s/scope/tests/networking/resolve_balancer.bats +++ b/k8s/scope/tests/networking/resolve_balancer.bats @@ -9,6 +9,7 @@ setup() { log() { if [ "$1" = "error" ]; then echo "$2" >&2; else echo "$2"; fi; } export -f log source "$PROJECT_ROOT/k8s/utils/get_config_value" + export -f get_config_value export SCRIPT="$PROJECT_ROOT/k8s/scope/networking/resolve_balancer" export REGION="us-east-1" @@ -372,7 +373,7 @@ mock_alb_rules() { run bash -c 'export LOG_LEVEL=debug; source "$SCRIPT"' - assert_contains "$output" "🔍 Resolving least-loaded ALB across declared and autocreated candidates..." + assert_contains "$output" "🔍 Additional balancers configured, resolving least-loaded ALB..." 
assert_contains "$output" "📋 Candidate balancers: co-balancer-public, alb-extra-1, alb-extra-2" } @@ -503,56 +504,38 @@ mock_alb_rules() { } # ============================================================================= -# Discovery of autocreated ALBs via tags +# LoadBalancerNotFound treated as 0 rules (concurrent autocreate race case) # ============================================================================= - -# Extends mock_alb_rules to also serve a resourcegroupstaggingapi response -# listing autocreated ALBs as candidates. Pass "alb_name count" pairs as -# positional args; set DISCOVERED_AUTOCREATED (space-separated names) before -# calling to control which ALBs the tag-discovery API returns. -mock_alb_rules_with_discovery() { - mock_alb_rules "$@" - - local discovered_arns="" - for name in $DISCOVERED_AUTOCREATED; do - discovered_arns="${discovered_arns}arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/${name}/abc"$'\t' - done - discovered_arns="${discovered_arns%$'\t'}" - - eval "aws() { - case \"\$*\" in - *resourcegroupstaggingapi*get-resources*) - echo '${discovered_arns}' - return 0 - ;; - *list-resource-record-sets*) echo 'None' ;; - *describe-load-balancers*--names* | *describe-listeners* | *describe-rules*) - _mock_aws_elbv2_rule_count \"\$@\" - ;; - *) return 1 ;; - esac - } - export -f aws" -} - -@test "resolve_balancer: discovers autocreated ALBs from tags and includes them as candidates" { +@test "resolve_balancer: treats LoadBalancerNotFound as 0 rules so in-flight ALB wins" { export INGRESS_VISIBILITY="internet-facing" - export DISCOVERED_AUTOCREATED="nullplatform-auto-public-aaaaaa" - mock_alb_rules_with_discovery "co-balancer-public 50" "nullplatform-auto-public-aaaaaa 5" - - source "$SCRIPT" - - assert_equal "$ALB_NAME" "nullplatform-auto-public-aaaaaa" -} + export CONTEXT=$(echo "$CONTEXT" | jq ' + .providers["scope-configurations"].networking.additional_public_balancers = ["in-flight-alb"] + ') -@test "resolve_balancer: 
skips non-application LBs in discovery output" { - export INGRESS_VISIBILITY="internet-facing" - # Inject a network LB ARN that should be filtered out by the awk in get_autocreated_albs aws() { case "$*" in *list-resource-record-sets*) echo "None"; return 0 ;; - *resourcegroupstaggingapi*get-resources*) - printf 'arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/net/some-nlb/abc\n' + *describe-load-balancers*--names*in-flight-alb*) + echo "An error occurred (LoadBalancerNotFound) when calling the DescribeLoadBalancers operation" >&2 + return 254 + ;; + *describe-load-balancers*--names*) + echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/co-balancer-public/abc" + return 0 + ;; + *describe-listeners*) + echo "arn:aws:elasticloadbalancing:us-east-1:123:listener/app/co-balancer-public/abc/def" + return 0 + ;; + *describe-rules*) + local rules='{"Rules": [{"IsDefault": true}' + local i=0 + while [ $i -lt 50 ]; do + rules="${rules}, {\"IsDefault\": false}" + i=$((i + 1)) + done + rules="${rules}]}" + echo "$rules" return 0 ;; *) return 1 ;; @@ -562,83 +545,38 @@ mock_alb_rules_with_discovery() { source "$SCRIPT" - # No additional candidates → falls back to default - assert_equal "$ALB_NAME" "co-balancer-public" + assert_equal "$ALB_NAME" "in-flight-alb" } # ============================================================================= -# Autocreate fallback when all candidates are over capacity +# Autocreate trigger # ============================================================================= -@test "resolve_balancer: triggers autocreate when all candidates over threshold and feature enabled" { +@test "resolve_balancer: sources autocreate_alb when all candidates over threshold and feature enabled" { export INGRESS_VISIBILITY="internet-facing" export ALB_AUTOCREATE_ENABLED="true" export ALB_MAX_CAPACITY="50" - export ALB_AUTOCREATE_TIMEOUT_SECONDS="2" - export ALB_AUTOCREATE_NAME_PREFIX="auto-" - export K8S_NAMESPACE="test-ns" - export 
SERVICE_PATH="$PROJECT_ROOT/k8s" export CONTEXT=$(echo "$CONTEXT" | jq ' - .providers["scope-configurations"].networking.additional_public_balancers = ["alb-extra-1"] | - .scope.slug = "s" | - .scope.id = "scope-1" | - .namespace.slug = "ns" | - .namespace.id = "ns-1" | - .application.slug = "app" | - .application.id = "app-1" | - .account.slug = "acc" | - .account.id = "acc-1" | - .deployment.id = "dep-1" + .providers["scope-configurations"].networking.additional_public_balancers = ["alb-extra-1"] ') + mock_alb_rules "co-balancer-public 60" "alb-extra-1 55" - # Both declared ALBs are above threshold; aws describe for autocreate - # returns "active" immediately so the new ALB name is exported. - aws() { - case "$*" in - *list-resource-record-sets*) echo "None"; return 0 ;; - *resourcegroupstaggingapi*get-resources*) echo ""; return 0 ;; - *describe-load-balancers*--names*) - local name='' - local prev='' - for arg in "$@"; do - if [ "$prev" = "--names" ]; then name="$arg"; fi - prev="$arg" - done - # Return JSON when called with --output json (autocreate polling), - # or a plain ARN when called with --output text (rule count lookup). 
- case "$*" in - *--output*json*) echo "{\"LoadBalancers\":[{\"LoadBalancerArn\":\"arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/${name}/abc\",\"State\":{\"Code\":\"active\"}}]}"; return 0 ;; - *) echo "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/${name}/abc"; return 0 ;; - esac - ;; - *describe-listeners*) - local lb_arn='' - local prev='' - for arg in "$@"; do - if [ "$prev" = "--load-balancer-arn" ]; then lb_arn="$arg"; fi - prev="$arg" - done - local alb_name=$(echo "$lb_arn" | sed 's|.*/app/||;s|/.*||') - echo "arn:aws:elasticloadbalancing:us-east-1:123:listener/app/${alb_name}/abc/def" - return 0 - ;; - *describe-rules*) - # Both candidates report 60 rules (above threshold of 50) - echo '{"Rules":[{"IsDefault":true},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false},{"IsDefault":false}]}' - return 0 - ;; - *add-tags*) return 0 ;; - *) return 
1 ;; - esac - } - export -f aws - gomplate() { return 0; } - export -f gomplate - kubectl() { return 0; } - export -f kubectl - - source "$SCRIPT" - - [[ "$ALB_NAME" =~ ^auto-public-[a-f0-9]{6}$ ]] + # Stub autocreate_alb: simulate it exporting the new ALB name. + cat > "$BATS_TEST_TMPDIR/autocreate_alb_stub" <<'STUB' +export ALB_NAME="auto-public-stubbed" +export ALB_AUTOCREATED="true" +STUB + export INGRESS_DUMMY_TEMPLATE="$BATS_TEST_TMPDIR/ignored.tpl" + # Replace the real autocreate_alb with the stub by intercepting the source path + AUTOCREATE_STUB="$BATS_TEST_TMPDIR/autocreate_alb_stub" + # The script does: source "$_RESOLVE_BALANCER_DIR/autocreate_alb" + # Override via shadow path: copy resolve_balancer to tmp with patched source line. + PATCHED_SCRIPT="$BATS_TEST_TMPDIR/resolve_balancer_patched" + sed "s|\\\$_RESOLVE_BALANCER_DIR/autocreate_alb|$AUTOCREATE_STUB|" "$SCRIPT" > "$PATCHED_SCRIPT" + + source "$PATCHED_SCRIPT" + + assert_equal "$ALB_NAME" "auto-public-stubbed" + [ "$ALB_AUTOCREATED" = "true" ] } @test "resolve_balancer: does not autocreate when feature disabled even if all candidates full" { diff --git a/k8s/scope/tests/networking/wait_for_alb.bats b/k8s/scope/tests/networking/wait_for_alb.bats new file mode 100644 index 00000000..9b6587dc --- /dev/null +++ b/k8s/scope/tests/networking/wait_for_alb.bats @@ -0,0 +1,140 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for networking/wait_for_alb +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + log() { if [ "$1" = "error" ]; then echo "$2" >&2; else echo "$2"; fi; } + export -f log + + source "$PROJECT_ROOT/k8s/utils/get_config_value" + export -f get_config_value + + export SCRIPT="$PROJECT_ROOT/k8s/scope/networking/wait_for_alb" + export REGION="us-east-1" + export ALB_NAME="test-alb" + export INGRESS_VISIBILITY="internet-facing" + export ALB_AUTOCREATE_TIMEOUT_SECONDS="2" + + export CONTEXT='{ + "scope": { "id": "scope-1" }, + "providers": { "container-orchestration": {} } + }' + + export CALL_LOG_FILE="$(mktemp)" + + aws() { return 1; } + export -f aws +} + +teardown() { + unset -f log aws get_config_value + rm -f "$CALL_LOG_FILE" + unset ALB_AUTOCREATED +} + +# Mocks describe-load-balancers + add-tags. The state arg controls what the +# describe response reports. +mock_aws_state() { + local state="$1" + local arn="arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/$ALB_NAME/abc" + eval "aws() { + echo \"aws \$*\" >> '$CALL_LOG_FILE' + case \"\$*\" in + *describe-load-balancers*) + echo '{\"LoadBalancers\":[{\"LoadBalancerArn\":\"${arn}\",\"State\":{\"Code\":\"${state}\"}}]}' + return 0 + ;; + *add-tags*) + return 0 + ;; + esac + return 1 + } + export -f aws" +} + +# ============================================================================= +# Active state +# ============================================================================= +@test "wait_for_alb: returns success when ALB is already active" { + mock_aws_state "active" + + run bash -c 'source "$SCRIPT"' + + [ "$status" -eq 0 ] + assert_contains "$output" "is active" +} + +# ============================================================================= +# Failed state +# ============================================================================= +@test "wait_for_alb: exits when ALB reaches state=failed" { + mock_aws_state "failed" + + run bash -c 'source "$SCRIPT"' + + [ "$status" -ne 0 ] + assert_contains "$output" 
"reached state 'failed'" +} + +# ============================================================================= +# Timeout +# ============================================================================= +@test "wait_for_alb: exits when ALB never reaches active within timeout" { + export ALB_AUTOCREATE_TIMEOUT_SECONDS="1" + mock_aws_state "provisioning" + + run bash -c 'source "$SCRIPT"' + + [ "$status" -ne 0 ] + assert_contains "$output" "Timed out" +} + +# ============================================================================= +# Tagging on autocreate +# ============================================================================= +@test "wait_for_alb: tags the ALB when ALB_AUTOCREATED=true" { + export ALB_AUTOCREATED="true" + mock_aws_state "active" + + source "$SCRIPT" + + grep -q "add-tags" "$CALL_LOG_FILE" + grep -q "nullplatform:managed-by,Value=autocreate" "$CALL_LOG_FILE" + grep -q "nullplatform:visibility,Value=internet-facing" "$CALL_LOG_FILE" + grep -q "nullplatform:created-by-scope-id,Value=scope-1" "$CALL_LOG_FILE" +} + +@test "wait_for_alb: does not tag when ALB_AUTOCREATED is unset" { + mock_aws_state "active" + + source "$SCRIPT" + + ! 
grep -q "add-tags" "$CALL_LOG_FILE" +} + +@test "wait_for_alb: tagging failure warns but does not fail the script" { + export ALB_AUTOCREATED="true" + local arn="arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/$ALB_NAME/abc" + eval "aws() { + echo \"aws \$*\" >> '$CALL_LOG_FILE' + case \"\$*\" in + *describe-load-balancers*) + echo '{\"LoadBalancers\":[{\"LoadBalancerArn\":\"${arn}\",\"State\":{\"Code\":\"active\"}}]}' + return 0 + ;; + *add-tags*) return 1 ;; + esac + return 1 + } + export -f aws" + + run bash -c 'source "$SCRIPT"' + + [ "$status" -eq 0 ] + assert_contains "$output" "audit only" +} diff --git a/k8s/scope/workflows/create.yaml b/k8s/scope/workflows/create.yaml index 9c0f3006..8ea24684 100644 --- a/k8s/scope/workflows/create.yaml +++ b/k8s/scope/workflows/create.yaml @@ -22,6 +22,20 @@ steps: type: environment - name: OUTPUT_DIR type: environment + - name: ALB_NAME + type: environment + - name: ALB_AUTOCREATED + type: environment + - name: apply autocreated ingress + type: script + file: "$SERVICE_PATH/apply_templates" + configuration: + ACTION: apply + DRY_RUN: false + post: + name: wait for alb + type: script + file: "$SERVICE_PATH/scope/networking/wait_for_alb" - name: validate alb capacity type: script file: "$SERVICE_PATH/scope/validate_alb_capacity" From b0b1cc1b660345de67624de10373a5f36ac49348 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Mon, 8 Jun 2026 15:03:11 -0300 Subject: [PATCH 04/14] docs: clarify get_alb_rule_count stdout contract in race-case branch --- k8s/scope/networking/resolve_balancer | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/k8s/scope/networking/resolve_balancer b/k8s/scope/networking/resolve_balancer index 86278922..95b0cae6 100755 --- a/k8s/scope/networking/resolve_balancer +++ b/k8s/scope/networking/resolve_balancer @@ -49,9 +49,10 @@ get_alb_rule_count() { case "$aws_stderr" in *LoadBalancerNotFound*) # Concurrent autocreate race: the ALB was just registered in the - # 
provider by another scope but is not yet visible to AWS APIs. Treat - # as 0 rules so the in-flight ALB wins selection and we don't trigger - # another autocreate. + # provider by another scope but is not yet visible to AWS APIs. Return + # 0 rules on stdout (same contract as the success path below) so the + # in-flight ALB wins least-loaded selection and we don't trigger a + # second autocreate. log debug "📋 ALB '$alb_name' not yet visible in AWS (likely being provisioned); treating as 0 rules" >&2 echo 0 return 0 From f5026d2d1549b3614fcd3c43781e0bfd57af40b6 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Tue, 9 Jun 2026 16:18:07 -0300 Subject: [PATCH 05/14] refactor: reuse $DOMAIN from build_context instead of recomputing --- k8s/scope/build_context | 4 +++- k8s/scope/networking/autocreate_alb | 12 +++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/k8s/scope/build_context b/k8s/scope/build_context index 548ca319..0650f897 100755 --- a/k8s/scope/build_context +++ b/k8s/scope/build_context @@ -231,10 +231,12 @@ CONTEXT=$(echo "$CONTEXT" | jq \ --arg gateway_name "$GATEWAY_NAME" \ --arg alb_name "$ALB_NAME" \ --arg component "$COMPONENT" \ + --arg base_domain "$DOMAIN" \ --argjson modifiers "$K8S_MODIFIERS" \ - '. + {ingress_visibility: $ingress_visibility, k8s_namespace: $k8s_namespace, gateway_name: $gateway_name, region: $region, k8s_modifiers: $modifiers, alb_name: $alb_name, component: $component}') + '. 
+ {ingress_visibility: $ingress_visibility, k8s_namespace: $k8s_namespace, gateway_name: $gateway_name, region: $region, k8s_modifiers: $modifiers, alb_name: $alb_name, component: $component, base_domain: $base_domain}') export CONTEXT export REGION +export DOMAIN log info "✅ Scope context built successfully" diff --git a/k8s/scope/networking/autocreate_alb b/k8s/scope/networking/autocreate_alb index aea35f6d..0a24e88d 100644 --- a/k8s/scope/networking/autocreate_alb +++ b/k8s/scope/networking/autocreate_alb @@ -117,19 +117,13 @@ render_dummy_ingress() { tmp_context=$(mktemp) trap 'rm -f "$tmp_context"' RETURN - local base_domain - base_domain=$(get_config_value \ - --env DOMAIN \ - --provider '.providers["scope-configurations"].networking.base_domain' \ - --provider '.providers["container-orchestration"].networking.base_domain' \ - --default "nullapps.io" - ) - + # build_context already exports $DOMAIN (and adds it to CONTEXT as + # `base_domain` after we return). Inject the call-specific fields here. echo "$CONTEXT" | jq \ --arg alb_name "$alb_name" \ --arg ingress_visibility "$visibility" \ --arg k8s_namespace "$namespace" \ - --arg base_domain "$base_domain" \ + --arg base_domain "$DOMAIN" \ '. 
+ {alb_name: $alb_name, ingress_visibility: $ingress_visibility, k8s_namespace: $k8s_namespace, base_domain: $base_domain}' \ > "$tmp_context" From 940276f3e0c0dfd2bb3de87057ab68ff3d7ca556 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 10 Jun 2026 10:13:33 -0300 Subject: [PATCH 06/14] test: assert full log messages with emojis for autocreate networking tests --- k8s/scope/networking/resolve_balancer | 5 +- .../tests/networking/autocreate_alb.bats | 140 +++++++++--------- .../tests/networking/resolve_balancer.bats | 61 +++++--- k8s/scope/tests/networking/wait_for_alb.bats | 84 +++++++---- 4 files changed, 166 insertions(+), 124 deletions(-) diff --git a/k8s/scope/networking/resolve_balancer b/k8s/scope/networking/resolve_balancer index 95b0cae6..d5214ecb 100755 --- a/k8s/scope/networking/resolve_balancer +++ b/k8s/scope/networking/resolve_balancer @@ -207,7 +207,10 @@ if [[ "$DNS_TYPE" == "route53" ]]; then BEST_ALB="$ALB_NAME" for CANDIDATE in $CANDIDATES; do - RULE_COUNT=$(get_alb_rule_count "$CANDIDATE" 2>/dev/null) || { + # No outer stderr redirect: get_alb_rule_count locally suppresses raw + # aws CLI stderr and emits only intentional `log` messages there, which + # we want visible in the operator output. 
+ RULE_COUNT=$(get_alb_rule_count "$CANDIDATE") || { log warn "⚠️ Could not query rules for ALB '$CANDIDATE', skipping" continue } diff --git a/k8s/scope/tests/networking/autocreate_alb.bats b/k8s/scope/tests/networking/autocreate_alb.bats index 8ad6c257..f76bdd16 100644 --- a/k8s/scope/tests/networking/autocreate_alb.bats +++ b/k8s/scope/tests/networking/autocreate_alb.bats @@ -11,13 +11,16 @@ setup() { export -f log source "$PROJECT_ROOT/k8s/utils/get_config_value" + export -f get_config_value export SCRIPT="$PROJECT_ROOT/k8s/scope/networking/autocreate_alb" export REGION="us-east-1" export INGRESS_VISIBILITY="internet-facing" export K8S_NAMESPACE="test-ns" export SERVICE_PATH="$PROJECT_ROOT/k8s" + export DOMAIN="nullapps.io" export OUTPUT_DIR="$(mktemp -d)" + export PATCH_BODY_FILE="$OUTPUT_DIR/_patch_body" export CONTEXT='{ "scope": { @@ -34,8 +37,6 @@ setup() { } }' - export CALL_LOG_FILE="$(mktemp)" - # Default mocks — each test overrides as needed. gomplate() { local prev="" @@ -46,8 +47,8 @@ setup() { return 0 } export -f gomplate + np() { - echo "np $*" >> "$CALL_LOG_FILE" if [ "$1" = "provider" ] && [ "$2" = "list" ]; then echo '{"results":[{"id":"prov-1","attributes":{"balancer":{}}}]}' return 0 @@ -55,7 +56,7 @@ setup() { if [ "$1" = "provider" ] && [ "$2" = "patch" ]; then local prev="" for arg in "$@"; do - if [ "$prev" = "--body" ]; then echo "$arg" > "$OUTPUT_DIR/_patch_body"; fi + if [ "$prev" = "--body" ]; then echo "$arg" > "$PATCH_BODY_FILE"; fi prev="$arg" done return 0 @@ -63,58 +64,57 @@ setup() { return 1 } export -f np + export -f gomplate } teardown() { unset -f log gomplate np get_config_value - rm -rf "$OUTPUT_DIR" "$CALL_LOG_FILE" + rm -rf "$OUTPUT_DIR" unset ALB_NAME ALB_AUTOCREATED } # ============================================================================= -# Name generation +# Happy path — full log sequence (info logs only; debug needs LOG_LEVEL=debug) # 
============================================================================= -@test "autocreate_alb: generates name with default prefix and public short form" { - source "$SCRIPT" - - [[ "$ALB_NAME" =~ ^nullplatform-auto-public-[a-f0-9]{6}$ ]] +@test "autocreate_alb: full happy-path log sequence (default prefix, public visibility)" { + run bash -c 'export LOG_LEVEL=debug; source "$SCRIPT"; echo "ALB_NAME=$ALB_NAME ALB_AUTOCREATED=$ALB_AUTOCREATED"' + + assert_equal "$status" "0" + # First log: name generated + visibility echoed + [[ "$output" =~ "🔧 Autocreating ALB 'nullplatform-auto-public-"[a-f0-9]{6}"' (visibility=internet-facing)" ]] + # Provider patch log (field name appears explicitly) + [[ "$output" =~ "📝 Registering ALB 'nullplatform-auto-public-"[a-f0-9]{6}"' in container-orchestration provider (additional_public_names)" ]] + # Render confirmation (debug) + assert_contains "$output" "📝 Rendered dummy ingress to $OUTPUT_DIR/ingress-dummy-" + # Exports + [[ "$output" =~ "ALB_NAME=nullplatform-auto-public-"[a-f0-9]{6}" ALB_AUTOCREATED=true" ]] } -@test "autocreate_alb: generates name with private short form for internal visibility" { +@test "autocreate_alb: internal visibility selects additional_private_names field in registration log" { export INGRESS_VISIBILITY="internal" - source "$SCRIPT" + run bash -c 'source "$SCRIPT"; echo "ALB_NAME=$ALB_NAME"' - [[ "$ALB_NAME" =~ ^nullplatform-auto-private-[a-f0-9]{6}$ ]] + assert_equal "$status" "0" + [[ "$output" =~ "🔧 Autocreating ALB 'nullplatform-auto-private-"[a-f0-9]{6}"' (visibility=internal)" ]] + [[ "$output" =~ "📝 Registering ALB 'nullplatform-auto-private-"[a-f0-9]{6}"' in container-orchestration provider (additional_private_names)" ]] } -@test "autocreate_alb: respects custom name prefix" { +@test "autocreate_alb: custom prefix flows into both autocreate and registration logs" { export ALB_AUTOCREATE_NAME_PREFIX="custom-" - source "$SCRIPT" - - [[ "$ALB_NAME" =~ ^custom-public-[a-f0-9]{6}$ ]] -} - -@test 
"autocreate_alb: exports ALB_AUTOCREATED=true" { - source "$SCRIPT" + run bash -c 'source "$SCRIPT"; echo "ALB_NAME=$ALB_NAME"' - [ "$ALB_AUTOCREATED" = "true" ] + assert_equal "$status" "0" + [[ "$output" =~ "🔧 Autocreating ALB 'custom-public-"[a-f0-9]{6}"' (visibility=internet-facing)" ]] + [[ "$output" =~ "ALB_NAME=custom-public-"[a-f0-9]{6} ]] } # ============================================================================= -# Provider patching +# Provider patch shape # ============================================================================= -@test "autocreate_alb: calls np provider list with the scope NRN" { - source "$SCRIPT" - - grep -q "provider list" "$CALL_LOG_FILE" - grep -q -- "--nrn organization=1:account=2:namespace=3:application=4:scope=5" "$CALL_LOG_FILE" -} - -@test "autocreate_alb: patches additional_public_names for internet-facing visibility" { +@test "autocreate_alb: patches additional_public_names preserving existing entries" { np() { - echo "np $*" >> "$CALL_LOG_FILE" if [ "$1" = "provider" ] && [ "$2" = "list" ]; then echo '{"results":[{"id":"prov-1","attributes":{"balancer":{"additional_public_names":["existing-1"]}}}]}' return 0 @@ -122,7 +122,7 @@ teardown() { if [ "$1" = "provider" ] && [ "$2" = "patch" ]; then local prev="" for arg in "$@"; do - if [ "$prev" = "--body" ]; then echo "$arg" > "$OUTPUT_DIR/_patch_body"; fi + if [ "$prev" = "--body" ]; then echo "$arg" > "$PATCH_BODY_FILE"; fi prev="$arg" done return 0 @@ -134,7 +134,7 @@ teardown() { source "$SCRIPT" local body - body=$(cat "$OUTPUT_DIR/_patch_body") + body=$(cat "$PATCH_BODY_FILE") echo "$body" | jq -e '.attributes.balancer.additional_public_names | length == 2' echo "$body" | jq -e '.attributes.balancer.additional_public_names[0] == "existing-1"' echo "$body" | jq -e ".attributes.balancer.additional_public_names[1] == \"$ALB_NAME\"" @@ -145,22 +145,19 @@ teardown() { source "$SCRIPT" - cat "$OUTPUT_DIR/_patch_body" | jq -e 
'.attributes.balancer.additional_private_names | length == 1' + cat "$PATCH_BODY_FILE" | jq -e '.attributes.balancer.additional_private_names | length == 1' } -@test "autocreate_alb: deduplicates when name already in list (defense in depth)" { +@test "autocreate_alb: deduplicates name in the patched list" { np() { - echo "np $*" >> "$CALL_LOG_FILE" if [ "$1" = "provider" ] && [ "$2" = "list" ]; then - # Inject a duplicate scenario: pretend existing list already contains the same name - # (impossible in practice given random suffix, but the jq pipeline must still be safe) echo '{"results":[{"id":"prov-1","attributes":{"balancer":{"additional_public_names":["a","b"]}}}]}' return 0 fi if [ "$1" = "provider" ] && [ "$2" = "patch" ]; then local prev="" for arg in "$@"; do - if [ "$prev" = "--body" ]; then echo "$arg" > "$OUTPUT_DIR/_patch_body"; fi + if [ "$prev" = "--body" ]; then echo "$arg" > "$PATCH_BODY_FILE"; fi prev="$arg" done return 0 @@ -171,10 +168,13 @@ teardown() { source "$SCRIPT" - cat "$OUTPUT_DIR/_patch_body" | jq -e '.attributes.balancer.additional_public_names | length == 3' + cat "$PATCH_BODY_FILE" | jq -e '.attributes.balancer.additional_public_names | length == 3' } -@test "autocreate_alb: exits when provider list returns no results" { +# ============================================================================= +# Error paths — full failure log +# ============================================================================= +@test "autocreate_alb: exits with full log when provider list returns no results" { np() { if [ "$1" = "provider" ] && [ "$2" = "list" ]; then echo '{"results":[]}'; return 0; fi return 1 @@ -183,11 +183,12 @@ teardown() { run bash -c 'source "$SCRIPT"' - [ "$status" -ne 0 ] - assert_contains "$output" "No container-orchestration provider found" + assert_equal "$status" "1" + [[ "$output" =~ "🔧 Autocreating ALB 'nullplatform-auto-public-"[a-f0-9]{6}"' (visibility=internet-facing)" ]] + assert_contains "$output" "❌ No 
container-orchestration provider found for NRN 'organization=1:account=2:namespace=3:application=4:scope=5'" } -@test "autocreate_alb: exits when np provider list fails" { +@test "autocreate_alb: exits with full log when np provider list fails" { np() { if [ "$1" = "provider" ] && [ "$2" = "list" ]; then return 2; fi return 1 @@ -196,11 +197,11 @@ teardown() { run bash -c 'source "$SCRIPT"' - [ "$status" -ne 0 ] - assert_contains "$output" "Failed to list container-orchestration provider" + assert_equal "$status" "1" + assert_contains "$output" "❌ Failed to list container-orchestration provider for NRN 'organization=1:account=2:namespace=3:application=4:scope=5'" } -@test "autocreate_alb: exits when np provider patch fails" { +@test "autocreate_alb: exits with full log when np provider patch fails" { np() { if [ "$1" = "provider" ] && [ "$2" = "list" ]; then echo '{"results":[{"id":"prov-1","attributes":{"balancer":{}}}]}' @@ -213,43 +214,46 @@ teardown() { run bash -c 'source "$SCRIPT"' - [ "$status" -ne 0 ] - assert_contains "$output" "Failed to patch container-orchestration provider" + assert_equal "$status" "1" + [[ "$output" =~ "📝 Registering ALB 'nullplatform-auto-public-"[a-f0-9]{6}"' in container-orchestration provider (additional_public_names)" ]] + assert_contains "$output" "❌ Failed to patch container-orchestration provider with new ALB" + assert_contains "$output" "💡 Possible causes: agent lacks write permission on the provider, or NP_TOKEN/NULLPLATFORM_API_KEY is missing" } -@test "autocreate_alb: exits when CONTEXT has no scope.nrn" { +@test "autocreate_alb: exits with full log when CONTEXT has no scope.nrn" { export CONTEXT=$(echo "$CONTEXT" | jq 'del(.scope.nrn)') run bash -c 'source "$SCRIPT"' - [ "$status" -ne 0 ] - assert_contains "$output" "Could not read scope NRN" -} - -# ============================================================================= -# Dummy ingress rendering -# 
============================================================================= -@test "autocreate_alb: renders the dummy ingress yaml to OUTPUT_DIR" { - source "$SCRIPT" - - [ -f "$OUTPUT_DIR/ingress-dummy-${ALB_NAME}.yaml" ] + assert_equal "$status" "1" + assert_contains "$output" "❌ Could not read scope NRN from CONTEXT — cannot patch provider" } -@test "autocreate_alb: exits when gomplate fails" { +@test "autocreate_alb: exits with full log when gomplate fails to render" { gomplate() { return 1; } export -f gomplate run bash -c 'source "$SCRIPT"' - [ "$status" -ne 0 ] - assert_contains "$output" "Failed to render ingress-dummy template" + assert_equal "$status" "1" + assert_contains "$output" "❌ Failed to render ingress-dummy template" + assert_contains "$output" "📋 Template: $SERVICE_PATH/scope/templates/ingress-dummy.yaml.tpl" } -@test "autocreate_alb: exits when OUTPUT_DIR is not set" { +@test "autocreate_alb: exits with full log when OUTPUT_DIR is not set" { unset OUTPUT_DIR run bash -c 'source "$SCRIPT"' - [ "$status" -ne 0 ] - assert_contains "$output" "OUTPUT_DIR is not set" + assert_equal "$status" "1" + assert_contains "$output" "❌ OUTPUT_DIR is not set — autocreate_alb must run after OUTPUT_DIR is exported" +} + +# ============================================================================= +# Side effects — rendered YAML file +# ============================================================================= +@test "autocreate_alb: renders the dummy ingress yaml file inside OUTPUT_DIR" { + source "$SCRIPT" + + [ -f "$OUTPUT_DIR/ingress-dummy-${ALB_NAME}.yaml" ] } diff --git a/k8s/scope/tests/networking/resolve_balancer.bats b/k8s/scope/tests/networking/resolve_balancer.bats index 19b29a2f..9e42c15b 100644 --- a/k8s/scope/tests/networking/resolve_balancer.bats +++ b/k8s/scope/tests/networking/resolve_balancer.bats @@ -506,7 +506,7 @@ mock_alb_rules() { # ============================================================================= # LoadBalancerNotFound 
treated as 0 rules (concurrent autocreate race case) # ============================================================================= -@test "resolve_balancer: treats LoadBalancerNotFound as 0 rules so in-flight ALB wins" { +@test "resolve_balancer: LoadBalancerNotFound is treated as 0 rules and the in-flight ALB wins selection" { export INGRESS_VISIBILITY="internet-facing" export CONTEXT=$(echo "$CONTEXT" | jq ' .providers["scope-configurations"].networking.additional_public_balancers = ["in-flight-alb"] @@ -543,15 +543,22 @@ mock_alb_rules() { } export -f aws - source "$SCRIPT" + run bash -c 'export LOG_LEVEL=debug; source "$SCRIPT"; echo "ALB_NAME=$ALB_NAME"' - assert_equal "$ALB_NAME" "in-flight-alb" + assert_equal "$status" "0" + assert_contains "$output" "🔍 Additional balancers configured, resolving least-loaded ALB..." + assert_contains "$output" "📋 Candidate balancers: co-balancer-public, in-flight-alb" + assert_contains "$output" "📋 ALB 'co-balancer-public': 50 rules" + assert_contains "$output" "📋 ALB 'in-flight-alb' not yet visible in AWS (likely being provisioned); treating as 0 rules" + assert_contains "$output" "📋 ALB 'in-flight-alb': 0 rules" + assert_contains "$output" "📝 Selected ALB 'in-flight-alb' (0 rules) over default 'co-balancer-public'" + assert_contains "$output" "ALB_NAME=in-flight-alb" } # ============================================================================= # Autocreate trigger # ============================================================================= -@test "resolve_balancer: sources autocreate_alb when all candidates over threshold and feature enabled" { +@test "resolve_balancer: sources autocreate_alb and logs full trigger sequence when all candidates over threshold" { export INGRESS_VISIBILITY="internet-facing" export ALB_AUTOCREATE_ENABLED="true" export ALB_MAX_CAPACITY="50" @@ -560,38 +567,41 @@ mock_alb_rules() { ') mock_alb_rules "co-balancer-public 60" "alb-extra-1 55" - # Stub autocreate_alb: simulate it exporting 
the new ALB name. + # Stub autocreate_alb so the trigger path runs end-to-end without requiring + # gomplate / np. The stub exports the new ALB name as the real script would. cat > "$BATS_TEST_TMPDIR/autocreate_alb_stub" <<'STUB' export ALB_NAME="auto-public-stubbed" export ALB_AUTOCREATED="true" STUB - export INGRESS_DUMMY_TEMPLATE="$BATS_TEST_TMPDIR/ignored.tpl" - # Replace the real autocreate_alb with the stub by intercepting the source path - AUTOCREATE_STUB="$BATS_TEST_TMPDIR/autocreate_alb_stub" - # The script does: source "$_RESOLVE_BALANCER_DIR/autocreate_alb" - # Override via shadow path: copy resolve_balancer to tmp with patched source line. PATCHED_SCRIPT="$BATS_TEST_TMPDIR/resolve_balancer_patched" + AUTOCREATE_STUB="$BATS_TEST_TMPDIR/autocreate_alb_stub" sed "s|\\\$_RESOLVE_BALANCER_DIR/autocreate_alb|$AUTOCREATE_STUB|" "$SCRIPT" > "$PATCHED_SCRIPT" - source "$PATCHED_SCRIPT" + run bash -c "export LOG_LEVEL=debug; source '$PATCHED_SCRIPT'; echo \"ALB_NAME=\$ALB_NAME ALB_AUTOCREATED=\$ALB_AUTOCREATED\"" - assert_equal "$ALB_NAME" "auto-public-stubbed" - [ "$ALB_AUTOCREATED" = "true" ] + assert_equal "$status" "0" + assert_contains "$output" "📋 ALB 'co-balancer-public': 60 rules" + assert_contains "$output" "📋 ALB 'alb-extra-1': 55 rules" + assert_contains "$output" "🔧 All candidate ALBs are at or above capacity (55/50); triggering autocreate" + assert_contains "$output" "ALB_NAME=auto-public-stubbed ALB_AUTOCREATED=true" } -@test "resolve_balancer: does not autocreate when feature disabled even if all candidates full" { +@test "resolve_balancer: does not autocreate (no trigger log) when feature disabled even if candidates full" { export INGRESS_VISIBILITY="internet-facing" export ALB_AUTOCREATE_ENABLED="false" export ALB_MAX_CAPACITY="50" export CONTEXT=$(echo "$CONTEXT" | jq ' .providers["scope-configurations"].networking.additional_public_balancers = ["alb-extra-1"] ') - # Both above threshold but autocreate disabled → keeps least-loaded 
mock_alb_rules "co-balancer-public 60" "alb-extra-1 55" - source "$SCRIPT" + run bash -c 'source "$SCRIPT"; echo "ALB_NAME=$ALB_NAME"' - assert_equal "$ALB_NAME" "alb-extra-1" + assert_equal "$status" "0" + assert_contains "$output" "📝 Selected ALB 'alb-extra-1' (55 rules) over default 'co-balancer-public'" + assert_contains "$output" "ALB_NAME=alb-extra-1" + # No autocreate trigger log + [[ "$output" != *"triggering autocreate"* ]] } @test "resolve_balancer: does not autocreate when at least one candidate below threshold" { @@ -603,12 +613,15 @@ STUB ') mock_alb_rules "co-balancer-public 60" "alb-extra-1 10" - source "$SCRIPT" + run bash -c 'source "$SCRIPT"; echo "ALB_NAME=$ALB_NAME"' - assert_equal "$ALB_NAME" "alb-extra-1" + assert_equal "$status" "0" + assert_contains "$output" "📝 Selected ALB 'alb-extra-1' (10 rules) over default 'co-balancer-public'" + assert_contains "$output" "ALB_NAME=alb-extra-1" + [[ "$output" != *"triggering autocreate"* ]] } -@test "resolve_balancer: warns and skips autocreate when ALB_MAX_CAPACITY is non-numeric" { +@test "resolve_balancer: emits full warn when ALB_MAX_CAPACITY is non-numeric and skips autocreate" { export INGRESS_VISIBILITY="internet-facing" export ALB_AUTOCREATE_ENABLED="true" export ALB_MAX_CAPACITY="not-a-number" @@ -617,8 +630,10 @@ STUB ') mock_alb_rules "co-balancer-public 60" "alb-extra-1 55" - run bash -c 'source "$SCRIPT"' + run bash -c 'source "$SCRIPT"; echo "ALB_NAME=$ALB_NAME"' - [ "$status" -eq 0 ] - assert_contains "$output" "ALB_MAX_CAPACITY must be numeric" + assert_equal "$status" "0" + assert_contains "$output" "⚠️ ALB_MAX_CAPACITY must be numeric, got: 'not-a-number' — skipping autocreate evaluation" + assert_contains "$output" "ALB_NAME=alb-extra-1" + [[ "$output" != *"triggering autocreate"* ]] } diff --git a/k8s/scope/tests/networking/wait_for_alb.bats b/k8s/scope/tests/networking/wait_for_alb.bats index 9b6587dc..aa53a927 100644 --- a/k8s/scope/tests/networking/wait_for_alb.bats +++ 
b/k8s/scope/tests/networking/wait_for_alb.bats @@ -24,33 +24,27 @@ setup() { "providers": { "container-orchestration": {} } }' - export CALL_LOG_FILE="$(mktemp)" - aws() { return 1; } export -f aws } teardown() { unset -f log aws get_config_value - rm -f "$CALL_LOG_FILE" unset ALB_AUTOCREATED } -# Mocks describe-load-balancers + add-tags. The state arg controls what the -# describe response reports. +# Builds an aws() mock that returns the given state in a single +# describe-load-balancers --output json response. add-tags returns 0. mock_aws_state() { local state="$1" local arn="arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/$ALB_NAME/abc" eval "aws() { - echo \"aws \$*\" >> '$CALL_LOG_FILE' case \"\$*\" in *describe-load-balancers*) echo '{\"LoadBalancers\":[{\"LoadBalancerArn\":\"${arn}\",\"State\":{\"Code\":\"${state}\"}}]}' return 0 ;; - *add-tags*) - return 0 - ;; + *add-tags*) return 0 ;; esac return 1 } @@ -60,68 +54,93 @@ mock_aws_state() { # ============================================================================= # Active state # ============================================================================= -@test "wait_for_alb: returns success when ALB is already active" { +@test "wait_for_alb: success path logs full sequence when ALB is already active" { mock_aws_state "active" run bash -c 'source "$SCRIPT"' - [ "$status" -eq 0 ] - assert_contains "$output" "is active" + assert_equal "$status" "0" + assert_contains "$output" "⏳ Waiting up to 2s for ALB 'test-alb' to become active..." + assert_contains "$output" "📋 ALB 'test-alb' state: active" + assert_contains "$output" "✅ ALB 'test-alb' is active" +} + +@test "wait_for_alb: honors timeout value in the initial wait log" { + export ALB_AUTOCREATE_TIMEOUT_SECONDS="120" + mock_aws_state "active" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "0" + assert_contains "$output" "⏳ Waiting up to 120s for ALB 'test-alb' to become active..." 
} # ============================================================================= # Failed state # ============================================================================= -@test "wait_for_alb: exits when ALB reaches state=failed" { +@test "wait_for_alb: exits with full failure log when ALB reaches state=failed" { mock_aws_state "failed" run bash -c 'source "$SCRIPT"' - [ "$status" -ne 0 ] - assert_contains "$output" "reached state 'failed'" + assert_equal "$status" "1" + assert_contains "$output" "⏳ Waiting up to 2s for ALB 'test-alb' to become active..." + assert_contains "$output" "📋 ALB 'test-alb' state: failed" + assert_contains "$output" "❌ ALB 'test-alb' reached state 'failed'" } # ============================================================================= -# Timeout +# Timeout with full diagnostic log # ============================================================================= -@test "wait_for_alb: exits when ALB never reaches active within timeout" { +@test "wait_for_alb: timeout emits diagnostic causes and fix hints" { export ALB_AUTOCREATE_TIMEOUT_SECONDS="1" mock_aws_state "provisioning" run bash -c 'source "$SCRIPT"' - [ "$status" -ne 0 ] - assert_contains "$output" "Timed out" + assert_equal "$status" "1" + assert_contains "$output" "⏳ Waiting up to 1s for ALB 'test-alb' to become active..." 
+ assert_contains "$output" "📋 ALB 'test-alb' state: provisioning" + assert_contains "$output" "❌ Timed out after 1s waiting for ALB 'test-alb' to become active" + assert_contains "$output" "💡 Possible causes:" + assert_contains "$output" " The AWS Load Balancer Controller may be slow, mis-configured, or the AWS account may be hitting an ALB quota" + assert_contains "$output" "🔧 How to fix:" + assert_contains "$output" " • Check controller logs: kubectl -n kube-system logs deploy/aws-load-balancer-controller" + assert_contains "$output" " • Verify ALB quota: aws service-quotas get-service-quota --service-code elasticloadbalancing --quota-code L-53DA6B97" } # ============================================================================= # Tagging on autocreate # ============================================================================= -@test "wait_for_alb: tags the ALB when ALB_AUTOCREATED=true" { +@test "wait_for_alb: tags ALB and logs full tag-success message when ALB_AUTOCREATED=true" { export ALB_AUTOCREATED="true" mock_aws_state "active" - source "$SCRIPT" + run bash -c 'source "$SCRIPT"' - grep -q "add-tags" "$CALL_LOG_FILE" - grep -q "nullplatform:managed-by,Value=autocreate" "$CALL_LOG_FILE" - grep -q "nullplatform:visibility,Value=internet-facing" "$CALL_LOG_FILE" - grep -q "nullplatform:created-by-scope-id,Value=scope-1" "$CALL_LOG_FILE" + assert_equal "$status" "0" + assert_contains "$output" "⏳ Waiting up to 2s for ALB 'test-alb' to become active..." + assert_contains "$output" "📋 ALB 'test-alb' state: active" + assert_contains "$output" "✅ ALB 'test-alb' is active" + assert_contains "$output" "📋 Tagged ALB 'test-alb' with managed-by=autocreate" } -@test "wait_for_alb: does not tag when ALB_AUTOCREATED is unset" { +@test "wait_for_alb: does not tag (no tag log) when ALB_AUTOCREATED is unset" { mock_aws_state "active" - source "$SCRIPT" + run bash -c 'source "$SCRIPT"' - ! 
grep -q "add-tags" "$CALL_LOG_FILE" + assert_equal "$status" "0" + assert_contains "$output" "✅ ALB 'test-alb' is active" + # No tagging log should appear + [[ "$output" != *"Tagged ALB"* ]] + [[ "$output" != *"Could not tag ALB"* ]] } -@test "wait_for_alb: tagging failure warns but does not fail the script" { +@test "wait_for_alb: tag failure logs full warn message but exits 0" { export ALB_AUTOCREATED="true" local arn="arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/$ALB_NAME/abc" eval "aws() { - echo \"aws \$*\" >> '$CALL_LOG_FILE' case \"\$*\" in *describe-load-balancers*) echo '{\"LoadBalancers\":[{\"LoadBalancerArn\":\"${arn}\",\"State\":{\"Code\":\"active\"}}]}' @@ -135,6 +154,7 @@ mock_aws_state() { run bash -c 'source "$SCRIPT"' - [ "$status" -eq 0 ] - assert_contains "$output" "audit only" + assert_equal "$status" "0" + assert_contains "$output" "✅ ALB 'test-alb' is active" + assert_contains "$output" "⚠️ Could not tag ALB 'test-alb' (audit only — provider registration already succeeded)" } From 33da501e28a7df1bf5b3cbee12e04ff1a98ff9c4 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 10 Jun 2026 11:39:33 -0300 Subject: [PATCH 07/14] feat: suggest enabling ALB_AUTOCREATE_ENABLED in capacity error hints --- k8s/scope/tests/validate_alb_capacity.bats | 1 + k8s/scope/validate_alb_capacity | 1 + 2 files changed, 2 insertions(+) diff --git a/k8s/scope/tests/validate_alb_capacity.bats b/k8s/scope/tests/validate_alb_capacity.bats index af08defd..0878e4fe 100644 --- a/k8s/scope/tests/validate_alb_capacity.bats +++ b/k8s/scope/tests/validate_alb_capacity.bats @@ -127,6 +127,7 @@ teardown() { assert_contains "$output" "Increase ALB_MAX_CAPACITY in values.yaml or container-orchestration provider (AWS limit is 100 per listener)" assert_contains "$output" "Request an AWS service quota increase for rules per ALB listener" assert_contains "$output" "Consider using a separate ALB for additional scopes" + assert_contains "$output" "Enable ALB 
autocreation so the platform provisions a new ALB automatically when the pool is exhausted: set ALB_AUTOCREATE_ENABLED=true in values.yaml or in the container-orchestration provider" } @test "validate_alb_capacity: fails when over capacity" { diff --git a/k8s/scope/validate_alb_capacity b/k8s/scope/validate_alb_capacity index fc5eb50e..e3c81f0e 100755 --- a/k8s/scope/validate_alb_capacity +++ b/k8s/scope/validate_alb_capacity @@ -135,6 +135,7 @@ if [[ "$TOTAL_RULES" -ge "$ALB_MAX_CAPACITY" ]]; then log error " • Increase ALB_MAX_CAPACITY in values.yaml or container-orchestration provider (AWS limit is 100 per listener)" log error " • Request an AWS service quota increase for rules per ALB listener" log error " • Consider using a separate ALB for additional scopes" + log error " • Enable ALB autocreation so the platform provisions a new ALB automatically when the pool is exhausted: set ALB_AUTOCREATE_ENABLED=true in values.yaml or in the container-orchestration provider" log error "" exit 1 fi From 7f9435f09a4027707efaad7a782aa690723e00af Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 10 Jun 2026 11:53:04 -0300 Subject: [PATCH 08/14] fix(autocreate): trigger on single-ALB setups and emit step-by-step debug logs --- k8s/scope/networking/resolve_balancer | 204 ++++++++++-------- .../tests/networking/resolve_balancer.bats | 48 ++++- 2 files changed, 157 insertions(+), 95 deletions(-) diff --git a/k8s/scope/networking/resolve_balancer b/k8s/scope/networking/resolve_balancer index d5214ecb..e2ef333f 100755 --- a/k8s/scope/networking/resolve_balancer +++ b/k8s/scope/networking/resolve_balancer @@ -150,6 +150,8 @@ get_alb_from_route53() { # Main logic # ============================================================================= +log debug "🔍 Resolving ALB for visibility=$INGRESS_VISIBILITY (DNS_TYPE=$DNS_TYPE)" + # Resolve the base ALB name from configuration ALB_NAME="k8s-nullplatform-$INGRESS_VISIBILITY" @@ -167,100 +169,122 @@ else ) fi -if [[ "$DNS_TYPE" == 
"route53" ]]; then - # Priority 1: Check Route53 for an existing DNS record - SCOPE_DOMAIN_VAL=$(echo "$CONTEXT" | jq -r '.scope.domain // empty') - EXISTING_ALB="" +log debug "📋 Base ALB from configuration: '$ALB_NAME'" - if [ -n "$SCOPE_DOMAIN_VAL" ]; then - EXISTING_ALB=$(get_alb_from_route53 "$SCOPE_DOMAIN_VAL" "$REGION" 2>/dev/null) || true - fi +if [[ "$DNS_TYPE" != "route53" ]]; then + log debug "📋 DNS type is '$DNS_TYPE', skipping Route53 lookup and load balancing" + export ALB_NAME + return 0 2>/dev/null || true +fi + +# Priority 1: Check Route53 for an existing DNS record (DNS/ingress consistency) +SCOPE_DOMAIN_VAL=$(echo "$CONTEXT" | jq -r '.scope.domain // empty') +EXISTING_ALB="" + +if [ -n "$SCOPE_DOMAIN_VAL" ]; then + log debug "📋 Looking up Route53 alias for domain '$SCOPE_DOMAIN_VAL'..." + EXISTING_ALB=$(get_alb_from_route53 "$SCOPE_DOMAIN_VAL" "$REGION" 2>/dev/null) || true +fi + +if [ -n "$EXISTING_ALB" ]; then + log info "📝 Using ALB '$EXISTING_ALB' from Route53 record for $SCOPE_DOMAIN_VAL" + ALB_NAME="$EXISTING_ALB" + export ALB_NAME + return 0 2>/dev/null || true +fi + +log debug "📋 No Route53 record found; evaluating candidate pool" + +# Priority 2: build candidate pool (base + any additional balancers from provider) +ADDITIONAL_BALANCERS="" +if [ "$INGRESS_VISIBILITY" = "internet-facing" ]; then + ADDITIONAL_BALANCERS=$(get_config_value \ + --provider '.providers["scope-configurations"].networking.additional_public_balancers' \ + --provider '.providers["container-orchestration"].balancer.additional_public_names' \ + --default "" + ) +else + ADDITIONAL_BALANCERS=$(get_config_value \ + --provider '.providers["scope-configurations"].networking.additional_private_balancers' \ + --provider '.providers["container-orchestration"].balancer.additional_private_names' \ + --default "" + ) +fi - if [ -n "$EXISTING_ALB" ]; then - log info "📝 Using ALB '$EXISTING_ALB' from Route53 record for $SCOPE_DOMAIN_VAL" - ALB_NAME="$EXISTING_ALB" - else - # 
Priority 2: If additional balancers configured, pick the least-loaded one - ADDITIONAL_BALANCERS="" - if [ "$INGRESS_VISIBILITY" = "internet-facing" ]; then - ADDITIONAL_BALANCERS=$(get_config_value \ - --provider '.providers["scope-configurations"].networking.additional_public_balancers' \ - --provider '.providers["container-orchestration"].balancer.additional_public_names' \ - --default "" - ) - else - ADDITIONAL_BALANCERS=$(get_config_value \ - --provider '.providers["scope-configurations"].networking.additional_private_balancers' \ - --provider '.providers["container-orchestration"].balancer.additional_private_names' \ - --default "" - ) - fi - - if [ -n "$ADDITIONAL_BALANCERS" ] && [ "$ADDITIONAL_BALANCERS" != "null" ] && [ "$ADDITIONAL_BALANCERS" != "[]" ]; then - log debug "🔍 Additional balancers configured, resolving least-loaded ALB..." - - CANDIDATES=$(echo "$ADDITIONAL_BALANCERS" | jq -r --arg base "$ALB_NAME" '[$base] + . | .[]') - - log debug "📋 Candidate balancers: $(echo "$CANDIDATES" | paste -sd ',' - | sed 's/,/, /g')" - - MIN_RULES=-1 - BEST_ALB="$ALB_NAME" - - for CANDIDATE in $CANDIDATES; do - # No outer stderr redirect: get_alb_rule_count locally suppresses raw - # aws CLI stderr and emits only intentional `log` messages there, which - # we want visible in the operator output. - RULE_COUNT=$(get_alb_rule_count "$CANDIDATE") || { - log warn "⚠️ Could not query rules for ALB '$CANDIDATE', skipping" - continue - } - - log debug "📋 ALB '$CANDIDATE': $RULE_COUNT rules" - - if [ "$MIN_RULES" -eq -1 ] || [ "$RULE_COUNT" -lt "$MIN_RULES" ]; then - MIN_RULES=$RULE_COUNT - BEST_ALB="$CANDIDATE" - fi - done - - if [ "$BEST_ALB" != "$ALB_NAME" ]; then - log info "📝 Selected ALB '$BEST_ALB' ($MIN_RULES rules) over default '$ALB_NAME'" - fi - - ALB_NAME="$BEST_ALB" - - # Autocreate fallback: every candidate is at or above the threshold and - # autocreate is enabled. Delegate to autocreate_alb to provision a new - # ALB and replace ALB_NAME with it. 
- AUTOCREATE_ENABLED=$(get_config_value \ - --env ALB_AUTOCREATE_ENABLED \ - --provider '.providers["container-orchestration"].balancer.autocreate_enabled' \ - --default "false" - ) - - MAX_CAPACITY=$(get_config_value \ - --env ALB_MAX_CAPACITY \ - --provider '.providers["scope-configurations"].networking.alb_max_capacity' \ - --provider '.providers["container-orchestration"].balancer.alb_capacity_threshold' \ - --default "75" - ) - - # Without this guard a non-numeric MAX_CAPACITY would silently disable the - # autocreate trigger (the `-ge` comparison errors out and evaluates false). - if ! [[ "$MAX_CAPACITY" =~ ^[0-9]+$ ]]; then - log warn "⚠️ ALB_MAX_CAPACITY must be numeric, got: '$MAX_CAPACITY' — skipping autocreate evaluation" - MAX_CAPACITY="" - fi - - if [ "$AUTOCREATE_ENABLED" = "true" ] && [ -n "$MAX_CAPACITY" ] && [ "$MIN_RULES" -ge 0 ] && [ "$MIN_RULES" -ge "$MAX_CAPACITY" ]; then - log info "🔧 All candidate ALBs are at or above capacity ($MIN_RULES/$MAX_CAPACITY); triggering autocreate" - # autocreate_alb exports ALB_NAME with the new name (or exits on failure) - source "$_RESOLVE_BALANCER_DIR/autocreate_alb" - fi - fi +if [ -n "$ADDITIONAL_BALANCERS" ] && [ "$ADDITIONAL_BALANCERS" != "null" ] && [ "$ADDITIONAL_BALANCERS" != "[]" ]; then + CANDIDATES=$(echo "$ADDITIONAL_BALANCERS" | jq -r --arg base "$ALB_NAME" '[$base] + . | .[]') + log debug "📋 Candidate balancers (base + additional): $(echo "$CANDIDATES" | paste -sd ',' - | sed 's/,/, /g')" +else + CANDIDATES="$ALB_NAME" + log debug "📋 No additional balancers configured; candidate pool is just the base ALB '$ALB_NAME'" +fi + +# Pick least-loaded candidate. We evaluate every candidate (including the base +# alone) so capacity + autocreate decisions can fire on single-ALB setups too. 
+MIN_RULES=-1 +BEST_ALB="$ALB_NAME" + +for CANDIDATE in $CANDIDATES; do + # No outer stderr redirect: get_alb_rule_count locally suppresses raw aws + # CLI stderr and emits only intentional `log` messages there, which we want + # visible in the operator output. + RULE_COUNT=$(get_alb_rule_count "$CANDIDATE") || { + log warn "⚠️ Could not query rules for ALB '$CANDIDATE', skipping" + continue + } + + log debug "📋 ALB '$CANDIDATE': $RULE_COUNT rules" + + if [ "$MIN_RULES" -eq -1 ] || [ "$RULE_COUNT" -lt "$MIN_RULES" ]; then + MIN_RULES=$RULE_COUNT + BEST_ALB="$CANDIDATE" fi +done + +if [ "$BEST_ALB" != "$ALB_NAME" ]; then + log info "📝 Selected ALB '$BEST_ALB' ($MIN_RULES rules) over default '$ALB_NAME'" else - log debug "📋 DNS type is '$DNS_TYPE', skipping Route53 lookup and load balancing" + log debug "📋 Sticking with base ALB '$BEST_ALB' ($MIN_RULES rules)" +fi + +ALB_NAME="$BEST_ALB" + +# Autocreate evaluation: when the chosen ALB is at or above the capacity +# threshold and autocreate is enabled, source autocreate_alb to provision a +# new ALB and replace ALB_NAME with it. +AUTOCREATE_ENABLED=$(get_config_value \ + --env ALB_AUTOCREATE_ENABLED \ + --provider '.providers["container-orchestration"].balancer.autocreate_enabled' \ + --default "false" +) + +MAX_CAPACITY=$(get_config_value \ + --env ALB_MAX_CAPACITY \ + --provider '.providers["scope-configurations"].networking.alb_max_capacity' \ + --provider '.providers["container-orchestration"].balancer.alb_capacity_threshold' \ + --default "75" +) + +log debug "📋 Autocreate enabled: $AUTOCREATE_ENABLED | Capacity threshold: $MAX_CAPACITY" + +# Without this guard a non-numeric MAX_CAPACITY would silently disable the +# autocreate trigger (the `-ge` comparison errors out and evaluates false). +if ! 
[[ "$MAX_CAPACITY" =~ ^[0-9]+$ ]]; then + log warn "⚠️ ALB_MAX_CAPACITY must be numeric, got: '$MAX_CAPACITY' — skipping autocreate evaluation" + export ALB_NAME + return 0 2>/dev/null || true +fi + +if [ "$MIN_RULES" -lt 0 ]; then + log debug "📋 Could not determine rule counts for any candidate (AWS query failures); skipping autocreate evaluation" +elif [ "$MIN_RULES" -lt "$MAX_CAPACITY" ]; then + log debug "📋 Best candidate ALB '$BEST_ALB' is within capacity ($MIN_RULES/$MAX_CAPACITY); autocreate not needed" +elif [ "$AUTOCREATE_ENABLED" != "true" ]; then + log debug "📋 Best candidate ALB '$BEST_ALB' is over capacity ($MIN_RULES/$MAX_CAPACITY) but autocreate is disabled; validate_alb_capacity will reject the deployment" +else + log info "🔧 Best candidate ALB '$BEST_ALB' is at or above capacity ($MIN_RULES/$MAX_CAPACITY); triggering autocreate" + # autocreate_alb exports ALB_NAME with the new name (or exits on failure) + source "$_RESOLVE_BALANCER_DIR/autocreate_alb" fi export ALB_NAME diff --git a/k8s/scope/tests/networking/resolve_balancer.bats b/k8s/scope/tests/networking/resolve_balancer.bats index 9e42c15b..36243c4c 100644 --- a/k8s/scope/tests/networking/resolve_balancer.bats +++ b/k8s/scope/tests/networking/resolve_balancer.bats @@ -373,8 +373,8 @@ mock_alb_rules() { run bash -c 'export LOG_LEVEL=debug; source "$SCRIPT"' - assert_contains "$output" "🔍 Additional balancers configured, resolving least-loaded ALB..." 
- assert_contains "$output" "📋 Candidate balancers: co-balancer-public, alb-extra-1, alb-extra-2" + assert_contains "$output" "🔍 Resolving ALB for visibility=internet-facing (DNS_TYPE=route53)" + assert_contains "$output" "📋 Candidate balancers (base + additional): co-balancer-public, alb-extra-1, alb-extra-2" } # ============================================================================= @@ -546,8 +546,8 @@ mock_alb_rules() { run bash -c 'export LOG_LEVEL=debug; source "$SCRIPT"; echo "ALB_NAME=$ALB_NAME"' assert_equal "$status" "0" - assert_contains "$output" "🔍 Additional balancers configured, resolving least-loaded ALB..." - assert_contains "$output" "📋 Candidate balancers: co-balancer-public, in-flight-alb" + assert_contains "$output" "🔍 Resolving ALB for visibility=internet-facing (DNS_TYPE=route53)" + assert_contains "$output" "📋 Candidate balancers (base + additional): co-balancer-public, in-flight-alb" assert_contains "$output" "📋 ALB 'co-balancer-public': 50 rules" assert_contains "$output" "📋 ALB 'in-flight-alb' not yet visible in AWS (likely being provisioned); treating as 0 rules" assert_contains "$output" "📋 ALB 'in-flight-alb': 0 rules" @@ -582,7 +582,7 @@ STUB assert_equal "$status" "0" assert_contains "$output" "📋 ALB 'co-balancer-public': 60 rules" assert_contains "$output" "📋 ALB 'alb-extra-1': 55 rules" - assert_contains "$output" "🔧 All candidate ALBs are at or above capacity (55/50); triggering autocreate" + assert_contains "$output" "🔧 Best candidate ALB 'alb-extra-1' is at or above capacity (55/50); triggering autocreate" assert_contains "$output" "ALB_NAME=auto-public-stubbed ALB_AUTOCREATED=true" } @@ -621,6 +621,44 @@ STUB [[ "$output" != *"triggering autocreate"* ]] } +@test "resolve_balancer: triggers autocreate on single-ALB setup (no additional balancers) when over capacity" { + export INGRESS_VISIBILITY="internet-facing" + export ALB_AUTOCREATE_ENABLED="true" + export ALB_MAX_CAPACITY="10" + # No additional_public_balancers — pool 
is just the base ALB + mock_alb_rules "co-balancer-public 16" + + cat > "$BATS_TEST_TMPDIR/autocreate_alb_stub" <<'STUB' +export ALB_NAME="auto-public-stubbed" +export ALB_AUTOCREATED="true" +STUB + PATCHED_SCRIPT="$BATS_TEST_TMPDIR/resolve_balancer_patched" + AUTOCREATE_STUB="$BATS_TEST_TMPDIR/autocreate_alb_stub" + sed "s|\\\$_RESOLVE_BALANCER_DIR/autocreate_alb|$AUTOCREATE_STUB|" "$SCRIPT" > "$PATCHED_SCRIPT" + + run bash -c "export LOG_LEVEL=debug; source '$PATCHED_SCRIPT'; echo \"ALB_NAME=\$ALB_NAME ALB_AUTOCREATED=\$ALB_AUTOCREATED\"" + + assert_equal "$status" "0" + assert_contains "$output" "📋 No additional balancers configured; candidate pool is just the base ALB 'co-balancer-public'" + assert_contains "$output" "📋 ALB 'co-balancer-public': 16 rules" + assert_contains "$output" "🔧 Best candidate ALB 'co-balancer-public' is at or above capacity (16/10); triggering autocreate" + assert_contains "$output" "ALB_NAME=auto-public-stubbed ALB_AUTOCREATED=true" +} + +@test "resolve_balancer: single-ALB setup over capacity but autocreate disabled logs reason and lets validate reject" { + export INGRESS_VISIBILITY="internet-facing" + export ALB_AUTOCREATE_ENABLED="false" + export ALB_MAX_CAPACITY="10" + mock_alb_rules "co-balancer-public 16" + + run bash -c 'export LOG_LEVEL=debug; source "$SCRIPT"; echo "ALB_NAME=$ALB_NAME"' + + assert_equal "$status" "0" + assert_contains "$output" "📋 ALB 'co-balancer-public': 16 rules" + assert_contains "$output" "📋 Best candidate ALB 'co-balancer-public' is over capacity (16/10) but autocreate is disabled; validate_alb_capacity will reject the deployment" + assert_contains "$output" "ALB_NAME=co-balancer-public" +} + @test "resolve_balancer: emits full warn when ALB_MAX_CAPACITY is non-numeric and skips autocreate" { export INGRESS_VISIBILITY="internet-facing" export ALB_AUTOCREATE_ENABLED="true" From dff4e05c24f107c0b8e66f57354e7de724a0f38e Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 10 Jun 2026 12:04:50 -0300 
Subject: [PATCH 09/14] fix: render gomplate context as .json and emit heartbeat every 30s while waiting --- k8s/scope/networking/autocreate_alb | 13 +++++++----- k8s/scope/networking/wait_for_alb | 13 +++++++++++- k8s/scope/tests/networking/wait_for_alb.bats | 21 ++++++++++++++++++++ 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/k8s/scope/networking/autocreate_alb b/k8s/scope/networking/autocreate_alb index 0a24e88d..0c2180c4 100644 --- a/k8s/scope/networking/autocreate_alb +++ b/k8s/scope/networking/autocreate_alb @@ -113,9 +113,12 @@ render_dummy_ingress() { fi mkdir -p "$OUTPUT_DIR" - local tmp_context - tmp_context=$(mktemp) - trap 'rm -f "$tmp_context"' RETURN + # The context file MUST have a .json extension. gomplate uses the extension + # to pick the parser; a plain mktemp path is treated as an opaque string + # and the template fails with "can't evaluate field X in type string". + local context_path + context_path="$OUTPUT_DIR/ingress-dummy-${alb_name}-context.json" + trap 'rm -f "$context_path"' RETURN # build_context already exports $DOMAIN (and adds it to CONTEXT as # `base_domain` after we return). Inject the call-specific fields here. @@ -125,12 +128,12 @@ render_dummy_ingress() { --arg k8s_namespace "$namespace" \ --arg base_domain "$DOMAIN" \ '. + {alb_name: $alb_name, ingress_visibility: $ingress_visibility, k8s_namespace: $k8s_namespace, base_domain: $base_domain}' \ - > "$tmp_context" + > "$context_path" local template_path="${INGRESS_DUMMY_TEMPLATE:-$SERVICE_PATH/scope/templates/ingress-dummy.yaml.tpl}" local out_path="$OUTPUT_DIR/ingress-dummy-${alb_name}.yaml" - if ! gomplate -c .="$tmp_context" --file "$template_path" --out "$out_path"; then + if ! 
gomplate -c .="$context_path" --file "$template_path" --out "$out_path"; then log error "❌ Failed to render ingress-dummy template" log error "📋 Template: $template_path" exit 1 diff --git a/k8s/scope/networking/wait_for_alb b/k8s/scope/networking/wait_for_alb index 5af5c561..d4aa08bc 100644 --- a/k8s/scope/networking/wait_for_alb +++ b/k8s/scope/networking/wait_for_alb @@ -23,8 +23,11 @@ TIMEOUT_SECONDS=$(get_config_value \ --default "300" ) -deadline=$(($(date +%s) + TIMEOUT_SECONDS)) +started_at=$(date +%s) +deadline=$((started_at + TIMEOUT_SECONDS)) poll_interval=10 +heartbeat_interval=30 +last_heartbeat="$started_at" log info "⏳ Waiting up to ${TIMEOUT_SECONDS}s for ALB '$ALB_NAME' to become active..." @@ -53,6 +56,14 @@ while [ "$(date +%s)" -lt "$deadline" ]; do fi fi + # Heartbeat every ~30s so the operator sees progress on long waits instead + # of staring at a frozen line for the full timeout. + now=$(date +%s) + if [ $((now - last_heartbeat)) -ge "$heartbeat_interval" ]; then + log info "⏳ Still waiting for ALB '$ALB_NAME' to become active (${state:-pending}, $((now - started_at))s elapsed)" + last_heartbeat="$now" + fi + sleep "$poll_interval" done diff --git a/k8s/scope/tests/networking/wait_for_alb.bats b/k8s/scope/tests/networking/wait_for_alb.bats index aa53a927..4545f24a 100644 --- a/k8s/scope/tests/networking/wait_for_alb.bats +++ b/k8s/scope/tests/networking/wait_for_alb.bats @@ -109,6 +109,27 @@ mock_aws_state() { assert_contains "$output" " • Verify ALB quota: aws service-quotas get-service-quota --service-code elasticloadbalancing --quota-code L-53DA6B97" } +# ============================================================================= +# Heartbeat +# ============================================================================= +@test "wait_for_alb: emits heartbeat info log when the wait crosses the threshold" { + # Shrink both the poll interval and the heartbeat threshold so the test + # exercises the heartbeat path without sitting through 
real 30s intervals. + PATCHED_SCRIPT="$BATS_TEST_TMPDIR/wait_for_alb_patched" + sed -e 's/^poll_interval=10$/poll_interval=1/' \ + -e 's/^heartbeat_interval=30$/heartbeat_interval=1/' \ + "$SCRIPT" > "$PATCHED_SCRIPT" + + export ALB_AUTOCREATE_TIMEOUT_SECONDS="3" + mock_aws_state "provisioning" + + run bash -c "source '$PATCHED_SCRIPT'" + + # Times out as expected, but we should see at least one heartbeat info log. + assert_equal "$status" "1" + assert_contains "$output" "⏳ Still waiting for ALB 'test-alb' to become active (provisioning," +} + # ============================================================================= # Tagging on autocreate # ============================================================================= From 8a4c12f898750e574cf7db3f3e01080c22c1ece4 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 10 Jun 2026 13:30:45 -0300 Subject: [PATCH 10/14] fix: generate dummy ingress host via domain-generate so it matches the platform cert --- k8s/scope/networking/autocreate_alb | 42 +++++++++++++++++-- k8s/scope/templates/ingress-dummy.yaml.tpl | 6 +-- .../tests/networking/autocreate_alb.bats | 31 ++++++++++++++ 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/k8s/scope/networking/autocreate_alb b/k8s/scope/networking/autocreate_alb index 0c2180c4..517e606d 100644 --- a/k8s/scope/networking/autocreate_alb +++ b/k8s/scope/networking/autocreate_alb @@ -102,6 +102,38 @@ register_alb_in_provider() { fi } +# Builds the dummy ingress host via the same domain-generate binary the +# platform uses for scope domains. Substituting scopeSlug with the ALB name +# keeps the host inside whatever wildcard cert/DNS pattern the platform +# already maintains. 
+generate_dummy_host() { + local alb_name="$1" + + local account_slug namespace_slug application_slug + account_slug=$(echo "$CONTEXT" | jq -r '.account.slug') + namespace_slug=$(echo "$CONTEXT" | jq -r '.namespace.slug') + application_slug=$(echo "$CONTEXT" | jq -r '.application.slug') + + local host + host=$("$SERVICE_PATH/scope/networking/dns/domain/domain-generate" \ + --accountSlug="$account_slug" \ + --namespaceSlug="$namespace_slug" \ + --applicationSlug="$application_slug" \ + --scopeSlug="$alb_name" \ + --domain="$DOMAIN" \ + --useAccountSlug="${USE_ACCOUNT_SLUG:-false}") || { + log error "❌ Failed to generate dummy ingress host via domain-generate" + log error "💡 Possible causes:" + log error " The domain-generate binary returned an error" + log error "🔧 How to fix:" + log error " • Check the domain-generate binary exists: ls -la $SERVICE_PATH/scope/networking/dns/domain/domain-generate" + log error " • Verify the input slugs are valid" + exit 1 + } + + echo "$host" +} + render_dummy_ingress() { local alb_name="$1" local visibility="$2" @@ -113,6 +145,10 @@ render_dummy_ingress() { fi mkdir -p "$OUTPUT_DIR" + local dummy_host + dummy_host=$(generate_dummy_host "$alb_name") + log debug "📋 Dummy ingress host: $dummy_host" + # The context file MUST have a .json extension. gomplate uses the extension # to pick the parser; a plain mktemp path is treated as an opaque string # and the template fails with "can't evaluate field X in type string". @@ -120,14 +156,12 @@ render_dummy_ingress() { context_path="$OUTPUT_DIR/ingress-dummy-${alb_name}-context.json" trap 'rm -f "$context_path"' RETURN - # build_context already exports $DOMAIN (and adds it to CONTEXT as - # `base_domain` after we return). Inject the call-specific fields here. echo "$CONTEXT" | jq \ --arg alb_name "$alb_name" \ --arg ingress_visibility "$visibility" \ --arg k8s_namespace "$namespace" \ - --arg base_domain "$DOMAIN" \ - '. 
+ {alb_name: $alb_name, ingress_visibility: $ingress_visibility, k8s_namespace: $k8s_namespace, base_domain: $base_domain}' \ + --arg dummy_host "$dummy_host" \ + '. + {alb_name: $alb_name, ingress_visibility: $ingress_visibility, k8s_namespace: $k8s_namespace, dummy_host: $dummy_host}' \ > "$context_path" local template_path="${INGRESS_DUMMY_TEMPLATE:-$SERVICE_PATH/scope/templates/ingress-dummy.yaml.tpl}" diff --git a/k8s/scope/templates/ingress-dummy.yaml.tpl b/k8s/scope/templates/ingress-dummy.yaml.tpl index 2ded1e3c..cff2e154 100644 --- a/k8s/scope/templates/ingress-dummy.yaml.tpl +++ b/k8s/scope/templates/ingress-dummy.yaml.tpl @@ -8,9 +8,7 @@ metadata: nullplatform-autocreate: "true" alb_name: {{ .alb_name }} annotations: - alb.ingress.kubernetes.io/actions.response-404: >- - {"type":"fixed-response","fixedResponseConfig":{"contentType":"text/plain","statusCode":"404","messageBody":"404 - scope not found or has not been deployed yet"}} + alb.ingress.kubernetes.io/actions.response-404: '{"type":"fixed-response","fixedResponseConfig":{"contentType":"text/plain","statusCode":"404","messageBody":"404 scope not found or has not been deployed yet"}}' alb.ingress.kubernetes.io/group.name: {{ .alb_name }} alb.ingress.kubernetes.io/listen-ports: '[{"HTTP":80},{"HTTPS":443}]' alb.ingress.kubernetes.io/load-balancer-name: {{ .alb_name }} @@ -20,7 +18,7 @@ metadata: spec: ingressClassName: alb rules: - - host: {{ .alb_name }}.{{ .base_domain }} + - host: {{ .dummy_host }} http: paths: - path: / diff --git a/k8s/scope/tests/networking/autocreate_alb.bats b/k8s/scope/tests/networking/autocreate_alb.bats index f76bdd16..a54111be 100644 --- a/k8s/scope/tests/networking/autocreate_alb.bats +++ b/k8s/scope/tests/networking/autocreate_alb.bats @@ -257,3 +257,34 @@ teardown() { [ -f "$OUTPUT_DIR/ingress-dummy-${ALB_NAME}.yaml" ] } + +# ============================================================================= +# Dummy host generation +# 
============================================================================= +@test "autocreate_alb: derives dummy host via domain-generate using the ALB name as scopeSlug" { + # Run with the real domain-generate binary (no mock) — it's deterministic. + # Use a custom gomplate that writes the rendered output through so we can + # inspect the final host. + gomplate() { + local prev="" + for arg in "$@"; do + if [ "$prev" = "--out" ]; then OUT_PATH="$arg"; fi + if [ "$prev" = "-c" ]; then CTX_ARG="$arg"; fi + prev="$arg" + done + # Copy the context json next to the rendered file for assertion access + local ctx_file="${CTX_ARG#.=}" + cp "$ctx_file" "$OUTPUT_DIR/_rendered_context.json" + echo "rendered" > "$OUT_PATH" + return 0 + } + export -f gomplate + + source "$SCRIPT" + + # domain-generate produces "{namespace}-{application}-{scopeSlug}-{suffix}.nullapps.io" with the + # slugs taken from CONTEXT.namespace / .application and scopeSlug=$ALB_NAME. + local dummy_host + dummy_host=$(jq -r '.dummy_host' "$OUTPUT_DIR/_rendered_context.json") + [[ "$dummy_host" =~ ^ns-1-app-1-${ALB_NAME}-[a-z0-9]+\.nullapps\.io$ ]] +} From a9dbc6c24675f13eb458f9ecc8f105d15f765497 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 10 Jun 2026 13:43:37 -0300 Subject: [PATCH 11/14] fix: stable 30s heartbeat cadence and trap-safe context cleanup under set -u --- k8s/scope/networking/autocreate_alb | 6 ++++- k8s/scope/networking/wait_for_alb | 24 ++++++++++++-------- k8s/scope/tests/networking/wait_for_alb.bats | 17 ++++++++------ 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/k8s/scope/networking/autocreate_alb b/k8s/scope/networking/autocreate_alb index 517e606d..2d39adef 100644 --- a/k8s/scope/networking/autocreate_alb +++ b/k8s/scope/networking/autocreate_alb @@ -154,7 +154,11 @@ render_dummy_ingress() { # and the template fails with "can't evaluate field X in type string".
local context_path context_path="$OUTPUT_DIR/ingress-dummy-${alb_name}-context.json" - trap 'rm -f "$context_path"' RETURN + # Use double quotes so $context_path is baked into the trap string now. + # Single quotes would defer expansion until the trap fires on RETURN, by + # which point the local variable is out of scope and `set -u` would trip + # with "context_path: unbound variable". + trap "rm -f '$context_path'" RETURN echo "$CONTEXT" | jq \ --arg alb_name "$alb_name" \ diff --git a/k8s/scope/networking/wait_for_alb b/k8s/scope/networking/wait_for_alb index d4aa08bc..573f1f30 100644 --- a/k8s/scope/networking/wait_for_alb +++ b/k8s/scope/networking/wait_for_alb @@ -23,11 +23,15 @@ TIMEOUT_SECONDS=$(get_config_value \ --default "300" ) -started_at=$(date +%s) -deadline=$((started_at + TIMEOUT_SECONDS)) +deadline=$(($(date +%s) + TIMEOUT_SECONDS)) poll_interval=10 -heartbeat_interval=30 -last_heartbeat="$started_at" +polls_per_heartbeat=3 +# Counter-based heartbeat: every Nth poll we emit a progress log. Using a +# counter (instead of wall-clock diff) keeps the displayed elapsed time on +# clean 30s boundaries; otherwise the latency of each AWS call drifts the +# wall-clock heartbeat to odd numbers like 33s or 63s. +polls_since_heartbeat=0 +heartbeats_emitted=0 log info "⏳ Waiting up to ${TIMEOUT_SECONDS}s for ALB '$ALB_NAME' to become active..." @@ -56,12 +60,12 @@ while [ "$(date +%s)" -lt "$deadline" ]; do fi fi - # Heartbeat every ~30s so the operator sees progress on long waits instead - # of staring at a frozen line for the full timeout. 
- now=$(date +%s) - if [ $((now - last_heartbeat)) -ge "$heartbeat_interval" ]; then - log info "⏳ Still waiting for ALB '$ALB_NAME' to become active (${state:-pending}, $((now - started_at))s elapsed)" - last_heartbeat="$now" + polls_since_heartbeat=$((polls_since_heartbeat + 1)) + if [ "$polls_since_heartbeat" -ge "$polls_per_heartbeat" ]; then + heartbeats_emitted=$((heartbeats_emitted + 1)) + elapsed=$((heartbeats_emitted * polls_per_heartbeat * poll_interval)) + log info "⏳ Still waiting for ALB '$ALB_NAME' to become active (${state:-pending}, ~${elapsed}s elapsed)" + polls_since_heartbeat=0 fi sleep "$poll_interval" diff --git a/k8s/scope/tests/networking/wait_for_alb.bats b/k8s/scope/tests/networking/wait_for_alb.bats index 4545f24a..6fed1b3f 100644 --- a/k8s/scope/tests/networking/wait_for_alb.bats +++ b/k8s/scope/tests/networking/wait_for_alb.bats @@ -112,22 +112,25 @@ mock_aws_state() { # ============================================================================= # Heartbeat # ============================================================================= -@test "wait_for_alb: emits heartbeat info log when the wait crosses the threshold" { - # Shrink both the poll interval and the heartbeat threshold so the test - # exercises the heartbeat path without sitting through real 30s intervals. +@test "wait_for_alb: emits heartbeat info log on clean elapsed boundaries" { + # Shrink the poll interval and the polls-per-heartbeat so the test runs + # in seconds instead of full 30s intervals. With poll_interval=1 and + # polls_per_heartbeat=2, the first heartbeat fires after 2 polls and + # the displayed elapsed is 2*1=2s. 
PATCHED_SCRIPT="$BATS_TEST_TMPDIR/wait_for_alb_patched" sed -e 's/^poll_interval=10$/poll_interval=1/' \ - -e 's/^heartbeat_interval=30$/heartbeat_interval=1/' \ + -e 's/^polls_per_heartbeat=3$/polls_per_heartbeat=2/' \ "$SCRIPT" > "$PATCHED_SCRIPT" - export ALB_AUTOCREATE_TIMEOUT_SECONDS="3" + export ALB_AUTOCREATE_TIMEOUT_SECONDS="5" mock_aws_state "provisioning" run bash -c "source '$PATCHED_SCRIPT'" - # Times out as expected, but we should see at least one heartbeat info log. + # Times out as expected. We should see at least one heartbeat with a clean + # elapsed multiple (2s, 4s) — never odd numbers like 3s or 5s. assert_equal "$status" "1" - assert_contains "$output" "⏳ Still waiting for ALB 'test-alb' to become active (provisioning," + assert_contains "$output" "⏳ Still waiting for ALB 'test-alb' to become active (provisioning, ~2s elapsed)" } # ============================================================================= From 257a7bcdd8c4f1106cb722fb3956be8f3373a1bb Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 10 Jun 2026 16:04:27 -0300 Subject: [PATCH 12/14] fix: gate wait_for_alb on route53, validate inputs, dedupe AWS call, reconcile docs --- CHANGELOG.md | 2 +- k8s/docs/autocreate-alb.md | 54 +++++++++---------- k8s/scope/networking/autocreate_alb | 16 ++++++ k8s/scope/networking/resolve_balancer | 18 +++---- k8s/scope/networking/wait_for_alb | 15 ++++++ .../tests/networking/autocreate_alb.bats | 18 +++++++ k8s/scope/tests/networking/wait_for_alb.bats | 34 ++++++++++++ 7 files changed, 119 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dcfd3a75..14be5dfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] -- Add opt-in ALB autocreation for k8s scopes: when every declared ALB is at or above `ALB_MAX_CAPACITY`, the platform provisions a new ALB via a dummy Ingress, registers it in the container-orchestration provider, and uses it for the scope. Disabled by default; enable with `ALB_AUTOCREATE_ENABLED=true`. Requires additional AWS permission: `elasticloadbalancing:AddTags` (audit tagging) +- Add opt-in ALB autocreation for k8s scopes: when every declared ALB is at or above `ALB_MAX_CAPACITY`, the platform provisions a new ALB via a dummy Ingress, registers it in the container-orchestration provider via `np provider patch`, and uses it for the scope. Disabled by default; enable with `ALB_AUTOCREATE_ENABLED=true`. Requires write access to the nullplatform API (via `NP_TOKEN` or `NULLPLATFORM_API_KEY`) and the additional AWS permission `elasticloadbalancing:AddTags` for audit tagging ## [1.12.0] - 2026-06-08 - Fix: do not inject file parameter as env vars diff --git a/k8s/docs/autocreate-alb.md b/k8s/docs/autocreate-alb.md index bac8bb3d..a800d2a0 100644 --- a/k8s/docs/autocreate-alb.md +++ b/k8s/docs/autocreate-alb.md @@ -8,7 +8,7 @@ The flow only triggers when **all** of the following are true: - `ALB_AUTOCREATE_ENABLED=true` in `values.yaml` or in the `container-orchestration` provider. - `DNS_TYPE=route53` (autocreation requires the same load-balancing path used by Route53 scopes). -- Every candidate ALB in the pool (declared base + additional balancers + previously autocreated ALBs discovered by tag) reports a rule count `>= ALB_MAX_CAPACITY`. +- Every candidate ALB in the pool (base + additional balancers declared in the provider) reports a rule count `>= ALB_MAX_CAPACITY`. - The scope being created does not already have a Route53 record (a scope being recreated reuses its existing ALB and does not trigger autocreation). If any candidate is below the threshold, the scope creation uses that candidate and the autocreate path is not taken. 
@@ -18,56 +18,56 @@ If any candidate is below the threshold, the scope creation uses that candidate | Key | Default | Description | |---|---|---| | `ALB_AUTOCREATE_ENABLED` | `false` | Master switch. When `false`, behavior is identical to previous releases. | -| `ALB_AUTOCREATE_NAME_PREFIX` | `nullplatform-auto-` | Prefix for autocreated ALB names. Final name format: `-<6 hex chars>`. Total length must stay below the AWS 32-character ALB name limit. | -| `ALB_AUTOCREATE_TIMEOUT_SECONDS` | `300` | How long the script polls AWS for the new ALB to reach `state=active` before failing the scope creation. The AWS Load Balancer Controller usually takes 2–4 minutes. | +| `ALB_AUTOCREATE_NAME_PREFIX` | `nullplatform-auto-` | Prefix for autocreated ALB names. Final name format: `-<6 hex chars>`. Must match `^[a-z0-9-]+$` and be ≤18 chars so the rendered name stays under the AWS 32-char ALB name limit. | +| `ALB_AUTOCREATE_TIMEOUT_SECONDS` | `300` | How long `wait_for_alb` polls AWS for the new ALB to reach `state=active` before failing the scope creation. The AWS Load Balancer Controller usually takes 2–4 minutes. Must be a positive integer. | All three keys are also readable from `providers.container-orchestration.balancer.{autocreate_enabled, autocreate_name_prefix, autocreate_timeout_seconds}`. ## How it works -1. `resolve_balancer` evaluates the candidate pool (declared + tag-discovered ALBs) and picks the least-loaded one as today. +1. `resolve_balancer` evaluates the candidate pool — the base ALB plus the `additional_public_names` / `additional_private_names` list declared in the `container-orchestration` provider — and picks the least-loaded one. 2. If that candidate's rule count is at or above `ALB_MAX_CAPACITY` and `ALB_AUTOCREATE_ENABLED=true`, `resolve_balancer` sources `autocreate_alb`. -3. `autocreate_alb` generates a unique ALB name, renders `scope/templates/ingress-dummy.yaml.tpl`, and applies it. 
The AWS Load Balancer Controller picks up the Ingress and provisions the ALB. -4. The script polls `aws elbv2 describe-load-balancers` every 10 seconds until the ALB reports `State.Code=active` (or `failed`/timeout, in which case the scope creation fails). -5. Once active, the script tags the ALB with: - - `nullplatform:managed-by=autocreate` - - `nullplatform:visibility=internet-facing|internal` - - `nullplatform:created-by-scope-id=` -6. `resolve_balancer` substitutes the new ALB name and the rest of the scope creation proceeds. +3. `autocreate_alb` generates a unique ALB name (`-<6 hex>`) and **patches the container-orchestration provider via `np provider patch`** to append the new name to `additional_public_names` or `additional_private_names` (visibility-dependent). The provider is the authoritative registry of ALBs the platform uses. +4. `autocreate_alb` renders `scope/templates/ingress-dummy.yaml.tpl` into `$OUTPUT_DIR/ingress-dummy-.yaml`. The dummy Ingress carries `alb.ingress.kubernetes.io/group.name=` and `alb.ingress.kubernetes.io/load-balancer-name=`, which is what makes the AWS Load Balancer Controller materialize the ALB once the file is applied. +5. The workflow step `apply autocreated ingress` (in `k8s/scope/workflows/create.yaml`) applies whatever templates are in `$OUTPUT_DIR` via the standard `apply_templates` script. Its `post: wait for alb` runs `wait_for_alb`, which polls `aws elbv2 describe-load-balancers` every 10 seconds until the ALB reports `State.Code=active` (or `failed`/timeout, in which case the scope creation fails). An info-level heartbeat is emitted every ~30s so the operator can see progress. +6. Once active, `wait_for_alb` tags the ALB with `nullplatform:managed-by=autocreate`, `nullplatform:visibility=internet-facing|internal`, and `nullplatform:created-by-scope-id=`. **These tags are audit metadata only**, surfacing the lineage of which scope provisioned which ALB. Discovery does NOT depend on these tags. +7. 
The rest of the scope creation proceeds with `ALB_NAME` set to the new ALB. -## Discovery of previously autocreated ALBs +## How concurrent scope creations behave -Every scope creation queries `resourcegroupstaggingapi:get-resources` for ALBs tagged `nullplatform:managed-by=autocreate` matching the scope's visibility. Discovered ALBs are merged into the candidate pool without any provider configuration change, so a single autocreated ALB serves many subsequent scopes before another autocreation is needed. +When scope A triggers autocreate, the provider is patched **before** the ALB is active. Scope B that starts during this window reads the provider list, sees the new ALB name, and treats it as a normal candidate. AWS will return `LoadBalancerNotFound` for the in-flight ALB during the few seconds before it shows up in the API; `resolve_balancer` interprets that error specifically as "0 rules" so the in-flight ALB wins least-loaded selection in scope B and no second autocreate fires. Scope B then waits on the same ALB via its own `wait_for_alb` step. -Discovery runs regardless of `ALB_AUTOCREATE_ENABLED`: even if the flag is later turned off, previously autocreated ALBs remain usable. +## Required permissions -## Required AWS permissions +In addition to the permissions already required for capacity validation, the autocreate path needs: -In addition to the permissions already required for capacity validation, the agent role needs: +**Nullplatform API credentials.** The script calls `np provider list` and `np provider patch`, so the workflow environment must provide either `NP_TOKEN` or `NULLPLATFORM_API_KEY` with write access to the container-orchestration provider for the relevant NRN. Without these, the patch step fails with `❌ Failed to patch container-orchestration provider with new ALB`. -- `elasticloadbalancing:AddTags` — to tag the new ALB so discovery can find it. 
-- `elasticloadbalancing:DescribeTags` — for the discovery path (covered by capacity validation in most agents, listed here for completeness). -- `tag:GetResources` — for the `resourcegroupstaggingapi` call used by discovery. +**AWS IAM (agent role).** -The dummy Ingress requires no new K8s permissions beyond those the agent already has for scope resources. +- `elasticloadbalancing:AddTags` — for the audit tags `wait_for_alb` applies once the ALB is active. Failure here is non-fatal (logged as a warning, the scope creation proceeds). + +No new Kubernetes permissions are needed beyond those the agent already has for scope resources. ## Operational notes -- Scope creations that trigger autocreation are slower (typically 2–4 minutes extra). This is the expected behavior, not a regression. The platform logs `🔧 All candidate ALBs are at or above capacity (...); triggering autocreate` when it happens. -- The dummy Ingress (`nullplatform-autocreate-`) is created in the scope's namespace. It exposes no traffic and exists only to keep the ALB alive. Deleting it manually will cause the AWS Load Balancer Controller to delete the ALB. -- The ALB is registered through AWS tags rather than through the nullplatform provider configuration. Two consequences: - 1. The nullplatform provider object does not need to be updated by the script; this avoids requiring API credentials inside the scope workflow. +- Scope creations that trigger autocreation are slower (typically 2–4 minutes extra). This is the expected behavior, not a regression. The platform logs `🔧 Best candidate ALB '...' is at or above capacity (X/Y); triggering autocreate` when it happens, followed by `⏳ Still waiting for ALB '...' to become active (provisioning, ~30s elapsed)` heartbeats while the controller provisions. +- The dummy Ingress (`nullplatform-autocreate-`) is created in the scope's namespace. 
It exposes no real traffic — the rule points to a fixed `404` response via the standard `alb.ingress.kubernetes.io/actions.response-404` annotation — and exists only to keep the ALB alive in the eyes of the AWS Load Balancer Controller. Deleting the dummy Ingress will cause the controller to delete the ALB. +- The ALB is registered in the nullplatform provider (not in the customer's IaC). Two consequences: + 1. The provider becomes the source of truth for the ALB pool; subsequent scope creations read it directly. 2. The cloud's IaC (Terraform, OpenTofu, CloudFormation) is **not** updated automatically. If your IaC is the source of truth for ALB inventory, you should reconcile autocreated ALBs into it through your own process. ## Failure modes | Failure | Outcome | |---|---| -| Dummy Ingress template render fails | Scope creation exits 1 with `Failed to render ingress-dummy template`. | -| `kubectl apply` fails | Scope creation exits 1 with `Failed to apply ingress-dummy` and prints the namespace check hint. | +| `ALB_AUTOCREATE_NAME_PREFIX` invalid (bad charset or >18 chars) | Scope creation exits 1 with the validation error before any AWS or provider call. | +| `np provider list` cannot find a container-orchestration provider for the NRN | Scope creation exits 1 with `❌ No container-orchestration provider found for NRN ''`. | +| `np provider patch` fails (no API token / no write access) | Scope creation exits 1 with `❌ Failed to patch container-orchestration provider with new ALB` + hint about `NP_TOKEN` / `NULLPLATFORM_API_KEY`. | +| `gomplate` render of the dummy Ingress fails | Scope creation exits 1 with `❌ Failed to render ingress-dummy template`. | | ALB never reaches `active` within `ALB_AUTOCREATE_TIMEOUT_SECONDS` | Scope creation exits 1; check controller logs and AWS quota for ALBs in the region. | | AWS reports the ALB state as `failed` | Scope creation exits 1 immediately. 
| -| `AddTags` call fails (no IAM permission) | Logged as `⚠️ Could not tag ALB; subsequent discovery may not find it`. The scope creation continues; the next creation will not find this ALB by tag and may autocreate another one. | +| `AddTags` call fails (no IAM permission) | Logged as `⚠️ Could not tag ALB '' (audit only — provider registration already succeeded)`. The scope creation continues; the tags are documentation only. | ## What is out of scope diff --git a/k8s/scope/networking/autocreate_alb b/k8s/scope/networking/autocreate_alb index 2d39adef..935904dc 100644 --- a/k8s/scope/networking/autocreate_alb +++ b/k8s/scope/networking/autocreate_alb @@ -190,6 +190,22 @@ NAME_PREFIX=$(get_config_value \ --default "nullplatform-auto-" ) +# Final ALB name is "-<6 hex>". AWS rejects names that +# exceed 32 chars or contain anything outside [a-zA-Z0-9-]. If an invalid name +# slips through, the AWS Load Balancer Controller silently refuses to create +# the ALB and wait_for_alb hangs to timeout with an opaque error. Catch it +# here with a clear message instead. +if ! 
[[ "$NAME_PREFIX" =~ ^[a-z0-9-]+$ ]]; then + log error "❌ ALB_AUTOCREATE_NAME_PREFIX must match ^[a-z0-9-]+$, got: '$NAME_PREFIX'" + exit 1 +fi + +# 14 = len("private-") + 6 hex chars; total must stay ≤32 (AWS ALB name limit) +if [ "${#NAME_PREFIX}" -gt 18 ]; then + log error "❌ ALB_AUTOCREATE_NAME_PREFIX must be ≤18 chars (AWS caps ALB names at 32, the visibility+hex suffix uses 14); got ${#NAME_PREFIX}" + exit 1 +fi + NEW_ALB_NAME=$(generate_alb_name "$NAME_PREFIX" "$INGRESS_VISIBILITY") log info "🔧 Autocreating ALB '$NEW_ALB_NAME' (visibility=$INGRESS_VISIBILITY)" diff --git a/k8s/scope/networking/resolve_balancer b/k8s/scope/networking/resolve_balancer index e2ef333f..88e46bb4 100755 --- a/k8s/scope/networking/resolve_balancer +++ b/k8s/scope/networking/resolve_balancer @@ -37,16 +37,20 @@ _RESOLVE_BALANCER_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" get_alb_rule_count() { local alb_name="$1" - local alb_arn aws_stderr aws_exit - aws_stderr=$(aws elbv2 describe-load-balancers \ + # Single AWS call: capture stdout+stderr together. On success stdout has the + # ARN; on failure stderr has the error (we case-match LoadBalancerNotFound + # for the autocreate race). Avoids two `describe-load-balancers` calls per + # candidate per scope creation. + local alb_arn aws_exit + alb_arn=$(aws elbv2 describe-load-balancers \ --names "$alb_name" \ --region "$REGION" \ --query 'LoadBalancers[0].LoadBalancerArn' \ - --output text 2>&1 >/dev/null) + --output text 2>&1) aws_exit=$? if [ "$aws_exit" -ne 0 ]; then - case "$aws_stderr" in + case "$alb_arn" in *LoadBalancerNotFound*) # Concurrent autocreate race: the ALB was just registered in the # provider by another scope but is not yet visible to AWS APIs. 
Return @@ -61,12 +65,6 @@ get_alb_rule_count() { return 1 fi - alb_arn=$(aws elbv2 describe-load-balancers \ - --names "$alb_name" \ - --region "$REGION" \ - --query 'LoadBalancers[0].LoadBalancerArn' \ - --output text 2>/dev/null) - if [ -z "$alb_arn" ] || [ "$alb_arn" = "None" ]; then return 1 fi diff --git a/k8s/scope/networking/wait_for_alb b/k8s/scope/networking/wait_for_alb index 573f1f30..dda06a02 100644 --- a/k8s/scope/networking/wait_for_alb +++ b/k8s/scope/networking/wait_for_alb @@ -17,12 +17,27 @@ # ALB_AUTOCREATE_TIMEOUT_SECONDS - Max seconds to wait (default 300) # CONTEXT - For scope-id in the tag value +# Step gating: only block on ALB readiness for the path that actually relies +# on it — route53 scopes that manage ALBs via the AWS Load Balancer Controller. +# Other DNS types (azure, external_dns) provision a different networking stack +# and would never see the ALB transition to `active`, so the wait would always +# time out and fail the scope creation. +if [ "${DNS_TYPE:-}" != "route53" ]; then + log debug "📋 DNS type is '${DNS_TYPE:-unset}', skipping ALB active-state wait" + return 0 2>/dev/null || exit 0 +fi + TIMEOUT_SECONDS=$(get_config_value \ --env ALB_AUTOCREATE_TIMEOUT_SECONDS \ --provider '.providers["container-orchestration"].balancer.autocreate_timeout_seconds' \ --default "300" ) +if ! 
[[ "$TIMEOUT_SECONDS" =~ ^[1-9][0-9]*$ ]]; then + log error "❌ ALB_AUTOCREATE_TIMEOUT_SECONDS must be a positive integer, got: '$TIMEOUT_SECONDS'" + exit 1 +fi + deadline=$(($(date +%s) + TIMEOUT_SECONDS)) poll_interval=10 polls_per_heartbeat=3 diff --git a/k8s/scope/tests/networking/autocreate_alb.bats b/k8s/scope/tests/networking/autocreate_alb.bats index a54111be..2f0f7d06 100644 --- a/k8s/scope/tests/networking/autocreate_alb.bats +++ b/k8s/scope/tests/networking/autocreate_alb.bats @@ -261,6 +261,24 @@ teardown() { # ============================================================================= # Dummy host generation # ============================================================================= +@test "autocreate_alb: rejects ALB_AUTOCREATE_NAME_PREFIX with invalid chars (uppercase)" { + export ALB_AUTOCREATE_NAME_PREFIX="Bad-Prefix-" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB_AUTOCREATE_NAME_PREFIX must match ^[a-z0-9-]+\$, got: 'Bad-Prefix-'" +} + +@test "autocreate_alb: rejects ALB_AUTOCREATE_NAME_PREFIX longer than 18 chars" { + export ALB_AUTOCREATE_NAME_PREFIX="this-prefix-is-far-too-long-" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB_AUTOCREATE_NAME_PREFIX must be ≤18 chars" +} + @test "autocreate_alb: derives dummy host via domain-generate using the ALB name as scopeSlug" { # Run with the real domain-generate binary (no mock) — it's deterministic. 
# Use a custom gomplate that writes the rendered output through so we can diff --git a/k8s/scope/tests/networking/wait_for_alb.bats b/k8s/scope/tests/networking/wait_for_alb.bats index 6fed1b3f..16118154 100644 --- a/k8s/scope/tests/networking/wait_for_alb.bats +++ b/k8s/scope/tests/networking/wait_for_alb.bats @@ -17,6 +17,7 @@ setup() { export REGION="us-east-1" export ALB_NAME="test-alb" export INGRESS_VISIBILITY="internet-facing" + export DNS_TYPE="route53" export ALB_AUTOCREATE_TIMEOUT_SECONDS="2" export CONTEXT='{ @@ -161,6 +162,39 @@ mock_aws_state() { [[ "$output" != *"Could not tag ALB"* ]] } +@test "wait_for_alb: early returns for DNS_TYPE != route53 without polling" { + export DNS_TYPE="external_dns" + # If the polling code ran, the default `aws()` returning 1 would loop until + # timeout (2s) and exit 1. The early-return must skip that entirely. + aws() { echo "AWS SHOULD NOT BE CALLED" >&2; return 1; } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "0" + assert_contains "$output" "📋 DNS type is 'external_dns', skipping ALB active-state wait" + [[ "$output" != *"AWS SHOULD NOT BE CALLED"* ]] + [[ "$output" != *"Waiting up to"* ]] +} + +@test "wait_for_alb: exits with validation error when ALB_AUTOCREATE_TIMEOUT_SECONDS is non-numeric" { + export ALB_AUTOCREATE_TIMEOUT_SECONDS="not-a-number" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB_AUTOCREATE_TIMEOUT_SECONDS must be a positive integer, got: 'not-a-number'" +} + +@test "wait_for_alb: exits with validation error when ALB_AUTOCREATE_TIMEOUT_SECONDS is zero" { + export ALB_AUTOCREATE_TIMEOUT_SECONDS="0" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB_AUTOCREATE_TIMEOUT_SECONDS must be a positive integer, got: '0'" +} + @test "wait_for_alb: tag failure logs full warn message but exits 0" { export ALB_AUTOCREATED="true" local 
arn="arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/$ALB_NAME/abc" From a078c5167b76c876ae7535b3d5ab93cf4b322070 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 10 Jun 2026 17:33:27 -0300 Subject: [PATCH 13/14] docs(changelog): shorten autocreate entry to one-liner client summary --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14be5dfd..443e82bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] -- Add opt-in ALB autocreation for k8s scopes: when every declared ALB is at or above `ALB_MAX_CAPACITY`, the platform provisions a new ALB via a dummy Ingress, registers it in the container-orchestration provider via `np provider patch`, and uses it for the scope. Disabled by default; enable with `ALB_AUTOCREATE_ENABLED=true`. 
Requires write access to the nullplatform API (via `NP_TOKEN` or `NULLPLATFORM_API_KEY`) and the additional AWS permission `elasticloadbalancing:AddTags` for audit tagging +- Add support to auto-create ALBs on scope create ## [1.12.0] - 2026-06-08 - Fix: do not inject file parameter as env vars From 47c98c29c56886cedc409d9fe73970eb53769c48 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Tue, 16 Jun 2026 12:42:44 -0300 Subject: [PATCH 14/14] feat(wait_for_alb): name the missing IAM permission in tag-failure warn --- k8s/scope/networking/wait_for_alb | 8 +++++--- k8s/scope/tests/networking/wait_for_alb.bats | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/k8s/scope/networking/wait_for_alb b/k8s/scope/networking/wait_for_alb index dda06a02..0909f39a 100644 --- a/k8s/scope/networking/wait_for_alb +++ b/k8s/scope/networking/wait_for_alb @@ -97,8 +97,9 @@ if [ "$state" != "active" ]; then fi # Audit tags — only on the scope that triggered the autocreate, so the cloud -# carries the lineage of which scope created which ALB. Discovery does not -# depend on these tags. +# carries the lineage of which scope created which ALB. Failure is non-fatal: +# the provider registration (the authoritative source) already succeeded; the +# tags are documentation only. if [ "${ALB_AUTOCREATED:-false}" = "true" ]; then scope_id=$(echo "$CONTEXT" | jq -r '.scope.id // "unknown"') if ! 
aws elbv2 add-tags \ @@ -109,7 +110,8 @@ if [ "${ALB_AUTOCREATED:-false}" = "true" ]; then "Key=nullplatform:visibility,Value=$INGRESS_VISIBILITY" \ "Key=nullplatform:created-by-scope-id,Value=$scope_id" \ >/dev/null 2>&1; then - log warn "⚠️ Could not tag ALB '$ALB_NAME' (audit only — provider registration already succeeded)" + log warn "⚠️ Could not tag ALB '$ALB_NAME' (audit only — provider registration already succeeded, continuing)" + log warn "💡 The agent role needs the IAM permission 'elasticloadbalancing:AddTags' on Application Load Balancers in region '$REGION' to write the audit tags (nullplatform:managed-by, nullplatform:visibility, nullplatform:created-by-scope-id)" else log debug "📋 Tagged ALB '$ALB_NAME' with managed-by=autocreate" fi diff --git a/k8s/scope/tests/networking/wait_for_alb.bats b/k8s/scope/tests/networking/wait_for_alb.bats index 16118154..db2492c8 100644 --- a/k8s/scope/tests/networking/wait_for_alb.bats +++ b/k8s/scope/tests/networking/wait_for_alb.bats @@ -195,7 +195,7 @@ mock_aws_state() { assert_contains "$output" "❌ ALB_AUTOCREATE_TIMEOUT_SECONDS must be a positive integer, got: '0'" } -@test "wait_for_alb: tag failure logs full warn message but exits 0" { +@test "wait_for_alb: tag failure logs warn with required IAM permission but exits 0" { export ALB_AUTOCREATED="true" local arn="arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/app/$ALB_NAME/abc" eval "aws() { @@ -214,5 +214,6 @@ mock_aws_state() { assert_equal "$status" "0" assert_contains "$output" "✅ ALB 'test-alb' is active" - assert_contains "$output" "⚠️ Could not tag ALB 'test-alb' (audit only — provider registration already succeeded)" + assert_contains "$output" "⚠️ Could not tag ALB 'test-alb' (audit only — provider registration already succeeded, continuing)" + assert_contains "$output" "💡 The agent role needs the IAM permission 'elasticloadbalancing:AddTags' on Application Load Balancers in region 'us-east-1' to write the audit tags (nullplatform:managed-by, 
nullplatform:visibility, nullplatform:created-by-scope-id)" }