From 172bc473a0f3d1d4f9019db80ae63621ce5d0ff6 Mon Sep 17 00:00:00 2001 From: Gabriel Eisbruch Date: Thu, 16 Apr 2026 19:44:40 -0300 Subject: [PATCH 01/56] Add kubectl helper command --- k8s/diagnose/tests/kubectl_get.bats | 371 ++++++++++++++++++++++++++++ k8s/kubectl_get | 166 +++++++++++++ 2 files changed, 537 insertions(+) create mode 100644 k8s/diagnose/tests/kubectl_get.bats create mode 100755 k8s/kubectl_get diff --git a/k8s/diagnose/tests/kubectl_get.bats b/k8s/diagnose/tests/kubectl_get.bats new file mode 100644 index 00000000..0057714a --- /dev/null +++ b/k8s/diagnose/tests/kubectl_get.bats @@ -0,0 +1,371 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for kubectl_get - read-only kubectl wrapper for troubleshooting +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + log() { if [ "$1" = "error" ]; then echo "$2" >&2; else echo "$2"; fi; } + export -f log + + export SCRIPT="$PROJECT_ROOT/k8s/kubectl_get" + export K8S_NAMESPACE="default-ns" + + # Mock kubectl: echo back what was received so tests can assert the args. 
+ kubectl() { + echo "kubectl-called: $*" + return 0 + } + export -f kubectl +} + +teardown() { + unset -f kubectl log + unset K8S_NAMESPACE SCRIPT PROJECT_ROOT +} + +# ============================================================================= +# Usage +# ============================================================================= +@test "kubectl_get: shows usage and exits 1 when no args provided" { + run bash "$SCRIPT" + + [ "$status" -eq 1 ] + assert_contains "$output" "Usage:" + assert_contains "$output" "kubectl get" +} + +# ============================================================================= +# Hardcoded verb: only 'get' can be invoked +# ============================================================================= +@test "kubectl_get: invokes kubectl with 'get' verb followed by user args" { + run bash "$SCRIPT" pods -o wide + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods -o wide" +} + +# ============================================================================= +# Default namespace injection +# ============================================================================= +@test "kubectl_get: injects K8S_NAMESPACE when no namespace flag provided" { + run bash "$SCRIPT" pods + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods -n default-ns" +} + +@test "kubectl_get: does not inject namespace when -n is provided" { + run bash "$SCRIPT" pods -n kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods -n kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_get: does not inject namespace when --namespace is provided" { + run bash "$SCRIPT" pods --namespace kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods --namespace kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_get: does not inject namespace when --namespace=value form is provided" { + run bash "$SCRIPT" pods 
--namespace=kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods --namespace=kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_get: does not inject namespace when -A is provided" { + run bash "$SCRIPT" pods -A + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods -A" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_get: does not inject namespace when --all-namespaces is provided" { + run bash "$SCRIPT" pods --all-namespaces + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods --all-namespaces" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_get: does not inject namespace when K8S_NAMESPACE is unset" { + unset K8S_NAMESPACE + + run bash "$SCRIPT" pods + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods" + [[ "$output" != *"-n "* ]] +} + +# ============================================================================= +# Blocked flags +# ============================================================================= +@test "kubectl_get: rejects --server" { + run bash "$SCRIPT" pods --server https://evil.example.com + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--server'" +} + +@test "kubectl_get: rejects --server=value form" { + run bash "$SCRIPT" pods --server=https://evil.example.com + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--server=https://evil.example.com'" +} + +@test "kubectl_get: rejects --kubeconfig" { + run bash "$SCRIPT" pods --kubeconfig /tmp/evil.yaml + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--kubeconfig'" +} + +@test "kubectl_get: rejects --token" { + run bash "$SCRIPT" pods --token abc123 + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--token'" +} + +@test "kubectl_get: rejects --as (impersonation)" { + run bash "$SCRIPT" pods --as cluster-admin + + [ "$status" -eq 1 ] + assert_contains 
"$output" "Refusing argument '--as'" +} + +@test "kubectl_get: rejects --as-group" { + run bash "$SCRIPT" pods --as-group system:masters + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--as-group'" +} + +@test "kubectl_get: rejects --context" { + run bash "$SCRIPT" pods --context other-cluster + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--context'" +} + +@test "kubectl_get: rejects --insecure-skip-tls-verify" { + run bash "$SCRIPT" pods --insecure-skip-tls-verify + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--insecure-skip-tls-verify'" +} + +@test "kubectl_get: rejects -w (avoid hangs)" { + run bash "$SCRIPT" pods -w + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '-w'" +} + +@test "kubectl_get: rejects --watch" { + run bash "$SCRIPT" pods --watch + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--watch'" +} + +@test "kubectl_get: blocked flag in middle of args is still detected" { + run bash "$SCRIPT" pods -n my-ns --token abc123 -o yaml + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--token'" +} + +# ============================================================================= +# Shell injection safety +# ============================================================================= +@test "kubectl_get: passes args verbatim — no shell interpretation of metachars" { + # If any of these metachars were interpreted by a shell, kubectl would + # never see them as part of a single arg. Mock echoes args back as-is. 
+ run bash "$SCRIPT" pods -l 'app=foo;bar|baz`whoami`' + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods -l app=foo;bar|baz\`whoami\` -n default-ns" +} + +# ============================================================================= +# Exit code propagation +# ============================================================================= +@test "kubectl_get: propagates kubectl exit code on failure" { + kubectl() { + echo "Error from server (NotFound): pods 'foo' not found" >&2 + return 1 + } + export -f kubectl + + run bash "$SCRIPT" pods foo + + [ "$status" -eq 1 ] +} + +# ============================================================================= +# Secret content stripping +# ============================================================================= +# Mock that returns realistic secret JSON when invoked with secret + -o json. +mock_kubectl_with_secrets() { + kubectl() { + if [[ "$*" == *"secret"* && "$*" == *"-o json"* ]]; then + # Single secret (when name is in args) returns object; otherwise list. 
+ if [[ "$*" == *"secret foo"* || "$*" == *"secret/foo"* ]]; then + cat <<'EOF' +{ + "metadata": {"name": "foo", "namespace": "default-ns"}, + "type": "Opaque", + "data": {"password": "c3VwZXJzZWNyZXQ="}, + "stringData": {"plain": "alsosecret"} +} +EOF + else + cat <<'EOF' +{ + "items": [ + { + "metadata": {"name": "foo", "namespace": "default-ns"}, + "type": "Opaque", + "data": {"password": "c3VwZXJzZWNyZXQ="}, + "stringData": {"plain": "alsosecret"} + } + ] +} +EOF + fi + return 0 + fi + echo "kubectl-called: $*" + } + export -f kubectl +} + +@test "kubectl_get: strips .data and .stringData from secret list output" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secrets + + [ "$status" -eq 0 ] + # Metadata still present + assert_contains "$output" "\"name\": \"foo\"" + assert_contains "$output" "\"type\": \"Opaque\"" + # Sensitive content gone + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] + [[ "$output" != *"alsosecret"* ]] + [[ "$output" != *"\"data\""* ]] + [[ "$output" != *"\"stringData\""* ]] +} + +@test "kubectl_get: strips .data and .stringData from single secret output" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secret foo + + [ "$status" -eq 0 ] + assert_contains "$output" "\"name\": \"foo\"" + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] + [[ "$output" != *"alsosecret"* ]] + [[ "$output" != *"\"data\""* ]] +} + +@test "kubectl_get: works for 'secret' (singular) resource name" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secret + + [ "$status" -eq 0 ] + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] +} + +@test "kubectl_get: works for secret/name slash form" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secret/foo + + [ "$status" -eq 0 ] + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] +} + +@test "kubectl_get: works for secret,configmap comma form" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secret,configmap + + [ "$status" -eq 0 ] + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] +} + +@test "kubectl_get: forces -o json when user requested -o 
yaml on secrets" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secrets -o yaml + + [ "$status" -eq 0 ] + assert_contains "$output" "Output forced to JSON" + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] +} + +@test "kubectl_get: rejects -o jsonpath on secrets" { + run bash "$SCRIPT" secrets -o "jsonpath={.items[*].data.password}" + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing -o" + assert_contains "$output" "jsonpath" +} + +@test "kubectl_get: rejects -o go-template on secrets" { + run bash "$SCRIPT" secrets -o "go-template={{.items}}" + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing -o" + assert_contains "$output" "go-template" +} + +@test "kubectl_get: rejects -o custom-columns on secrets" { + run bash "$SCRIPT" secrets -o "custom-columns=NAME:.metadata.name,DATA:.data" + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing -o" + assert_contains "$output" "custom-columns" +} + +@test "kubectl_get: rejects --output=jsonpath= on secrets" { + run bash "$SCRIPT" secrets --output="jsonpath={.items[*].data}" + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing -o" +} + +@test "kubectl_get: secret filtering does not affect non-secret resources" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" pods -o yaml + + [ "$status" -eq 0 ] + # Goes through the normal (non-filtered) path: mock echoes args. + assert_contains "$output" "kubectl-called: get pods -o yaml -n default-ns" +} + +@test "kubectl_get: propagates kubectl failure exit code through jq pipe" { + kubectl() { + echo "Error from server (Forbidden)" >&2 + return 1 + } + export -f kubectl + + run bash "$SCRIPT" secrets + + [ "$status" -eq 1 ] +} diff --git a/k8s/kubectl_get b/k8s/kubectl_get new file mode 100755 index 00000000..7f55e687 --- /dev/null +++ b/k8s/kubectl_get @@ -0,0 +1,166 @@ +#!/bin/bash +# Read-only kubectl wrapper for troubleshooting. 
+# Hardcodes `kubectl get` so no other verb can be invoked, and rejects flags +# that change auth/server/context or could hang the script. When no namespace +# flag is supplied, defaults to $K8S_NAMESPACE so the agent can target the +# scope's namespace without repeating it. +# +# When 'secret' / 'secrets' is in the resource args, output is forced to JSON +# and piped through jq to strip .data and .stringData (the only fields that +# carry secret values). Output formats that can extract those fields directly +# (jsonpath, go-template, custom-columns) are rejected for secret queries. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [[ "$(type -t log 2>/dev/null)" != "function" ]]; then source "$SCRIPT_DIR/logging"; fi + +usage() { + cat >&2 < + +Runs 'kubectl get' with the provided arguments. Read-only: any other verb +or auth/context override is rejected. Secret values are stripped from output. + +Examples: + $(basename "$0") pods + $(basename "$0") pods -l app=foo -o yaml + $(basename "$0") events --sort-by=.lastTimestamp -n kube-system + $(basename "$0") nodes -A + $(basename "$0") secrets # data fields stripped +EOF +} + +if [[ $# -eq 0 ]]; then + usage + exit 1 +fi + +BLOCKED_FLAGS=( + --server + --kubeconfig + --token + --as + --as-group + --as-uid + --certificate-authority + --client-certificate + --client-key + --username + --password + --user + --cluster + --context + --insecure-skip-tls-verify + -w + --watch + --watch-only +) + +is_blocked() { + local flag_name="${1%%=*}" + for blocked in "${BLOCKED_FLAGS[@]}"; do + if [[ "$flag_name" == "$blocked" ]]; then + return 0 + fi + done + return 1 +} + +# True if any arg references the 'secret' / 'secrets' resource. Handles +# comma-separated resources (secret,configmap), slash form (secret/foo) +# and apiVersion form (secrets.v1). Uppercase tolerated since kubectl is +# case-insensitive on resource names. 
+involves_secrets() { + local arg lower token res + for arg in "$@"; do + lower="${arg,,}" + local IFS=, + for token in $lower; do + res="${token%%/*}" + res="${res%%.*}" + if [[ "$res" == "secret" || "$res" == "secrets" ]]; then + return 0 + fi + done + unset IFS + done + return 1 +} + +# Echoes the value of -o / --output if set; empty otherwise. +get_output_format() { + local prev="" arg + for arg in "$@"; do + if [[ "$prev" == "-o" || "$prev" == "--output" ]]; then + printf '%s\n' "$arg" + return + fi + case "$arg" in + -o=*|--output=*) printf '%s\n' "${arg#*=}"; return ;; + esac + prev="$arg" + done +} + +HAS_NAMESPACE_FLAG=false + +for arg in "$@"; do + if is_blocked "$arg"; then + log error "❌ Refusing argument '$arg': flag is blocked to keep this script read-only and safe." + log error " Blocked flags: ${BLOCKED_FLAGS[*]}" + exit 1 + fi + + case "${arg%%=*}" in + -n|--namespace|-A|--all-namespaces) + HAS_NAMESPACE_FLAG=true + ;; + esac +done + +ARGS=("$@") + +if [[ "$HAS_NAMESPACE_FLAG" == "false" ]] && [[ -n "${K8S_NAMESPACE:-}" ]]; then + ARGS+=(-n "$K8S_NAMESPACE") +fi + +if involves_secrets "${ARGS[@]}"; then + USER_OUTPUT="$(get_output_format "${ARGS[@]}")" + + case "$USER_OUTPUT" in + jsonpath*|go-template*|custom-columns*) + log error "❌ Refusing -o '$USER_OUTPUT' on secrets: this format can extract .data directly, bypassing the safety filter." + log error " Use -o yaml, -o json, -o wide, or no -o flag — values are stripped from data/stringData." + exit 1 + ;; + esac + + # Strip user's -o flag (we force JSON to feed jq). 
+ STRIPPED=() + prev="" + for arg in "${ARGS[@]}"; do + if [[ "$prev" == "-o" || "$prev" == "--output" ]]; then + prev="" + continue + fi + case "$arg" in + -o|--output) prev="$arg"; continue ;; + -o=*|--output=*) prev=""; continue ;; + esac + prev="" + STRIPPED+=("$arg") + done + STRIPPED+=(-o json) + + if [[ -n "$USER_OUTPUT" && "$USER_OUTPUT" != "json" ]]; then + log info "ℹ️ Output forced to JSON (was: $USER_OUTPUT) — secret data fields are stripped for safety." + fi + + log debug "📋 Running: kubectl get ${STRIPPED[*]} | jq " + + kubectl get "${STRIPPED[@]}" | jq 'if .items then .items |= map(del(.data, .stringData)) else del(.data, .stringData) end' + exit "${PIPESTATUS[0]}" +fi + +log debug "📋 Running: kubectl get ${ARGS[*]}" + +kubectl get "${ARGS[@]}" From 5704885619078521e5aa87c16970468afb3f42b5 Mon Sep 17 00:00:00 2001 From: Gabriel Eisbruch Date: Thu, 16 Apr 2026 20:18:55 -0300 Subject: [PATCH 02/56] Add kubectl logs --- k8s/diagnose/tests/kubectl_logs.bats | 230 +++++++++++++++++++++++++++ k8s/kubectl_logs | 88 ++++++++++ 2 files changed, 318 insertions(+) create mode 100644 k8s/diagnose/tests/kubectl_logs.bats create mode 100755 k8s/kubectl_logs diff --git a/k8s/diagnose/tests/kubectl_logs.bats b/k8s/diagnose/tests/kubectl_logs.bats new file mode 100644 index 00000000..088fc9a7 --- /dev/null +++ b/k8s/diagnose/tests/kubectl_logs.bats @@ -0,0 +1,230 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for kubectl_logs - read-only, non-streaming kubectl logs wrapper +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + log() { if [ "$1" = "error" ]; then echo "$2" >&2; else echo "$2"; fi; } + export -f log + + export SCRIPT="$PROJECT_ROOT/k8s/kubectl_logs" + export K8S_NAMESPACE="default-ns" + + # Mock kubectl: echo back what was received so tests can assert the args. + kubectl() { + echo "kubectl-called: $*" + return 0 + } + export -f kubectl +} + +teardown() { + unset -f kubectl log + unset K8S_NAMESPACE SCRIPT PROJECT_ROOT +} + +# ============================================================================= +# Usage +# ============================================================================= +@test "kubectl_logs: shows usage and exits 1 when no args provided" { + run bash "$SCRIPT" + + [ "$status" -eq 1 ] + assert_contains "$output" "Usage:" + assert_contains "$output" "kubectl logs" +} + +# ============================================================================= +# Hardcoded verb: only 'logs' can be invoked +# ============================================================================= +@test "kubectl_logs: invokes kubectl with 'logs' verb followed by user args" { + run bash "$SCRIPT" my-pod -c my-container + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod -c my-container" +} + +@test "kubectl_logs: passes --tail / --since / --previous through unchanged" { + run bash "$SCRIPT" my-pod --tail 200 --since 1h --previous + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod --tail 200 --since 1h --previous" +} + +# ============================================================================= +# Default namespace injection +# ============================================================================= +@test "kubectl_logs: injects K8S_NAMESPACE when no namespace flag provided" { + run bash "$SCRIPT" my-pod + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod -n default-ns" +} + +@test "kubectl_logs: does not inject namespace 
when -n is provided" { + run bash "$SCRIPT" my-pod -n kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod -n kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_logs: does not inject namespace when --namespace is provided" { + run bash "$SCRIPT" my-pod --namespace kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod --namespace kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_logs: does not inject namespace when --namespace=value form is provided" { + run bash "$SCRIPT" my-pod --namespace=kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod --namespace=kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_logs: does not inject namespace when K8S_NAMESPACE is unset" { + unset K8S_NAMESPACE + + run bash "$SCRIPT" my-pod + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod" + [[ "$output" != *"-n "* ]] +} + +# ============================================================================= +# Streaming flags are blocked +# ============================================================================= +@test "kubectl_logs: rejects -f (would stream)" { + run bash "$SCRIPT" my-pod -f + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '-f'" +} + +@test "kubectl_logs: rejects --follow (would stream)" { + run bash "$SCRIPT" my-pod --follow + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--follow'" +} + +@test "kubectl_logs: rejects --follow=true (would stream)" { + run bash "$SCRIPT" my-pod --follow=true + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--follow=true'" +} + +@test "kubectl_logs: rejects --follow=false too (simpler to block the flag entirely)" { + run bash "$SCRIPT" my-pod --follow=false + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--follow=false'" +} + +# 
============================================================================= +# Blocked auth/context flags +# ============================================================================= +@test "kubectl_logs: rejects --server" { + run bash "$SCRIPT" my-pod --server https://evil.example.com + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--server'" +} + +@test "kubectl_logs: rejects --server=value form" { + run bash "$SCRIPT" my-pod --server=https://evil.example.com + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--server=https://evil.example.com'" +} + +@test "kubectl_logs: rejects --kubeconfig" { + run bash "$SCRIPT" my-pod --kubeconfig /tmp/evil.yaml + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--kubeconfig'" +} + +@test "kubectl_logs: rejects --token" { + run bash "$SCRIPT" my-pod --token abc123 + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--token'" +} + +@test "kubectl_logs: rejects --as (impersonation)" { + run bash "$SCRIPT" my-pod --as cluster-admin + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--as'" +} + +@test "kubectl_logs: rejects --as-group" { + run bash "$SCRIPT" my-pod --as-group system:masters + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--as-group'" +} + +@test "kubectl_logs: rejects --context" { + run bash "$SCRIPT" my-pod --context other-cluster + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--context'" +} + +@test "kubectl_logs: rejects --insecure-skip-tls-verify" { + run bash "$SCRIPT" my-pod --insecure-skip-tls-verify + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--insecure-skip-tls-verify'" +} + +@test "kubectl_logs: blocked flag in middle of args is still detected" { + run bash "$SCRIPT" my-pod -n my-ns --token abc123 --tail 100 + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--token'" +} + +@test "kubectl_logs: 
blocked streaming flag in middle of args is still detected" { + run bash "$SCRIPT" my-pod --tail 100 -f --timestamps + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '-f'" +} + +# ============================================================================= +# Shell injection safety +# ============================================================================= +@test "kubectl_logs: passes args verbatim — no shell interpretation of metachars" { + # If any of these metachars were interpreted by a shell, kubectl would + # never see them as part of a single arg. Mock echoes args back as-is. + run bash "$SCRIPT" -l 'app=foo;bar|baz`whoami`' + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs -l app=foo;bar|baz\`whoami\` -n default-ns" +} + +# ============================================================================= +# Exit code propagation +# ============================================================================= +@test "kubectl_logs: propagates kubectl exit code on failure" { + kubectl() { + echo "Error from server (NotFound): pods 'foo' not found" >&2 + return 1 + } + export -f kubectl + + run bash "$SCRIPT" foo + + [ "$status" -eq 1 ] +} diff --git a/k8s/kubectl_logs b/k8s/kubectl_logs new file mode 100755 index 00000000..80d36268 --- /dev/null +++ b/k8s/kubectl_logs @@ -0,0 +1,88 @@ +#!/bin/bash +# Read-only kubectl logs wrapper for troubleshooting. +# Hardcodes `kubectl logs` so no other verb can be invoked, and rejects +# flags that change auth/server/context or turn the call into a stream +# (--follow / -f), which would hang the script. When no namespace flag +# is supplied, defaults to $K8S_NAMESPACE so the agent can target the +# scope's namespace without repeating it. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [[ "$(type -t log 2>/dev/null)" != "function" ]]; then source "$SCRIPT_DIR/logging"; fi + +usage() { + cat >&2 < + +Runs 'kubectl logs' with the provided arguments. 
Read-only: any other +verb, auth/context override, or streaming flag (--follow / -f) is +rejected to keep invocations bounded. + +Examples: + $(basename "$0") my-pod + $(basename "$0") my-pod -c my-container --tail 200 + $(basename "$0") my-pod --previous + $(basename "$0") -l app=foo --tail 100 + $(basename "$0") my-pod --since 1h --timestamps +EOF +} + +if [[ $# -eq 0 ]]; then + usage + exit 1 +fi + +BLOCKED_FLAGS=( + --server + --kubeconfig + --token + --as + --as-group + --as-uid + --certificate-authority + --client-certificate + --client-key + --username + --password + --user + --cluster + --context + --insecure-skip-tls-verify + -f + --follow +) + +is_blocked() { + local flag_name="${1%%=*}" + for blocked in "${BLOCKED_FLAGS[@]}"; do + if [[ "$flag_name" == "$blocked" ]]; then + return 0 + fi + done + return 1 +} + +HAS_NAMESPACE_FLAG=false + +for arg in "$@"; do + if is_blocked "$arg"; then + log error "❌ Refusing argument '$arg': flag is blocked to keep this script read-only and non-streaming." + log error " Blocked flags: ${BLOCKED_FLAGS[*]}" + exit 1 + fi + + case "${arg%%=*}" in + -n|--namespace) + HAS_NAMESPACE_FLAG=true + ;; + esac +done + +ARGS=("$@") + +if [[ "$HAS_NAMESPACE_FLAG" == "false" ]] && [[ -n "${K8S_NAMESPACE:-}" ]]; then + ARGS+=(-n "$K8S_NAMESPACE") +fi + +log debug "📋 Running: kubectl logs ${ARGS[*]}" + +kubectl logs "${ARGS[@]}" From 40a52267f52cf5840548325776e7fd7747222baf Mon Sep 17 00:00:00 2001 From: Javi Date: Fri, 17 Apr 2026 14:47:22 -0300 Subject: [PATCH 03/56] Skip ALB capacity validations in scheduled_task workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scheduled_task scopes do not expose HTTP traffic via ALB, so the ALB capacity and target group capacity validations from the base k8s workflows are unnecessary. 
Override them with `action: skip` in the scheduled_task overlays and add structural tests that lock the contract with upstream step names — if a base step is renamed, the test fails instead of silently re-enabling the validation. Also adds .vscode/ to .gitignore. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 3 ++ .../deployment/tests/workflow_overrides.bats | 36 +++++++++++++ .../deployment/workflows/initial.yaml | 2 + .../scope/tests/workflow_overrides.bats | 54 +++++++++++++++++++ scheduled_task/scope/workflows/create.yaml | 2 + 5 files changed, 97 insertions(+) create mode 100644 scheduled_task/deployment/tests/workflow_overrides.bats create mode 100644 scheduled_task/scope/tests/workflow_overrides.bats diff --git a/.gitignore b/.gitignore index 10fe9d5c..10d7b0f7 100644 --- a/.gitignore +++ b/.gitignore @@ -151,3 +151,6 @@ testing/docker/certs/ # Claude Code .claude/ + +# Visual Studio Code +.vscode/ \ No newline at end of file diff --git a/scheduled_task/deployment/tests/workflow_overrides.bats b/scheduled_task/deployment/tests/workflow_overrides.bats new file mode 100644 index 00000000..591d00db --- /dev/null +++ b/scheduled_task/deployment/tests/workflow_overrides.bats @@ -0,0 +1,36 @@ +#!/usr/bin/env bats +# ============================================================================= +# Tests that verify scheduled_task deployment workflows override base k8s steps +# that do not apply to this scope type (e.g. ALB target group capacity). +# +# Contract: +# - Overlay must mark the step with `action: skip`. +# - Base workflow must still declare the step under the same name, otherwise +# a rename upstream would silently re-enable the step here. +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + export OVERLAY="$PROJECT_ROOT/scheduled_task/deployment/workflows/initial.yaml" + export BASE="$PROJECT_ROOT/k8s/deployment/workflows/initial.yaml" +} + +# ============================================================================= +# validate alb target group capacity +# ============================================================================= +@test "base k8s deployment initial workflow declares 'validate alb target group capacity' step" { + run grep -A 2 "name: validate alb target group capacity" "$BASE" + + assert_equal "$status" "0" + assert_contains "$output" "type: script" + assert_contains "$output" "validate_alb_target_group_capacity" +} + +@test "scheduled_task deployment initial overlay skips 'validate alb target group capacity'" { + run grep -A 1 "name: validate alb target group capacity" "$OVERLAY" + + assert_equal "$status" "0" + assert_contains "$output" "action: skip" +} diff --git a/scheduled_task/deployment/workflows/initial.yaml b/scheduled_task/deployment/workflows/initial.yaml index 72535d20..91a87bde 100644 --- a/scheduled_task/deployment/workflows/initial.yaml +++ b/scheduled_task/deployment/workflows/initial.yaml @@ -1,6 +1,8 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: validate alb target group capacity + action: skip - name: route traffic action: skip - name: create deployment diff --git a/scheduled_task/scope/tests/workflow_overrides.bats b/scheduled_task/scope/tests/workflow_overrides.bats new file mode 100644 index 00000000..b4dfa3c7 --- /dev/null +++ b/scheduled_task/scope/tests/workflow_overrides.bats @@ -0,0 +1,54 @@ +#!/usr/bin/env bats +# ============================================================================= +# Tests that verify scheduled_task scope workflows override base k8s steps +# that do not apply to this scope type (e.g. ALB capacity validation). +# +# Contract: +# - Overlay must mark the step with `action: skip`. 
+# - Base workflow must still declare the step under the same name, otherwise +# the skip is a no-op and the base step would not run anyway (or worse, a +# rename upstream would silently re-enable the step here). +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + export OVERLAY="$PROJECT_ROOT/scheduled_task/scope/workflows/create.yaml" + export BASE="$PROJECT_ROOT/k8s/scope/workflows/create.yaml" +} + +# ============================================================================= +# validate alb capacity +# ============================================================================= +@test "base k8s scope create workflow declares 'validate alb capacity' step" { + run grep -A 2 "name: validate alb capacity" "$BASE" + + assert_equal "$status" "0" + assert_contains "$output" "type: script" + assert_contains "$output" "validate_alb_capacity" +} + +@test "scheduled_task scope create overlay skips 'validate alb capacity'" { + run grep -A 1 "name: validate alb capacity" "$OVERLAY" + + assert_equal "$status" "0" + assert_contains "$output" "action: skip" +} + +# ============================================================================= +# networking (scheduled_task has no public traffic, so the whole block is skipped) +# ============================================================================= +@test "base k8s scope create workflow declares 'networking' step" { + run grep -A 1 "name: networking" "$BASE" + + assert_equal "$status" "0" + assert_contains "$output" "type: workflow" +} + +@test "scheduled_task scope create overlay skips 'networking'" { + run grep -A 1 "name: networking" "$OVERLAY" + + assert_equal "$status" "0" + assert_contains "$output" "action: skip" +} diff --git a/scheduled_task/scope/workflows/create.yaml b/scheduled_task/scope/workflows/create.yaml index 47156c7d..26b7820f 100644 --- 
a/scheduled_task/scope/workflows/create.yaml +++ b/scheduled_task/scope/workflows/create.yaml @@ -2,4 +2,6 @@ include: - "$SERVICE_PATH/values.yaml" steps: - name: networking + action: skip + - name: validate alb capacity action: skip \ No newline at end of file From 64c146b2fa9f14646b39e195936743bac1c70686 Mon Sep 17 00:00:00 2001 From: bjornaer Date: Thu, 16 Apr 2026 18:07:11 +0200 Subject: [PATCH 04/56] Add --no-params flag to skip parameter fetching for actions that don't need it Deployment actions like switch-traffic, kill-instances, and diagnose-deployment are purely operational and don't require application parameters. All scope actions (create, update, delete, etc.) deal with infrastructure, not app config. The flag is only added when the CLI supports it, preserving backward compatibility. --- service/deployment/entrypoint | 11 ++ service/deployment/tests/entrypoint.bats | 185 +++++++++++++++++++++++ service/scope/entrypoint | 4 + service/scope/tests/entrypoint.bats | 135 +++++++++++++++++ 4 files changed, 335 insertions(+) create mode 100644 service/deployment/tests/entrypoint.bats create mode 100644 service/scope/tests/entrypoint.bats diff --git a/service/deployment/entrypoint b/service/deployment/entrypoint index 7d7f9d97..9a1ee9b1 100755 --- a/service/deployment/entrypoint +++ b/service/deployment/entrypoint @@ -37,8 +37,19 @@ esac WORKFLOW_PATH="$SERVICE_PATH/deployment/workflows/$ACTION_TO_EXECUTE.yaml" +NEEDS_PARAMS=true +case "$SERVICE_ACTION" in + "switch-traffic"|"kill-instances"|"diagnose-deployment") + NEEDS_PARAMS=false + ;; +esac + CMD="np service workflow exec --workflow $WORKFLOW_PATH --build-context --include-secrets" +if [ "$NEEDS_PARAMS" = "false" ] && np service workflow exec --help 2>&1 | grep -q "\-\-no-params"; then + CMD="$CMD --no-params" +fi + IFS=',' read -ra OVERRIDE_PATHS <<< "$OVERRIDES_PATH" for path in "${OVERRIDE_PATHS[@]}"; do # Trim whitespace diff --git a/service/deployment/tests/entrypoint.bats 
b/service/deployment/tests/entrypoint.bats new file mode 100644 index 00000000..fac10a8f --- /dev/null +++ b/service/deployment/tests/entrypoint.bats @@ -0,0 +1,185 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for service/deployment/entrypoint - --no-params flag behavior +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + export SCOPE_ID="scope-123" + export DEPLOYMENT_ID="deploy-456" + export SERVICE_PATH="/tmp/test-service-path" + export OVERRIDES_PATH="" + + mkdir -p "$SERVICE_PATH/deployment/workflows" + touch "$SERVICE_PATH/deployment/workflows/initial.yaml" + touch "$SERVICE_PATH/deployment/workflows/blue_green.yaml" + touch "$SERVICE_PATH/deployment/workflows/switch_traffic.yaml" + touch "$SERVICE_PATH/deployment/workflows/rollback.yaml" + touch "$SERVICE_PATH/deployment/workflows/finalize.yaml" + touch "$SERVICE_PATH/deployment/workflows/delete.yaml" + touch "$SERVICE_PATH/deployment/workflows/diagnose.yaml" + touch "$SERVICE_PATH/deployment/workflows/kill_instances.yaml" + + export NP_EXECUTED_CMD="" + export NP_HELP_SUPPORTS_NO_PARAMS="true" +} + +teardown() { + rm -rf "$SERVICE_PATH" + unset -f np +} + +mock_np() { + np() { + if [[ "$*" == *"--help"* ]]; then + if [ "$NP_HELP_SUPPORTS_NO_PARAMS" = "true" ]; then + echo " --no-params Skip parameter fetching" + fi + return 0 + fi + export NP_EXECUTED_CMD="np $*" + return 0 + } + export -f np +} + +# ============================================================================= +# Actions that SHOULD include --no-params (when CLI supports it) +# ============================================================================= + +@test "deployment entrypoint: switch-traffic includes --no-params when CLI supports it" { + mock_np + export SERVICE_ACTION="switch-traffic" + + run bash 
"$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "deployment entrypoint: kill-instances includes --no-params when CLI supports it" { + mock_np + export SERVICE_ACTION="kill-instances" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "deployment entrypoint: diagnose-deployment includes --no-params when CLI supports it" { + mock_np + export SERVICE_ACTION="diagnose-deployment" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +# ============================================================================= +# Actions that SHOULD NOT include --no-params +# ============================================================================= + +@test "deployment entrypoint: start-initial does NOT include --no-params" { + mock_np + export SERVICE_ACTION="start-initial" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +@test "deployment entrypoint: start-blue-green does NOT include --no-params" { + mock_np + export SERVICE_ACTION="start-blue-green" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +@test "deployment entrypoint: rollback-deployment does NOT include --no-params" { + mock_np + export SERVICE_ACTION="rollback-deployment" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +@test "deployment entrypoint: finalize-blue-green does NOT include --no-params" { + mock_np + export SERVICE_ACTION="finalize-blue-green" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +@test "deployment entrypoint: delete-deployment does NOT include --no-params" { + mock_np + export SERVICE_ACTION="delete-deployment" + + run bash 
"$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +# ============================================================================= +# Backward compatibility - old CLI without --no-params support +# ============================================================================= + +@test "deployment entrypoint: switch-traffic omits --no-params when CLI does not support it" { + export NP_HELP_SUPPORTS_NO_PARAMS="false" + mock_np + export SERVICE_ACTION="switch-traffic" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +@test "deployment entrypoint: kill-instances omits --no-params when CLI does not support it" { + export NP_HELP_SUPPORTS_NO_PARAMS="false" + mock_np + export SERVICE_ACTION="kill-instances" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +# ============================================================================= +# Edge cases +# ============================================================================= + +@test "deployment entrypoint: unknown action fails" { + mock_np + export SERVICE_ACTION="unknown-action" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 1 ] + assert_contains "$output" "Unknown action" +} + +@test "deployment entrypoint: --build-context and --include-secrets always present" { + mock_np + export SERVICE_ACTION="switch-traffic" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--build-context" + assert_contains "$output" "--include-secrets" +} diff --git a/service/scope/entrypoint b/service/scope/entrypoint index 721b1188..12b215a1 100755 --- a/service/scope/entrypoint +++ b/service/scope/entrypoint @@ -47,6 +47,10 @@ fi CMD="np service workflow exec --workflow $WORKFLOW_PATH --build-context --include-secrets" +if np service workflow exec --help 2>&1 | grep -q "\-\-no-params"; then + 
CMD="$CMD --no-params" +fi + IFS=',' read -ra OVERRIDE_PATHS <<< "$OVERRIDES_PATH" for path in "${OVERRIDE_PATHS[@]}"; do # Trim whitespace diff --git a/service/scope/tests/entrypoint.bats b/service/scope/tests/entrypoint.bats new file mode 100644 index 00000000..afcc5148 --- /dev/null +++ b/service/scope/tests/entrypoint.bats @@ -0,0 +1,135 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for service/scope/entrypoint - --no-params flag behavior +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + export SCOPE_ID="scope-123" + export SERVICE_PATH="/tmp/test-service-path" + export OVERRIDES_PATH="" + + mkdir -p "$SERVICE_PATH/scope/workflows" + touch "$SERVICE_PATH/scope/workflows/create.yaml" + touch "$SERVICE_PATH/scope/workflows/update.yaml" + touch "$SERVICE_PATH/scope/workflows/delete.yaml" + touch "$SERVICE_PATH/scope/workflows/diagnose.yaml" + touch "$SERVICE_PATH/scope/workflows/restart-pods.yaml" + touch "$SERVICE_PATH/scope/workflows/pause-autoscaling.yaml" + touch "$SERVICE_PATH/scope/workflows/resume-autoscaling.yaml" + touch "$SERVICE_PATH/scope/workflows/set-desired-instance-count.yaml" + + export NP_HELP_SUPPORTS_NO_PARAMS="true" +} + +teardown() { + rm -rf "$SERVICE_PATH" + unset -f np +} + +mock_np() { + np() { + if [[ "$*" == *"--help"* ]]; then + if [ "$NP_HELP_SUPPORTS_NO_PARAMS" = "true" ]; then + echo " --no-params Skip parameter fetching" + fi + return 0 + fi + export NP_EXECUTED_CMD="np $*" + return 0 + } + export -f np +} + +# ============================================================================= +# All scope actions SHOULD include --no-params (when CLI supports it) +# ============================================================================= + +@test "scope entrypoint: create includes --no-params" { + 
mock_np + export SERVICE_ACTION="create-scope" + export SERVICE_ACTION_TYPE="create" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "scope entrypoint: update includes --no-params" { + mock_np + export SERVICE_ACTION="update-scope" + export SERVICE_ACTION_TYPE="update" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "scope entrypoint: delete includes --no-params" { + mock_np + export SERVICE_ACTION="delete-scope" + export SERVICE_ACTION_TYPE="custom" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "scope entrypoint: diagnose includes --no-params" { + mock_np + export SERVICE_ACTION="diagnose-scope" + export SERVICE_ACTION_TYPE="custom" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "scope entrypoint: restart-pods includes --no-params" { + mock_np + export SERVICE_ACTION="restart-pods" + export SERVICE_ACTION_TYPE="custom" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +# ============================================================================= +# Backward compatibility - old CLI without --no-params support +# ============================================================================= + +@test "scope entrypoint: omits --no-params when CLI does not support it" { + export NP_HELP_SUPPORTS_NO_PARAMS="false" + mock_np + export SERVICE_ACTION="create-scope" + export SERVICE_ACTION_TYPE="create" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +# ============================================================================= +# Core flags always present +# 
============================================================================= + +@test "scope entrypoint: --build-context and --include-secrets always present" { + mock_np + export SERVICE_ACTION="create-scope" + export SERVICE_ACTION_TYPE="create" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--build-context" + assert_contains "$output" "--include-secrets" +} From ddfa38e76ce863ced67afbd9cb49bf4b1dbb5d6f Mon Sep 17 00:00:00 2001 From: David Fernandez Date: Thu, 30 Apr 2026 15:51:44 -0300 Subject: [PATCH 05/56] fix: support ELB hostnames for external-dns DNSEndpoint on AWS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AWS ELBs expose DNS hostnames (type=Hostname), not IPs (type=IPAddress). The manage_route script now falls back through four strategies: 1. Gateway IPAddress → A record 2. Gateway Hostname → CNAME record 3. Service LB IP → A record 4. Service LB hostname → CNAME record The dns-endpoint.yaml.tpl now uses dynamic record_type (A or CNAME) instead of hardcoded A, so DNSEndpoints are created correctly on AWS. 
Co-Authored-By: Claude Sonnet 4.6 --- .../templates/dns-endpoint.yaml.tpl | 2 +- .../networking/dns/external_dns/manage_route | 25 ++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/k8s/deployment/templates/dns-endpoint.yaml.tpl b/k8s/deployment/templates/dns-endpoint.yaml.tpl index e68e1903..8b6112f2 100644 --- a/k8s/deployment/templates/dns-endpoint.yaml.tpl +++ b/k8s/deployment/templates/dns-endpoint.yaml.tpl @@ -17,6 +17,6 @@ spec: endpoints: - dnsName: {{ .scope.domain }} recordTTL: 60 - recordType: A + recordType: {{ .record_type }} targets: - "{{ .gateway_ip }}" diff --git a/k8s/scope/networking/dns/external_dns/manage_route b/k8s/scope/networking/dns/external_dns/manage_route index f4fe1045..c8346d97 100644 --- a/k8s/scope/networking/dns/external_dns/manage_route +++ b/k8s/scope/networking/dns/external_dns/manage_route @@ -8,12 +8,30 @@ if [ "$ACTION" = "CREATE" ]; then GATEWAY_IP=$(kubectl get gateway "$GATEWAY_NAME" -n gateways \ -o jsonpath='{.status.addresses[?(@.type=="IPAddress")].value}' 2>/dev/null) + RECORD_TYPE="A" if [ -z "$GATEWAY_IP" ]; then - log warn "⚠️ Gateway IP not found, trying service fallback..." + log warn "⚠️ Gateway IP not found, trying hostname..." + + GATEWAY_IP=$(kubectl get gateway "$GATEWAY_NAME" -n gateways \ + -o jsonpath='{.status.addresses[?(@.type=="Hostname")].value}' 2>/dev/null) + RECORD_TYPE="CNAME" + fi + + if [ -z "$GATEWAY_IP" ]; then + log warn "⚠️ Gateway hostname not found, trying service fallback..." GATEWAY_IP=$(kubectl get service "$GATEWAY_NAME" -n gateways \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null) + RECORD_TYPE="A" + fi + + if [ -z "$GATEWAY_IP" ]; then + log warn "⚠️ Gateway service IP not found, trying service hostname fallback..." 
+ + GATEWAY_IP=$(kubectl get service "$GATEWAY_NAME" -n gateways \ + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null) + RECORD_TYPE="CNAME" fi if [ -z "$GATEWAY_IP" ]; then @@ -21,7 +39,7 @@ if [ "$ACTION" = "CREATE" ]; then exit 0 fi - log info "✅ Gateway IP: $GATEWAY_IP" + log info "✅ Gateway address: $GATEWAY_IP (recordType: $RECORD_TYPE)" DNS_ENDPOINT_TEMPLATE="${DNS_ENDPOINT_TEMPLATE:-$SERVICE_PATH/deployment/templates/dns-endpoint.yaml.tpl}" @@ -29,7 +47,8 @@ if [ "$ACTION" = "CREATE" ]; then DNS_ENDPOINT_FILE="$OUTPUT_DIR/dns-endpoint-$SCOPE_ID.yaml" CONTEXT_PATH="$OUTPUT_DIR/context-$SCOPE_ID-dns.json" - echo "$CONTEXT" | jq --arg gateway_ip "$GATEWAY_IP" '. + {gateway_ip: $gateway_ip}' > "$CONTEXT_PATH" + echo "$CONTEXT" | jq --arg gateway_ip "$GATEWAY_IP" --arg record_type "$RECORD_TYPE" \ + '. + {gateway_ip: $gateway_ip, record_type: $record_type}' > "$CONTEXT_PATH" log debug "📝 Building DNSEndpoint from template: $DNS_ENDPOINT_TEMPLATE" From fe213045a1ef0eada6e0cbbf9781114374b5abb0 Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Mon, 4 May 2026 11:30:50 -0300 Subject: [PATCH 06/56] fix(deployment): create DNSEndpoint for external_dns DNS type When DNS_TYPE is external_dns, verify_networking_reconciliation was skipping reconciliation entirely. Now it calls manage_route to resolve the gateway address, applies the DNSEndpoint to the cluster, and verifies HTTPRoute reconciliation. 
Co-Authored-By: Claude Sonnet 4.6 --- .../verify_networking_reconciliation.bats | 63 +++++++++++++++++++ .../verify_networking_reconciliation | 19 +++++- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/k8s/deployment/tests/verify_networking_reconciliation.bats b/k8s/deployment/tests/verify_networking_reconciliation.bats index 7972e07e..93a7dd82 100644 --- a/k8s/deployment/tests/verify_networking_reconciliation.bats +++ b/k8s/deployment/tests/verify_networking_reconciliation.bats @@ -44,6 +44,69 @@ teardown() { assert_contains "$output" "⚠️ Skipping ALB verification (ALB access needed for blue-green traffic validation)" } +@test "verify_networking_reconciliation: creates DNSEndpoint and verifies HTTPRoute for external_dns" { + export DNS_TYPE="external_dns" + export SCOPE_VISIBILITY="internal" + export PRIVATE_GATEWAY_NAME="gateway-private" + export PUBLIC_GATEWAY_NAME="gateway-public" + export SCOPE_ID="123" + export K8S_NAMESPACE="nullplatform" + export OUTPUT_DIR="$(mktemp -d)" + export CONTEXT='{"scope":{"slug":"my-app","id":"123","domain":"app.example.com"}}' + + run bash -c " + kubectl() { + if [ \"\$1\" = 'get' ]; then + echo '{\"status\":{\"addresses\":[{\"type\":\"Hostname\",\"value\":\"my-alb.us-east-1.elb.amazonaws.com\"}]}}' + return 0 + fi + if [ \"\$1\" = 'apply' ]; then + echo 'dnsendpoint.externaldns.k8s.io/k-8-s-my-app-123-dns applied' + return 0 + fi + if [ \"\$1\" = 'httproute' ] || [ \"\$2\" = 'httproute' ]; then + echo '{\"status\":{\"parents\":[{\"conditions\":[{\"type\":\"Accepted\",\"status\":\"True\",\"reason\":\"Accepted\"},{\"type\":\"ResolvedRefs\",\"status\":\"True\",\"reason\":\"ResolvedRefs\"}]}]}}' + return 0 + fi + return 0 + } + export -f kubectl + gomplate() { echo 'rendered'; return 0; } + export -f gomplate + source '$BATS_TEST_DIRNAME/../verify_networking_reconciliation' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "🔍 Verifying networking reconciliation for DNS type: external_dns" + assert_contains 
"$output" "✅ DNSEndpoint applied to cluster" + + rm -rf "$OUTPUT_DIR" +} + +@test "verify_networking_reconciliation: uses public gateway when scope is not internal for external_dns" { + export DNS_TYPE="external_dns" + export SCOPE_VISIBILITY="public" + export PRIVATE_GATEWAY_NAME="gateway-private" + export PUBLIC_GATEWAY_NAME="gateway-public" + export SCOPE_ID="456" + export K8S_NAMESPACE="nullplatform" + export OUTPUT_DIR="$(mktemp -d)" + export CONTEXT='{"scope":{"slug":"my-app","id":"456","domain":"app.example.com"}}' + + run bash -c " + kubectl() { echo '{}'; return 0; } + export -f kubectl + gomplate() { echo 'rendered'; return 0; } + export -f gomplate + GATEWAY_NAME_USED='' + source '$BATS_TEST_DIRNAME/../verify_networking_reconciliation' + " + + assert_contains "$output" "gateway-public" + + rm -rf "$OUTPUT_DIR" +} + @test "verify_networking_reconciliation: skips for unsupported DNS types" { export DNS_TYPE="unknown" diff --git a/k8s/deployment/verify_networking_reconciliation b/k8s/deployment/verify_networking_reconciliation index 214c8530..dd5927f3 100644 --- a/k8s/deployment/verify_networking_reconciliation +++ b/k8s/deployment/verify_networking_reconciliation @@ -7,8 +7,25 @@ case "$DNS_TYPE" in route53) source "$SERVICE_PATH/deployment/verify_ingress_reconciliation" ;; + external_dns) + if [ "${SCOPE_VISIBILITY:-}" = "internal" ]; then + GATEWAY_NAME="${PRIVATE_GATEWAY_NAME:-gateway-private}" + else + GATEWAY_NAME="${PUBLIC_GATEWAY_NAME:-gateway-public}" + fi + + export ACTION="CREATE" GATEWAY_NAME="$GATEWAY_NAME" + source "$SERVICE_PATH/scope/networking/dns/external_dns/manage_route" + + DNS_ENDPOINT_FILE="$OUTPUT_DIR/dns-endpoint-$SCOPE_ID.yaml" + if [ -f "$DNS_ENDPOINT_FILE" ]; then + kubectl apply -f "$DNS_ENDPOINT_FILE" + log info "✅ DNSEndpoint applied to cluster" + fi + + source "$SERVICE_PATH/deployment/verify_http_route_reconciliation" + ;; *) log warn "⚠️ Ingress reconciliation not available for DNS type: $DNS_TYPE, skipping" -# source 
"$SERVICE_PATH/deployment/verify_http_route_reconciliation" ;; esac From 595385541bff9c04ff3f1332c52f970b5c4231d3 Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Mon, 4 May 2026 12:11:44 -0300 Subject: [PATCH 07/56] fix(wait_on_balancer): use DNSEndpoint observedGeneration instead of nslookup for external_dns nslookup against 8.8.8.8 fails for private Route53 zones and domains without public NS delegation. external-dns sets status.observedGeneration=1 once it processes the DNSEndpoint, which is a reliable signal that the Route53 record was created. Co-Authored-By: Claude Sonnet 4.6 --- k8s/scope/wait_on_balancer | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/k8s/scope/wait_on_balancer b/k8s/scope/wait_on_balancer index 972f4c02..55c30509 100644 --- a/k8s/scope/wait_on_balancer +++ b/k8s/scope/wait_on_balancer @@ -35,22 +35,16 @@ case "$DNS_TYPE" in DNS_ENDPOINT_NAME="k-8-s-${SCOPE_SLUG}-${SCOPE_ID}-dns" log debug "📋 Checking DNSEndpoint status: $DNS_ENDPOINT_NAME" - DNS_STATUS=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" -o jsonpath='{.status}' 2>/dev/null || echo "not found") - - if [ "$DNS_STATUS" != "not found" ] && [ -n "$DNS_STATUS" ]; then - log debug "📋 DNSEndpoint status: $DNS_STATUS" - fi - - if nslookup "$SCOPE_DOMAIN" 8.8.8.8 >/dev/null 2>&1; then - log info " ✅ DNS record for $SCOPE_DOMAIN is now resolvable" - - RESOLVED_IP=$(nslookup "$SCOPE_DOMAIN" 8.8.8.8 | grep -A1 "Name:" | tail -1 | awk '{print $2}' 2>/dev/null || echo "unknown") - log info " ✅ Domain $SCOPE_DOMAIN resolves to: $RESOLVED_IP" + OBSERVED_GEN=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" \ + -o jsonpath='{.status.observedGeneration}' 2>/dev/null || echo "") + if [ -n "$OBSERVED_GEN" ] && [ "$OBSERVED_GEN" -ge 1 ] 2>/dev/null; then + log info " ✅ DNSEndpoint $DNS_ENDPOINT_NAME processed by external-dns (observedGeneration=$OBSERVED_GEN)" + log info " ✅ DNS record for $SCOPE_DOMAIN has been created in 
Route53" break fi - log debug "📋 DNS record not yet available, waiting 10s..." + log debug "📋 DNSEndpoint not yet processed by external-dns (observedGeneration=${OBSERVED_GEN:-unset}), waiting 10s..." sleep 10 done From 4b2813f0f962b774f6dad924734342c2d2077a8a Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Mon, 4 May 2026 12:31:41 -0300 Subject: [PATCH 08/56] fix(manage_route): resolve ALB hostname via Ingress when gateway returns cluster-internal address Istio gateways report their status address as the ClusterIP service name (gateway-public-istio.gateways.svc.cluster.local), not the external ALB hostname. Added a fallback that reads the hostname from the ALB Ingress (gateway-alb-public / gateway-alb-private) when a .svc.cluster.local address is detected. Co-Authored-By: Claude Sonnet 4.6 --- .../networking/dns/external_dns/manage_route | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/k8s/scope/networking/dns/external_dns/manage_route b/k8s/scope/networking/dns/external_dns/manage_route index c8346d97..402df021 100644 --- a/k8s/scope/networking/dns/external_dns/manage_route +++ b/k8s/scope/networking/dns/external_dns/manage_route @@ -39,6 +39,21 @@ if [ "$ACTION" = "CREATE" ]; then exit 0 fi + # If the resolved address is a cluster-internal service name, look for the ALB via Ingress + if echo "$GATEWAY_IP" | grep -q "\.svc\.cluster\.local"; then + log warn "⚠️ Gateway address is cluster-internal ($GATEWAY_IP), trying ALB Ingress fallback..." 
+ GATEWAY_SUFFIX="${GATEWAY_NAME#gateway-}" + ALB_HOSTNAME=$(kubectl get ingress "gateway-alb-${GATEWAY_SUFFIX}" -n gateways \ + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null) + if [ -n "$ALB_HOSTNAME" ]; then + GATEWAY_IP="$ALB_HOSTNAME" + RECORD_TYPE="CNAME" + log info "✅ ALB hostname resolved via Ingress: $GATEWAY_IP" + else + log warn "⚠️ ALB Ingress hostname not found, keeping cluster-internal address" + fi + fi + log info "✅ Gateway address: $GATEWAY_IP (recordType: $RECORD_TYPE)" DNS_ENDPOINT_TEMPLATE="${DNS_ENDPOINT_TEMPLATE:-$SERVICE_PATH/deployment/templates/dns-endpoint.yaml.tpl}" From b30faab571dfd7d3bfb5971c92355050780c3335 Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Mon, 4 May 2026 12:59:57 -0300 Subject: [PATCH 09/56] refactor(manage_route): try ALB Ingress first, fall back to gateway address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous approach resolved the gateway address first and only checked the ALB Ingress when the result was cluster-internal. Reversed the priority: ALB Ingress (gateway-alb-) is checked first since it's the AWS-specific override. If not present, falls back to the standard gateway address resolution chain (IPAddress → Hostname → Service IP → Service hostname), which is the common case for environments with a real external gateway. Co-Authored-By: Claude Sonnet 4.6 --- .../networking/dns/external_dns/manage_route | 67 +++++++++---------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/k8s/scope/networking/dns/external_dns/manage_route b/k8s/scope/networking/dns/external_dns/manage_route index 402df021..65e05bed 100644 --- a/k8s/scope/networking/dns/external_dns/manage_route +++ b/k8s/scope/networking/dns/external_dns/manage_route @@ -6,32 +6,44 @@ if [ "$ACTION" = "CREATE" ]; then log debug "🔍 Building DNSEndpoint manifest for ExternalDNS..." 
log debug "📡 Getting IP for gateway: $GATEWAY_NAME" - GATEWAY_IP=$(kubectl get gateway "$GATEWAY_NAME" -n gateways \ - -o jsonpath='{.status.addresses[?(@.type=="IPAddress")].value}' 2>/dev/null) - RECORD_TYPE="A" - - if [ -z "$GATEWAY_IP" ]; then - log warn "⚠️ Gateway IP not found, trying hostname..." + # Try ALB Ingress first (AWS-specific: gateway-alb-public / gateway-alb-private) + GATEWAY_SUFFIX="${GATEWAY_NAME#gateway-}" + GATEWAY_IP=$(kubectl get ingress "gateway-alb-${GATEWAY_SUFFIX}" -n gateways \ + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null) + RECORD_TYPE="CNAME" + + if [ -n "$GATEWAY_IP" ]; then + log debug "📡 ALB hostname resolved via Ingress: $GATEWAY_IP" + else + log debug "📡 ALB Ingress not found, resolving gateway address directly..." GATEWAY_IP=$(kubectl get gateway "$GATEWAY_NAME" -n gateways \ - -o jsonpath='{.status.addresses[?(@.type=="Hostname")].value}' 2>/dev/null) - RECORD_TYPE="CNAME" - fi + -o jsonpath='{.status.addresses[?(@.type=="IPAddress")].value}' 2>/dev/null) + RECORD_TYPE="A" - if [ -z "$GATEWAY_IP" ]; then - log warn "⚠️ Gateway hostname not found, trying service fallback..." + if [ -z "$GATEWAY_IP" ]; then + log warn "⚠️ Gateway IP not found, trying hostname..." - GATEWAY_IP=$(kubectl get service "$GATEWAY_NAME" -n gateways \ - -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null) - RECORD_TYPE="A" - fi + GATEWAY_IP=$(kubectl get gateway "$GATEWAY_NAME" -n gateways \ + -o jsonpath='{.status.addresses[?(@.type=="Hostname")].value}' 2>/dev/null) + RECORD_TYPE="CNAME" + fi - if [ -z "$GATEWAY_IP" ]; then - log warn "⚠️ Gateway service IP not found, trying service hostname fallback..." + if [ -z "$GATEWAY_IP" ]; then + log warn "⚠️ Gateway hostname not found, trying service fallback..." 
+ + GATEWAY_IP=$(kubectl get service "$GATEWAY_NAME" -n gateways \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null) + RECORD_TYPE="A" + fi - GATEWAY_IP=$(kubectl get service "$GATEWAY_NAME" -n gateways \ - -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null) - RECORD_TYPE="CNAME" + if [ -z "$GATEWAY_IP" ]; then + log warn "⚠️ Gateway service IP not found, trying service hostname fallback..." + + GATEWAY_IP=$(kubectl get service "$GATEWAY_NAME" -n gateways \ + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null) + RECORD_TYPE="CNAME" + fi fi if [ -z "$GATEWAY_IP" ]; then @@ -39,21 +51,6 @@ if [ "$ACTION" = "CREATE" ]; then exit 0 fi - # If the resolved address is a cluster-internal service name, look for the ALB via Ingress - if echo "$GATEWAY_IP" | grep -q "\.svc\.cluster\.local"; then - log warn "⚠️ Gateway address is cluster-internal ($GATEWAY_IP), trying ALB Ingress fallback..." - GATEWAY_SUFFIX="${GATEWAY_NAME#gateway-}" - ALB_HOSTNAME=$(kubectl get ingress "gateway-alb-${GATEWAY_SUFFIX}" -n gateways \ - -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null) - if [ -n "$ALB_HOSTNAME" ]; then - GATEWAY_IP="$ALB_HOSTNAME" - RECORD_TYPE="CNAME" - log info "✅ ALB hostname resolved via Ingress: $GATEWAY_IP" - else - log warn "⚠️ ALB Ingress hostname not found, keeping cluster-internal address" - fi - fi - log info "✅ Gateway address: $GATEWAY_IP (recordType: $RECORD_TYPE)" DNS_ENDPOINT_TEMPLATE="${DNS_ENDPOINT_TEMPLATE:-$SERVICE_PATH/deployment/templates/dns-endpoint.yaml.tpl}" From e7b783b201b76512bccc316edbf70bf5a9a4373f Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Mon, 4 May 2026 13:02:34 -0300 Subject: [PATCH 10/56] revert(wait_on_balancer): restore nslookup check for external_dns to test with correct ALB target The previous nslookup against 8.8.8.8 was failing because the CNAME pointed to a cluster-internal gateway address. 
Now that manage_route resolves the real ALB hostname first, testing whether public DNS resolution works correctly. Co-Authored-By: Claude Sonnet 4.6 --- k8s/scope/wait_on_balancer | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/k8s/scope/wait_on_balancer b/k8s/scope/wait_on_balancer index 55c30509..972f4c02 100644 --- a/k8s/scope/wait_on_balancer +++ b/k8s/scope/wait_on_balancer @@ -35,16 +35,22 @@ case "$DNS_TYPE" in DNS_ENDPOINT_NAME="k-8-s-${SCOPE_SLUG}-${SCOPE_ID}-dns" log debug "📋 Checking DNSEndpoint status: $DNS_ENDPOINT_NAME" - OBSERVED_GEN=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" \ - -o jsonpath='{.status.observedGeneration}' 2>/dev/null || echo "") + DNS_STATUS=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" -o jsonpath='{.status}' 2>/dev/null || echo "not found") + + if [ "$DNS_STATUS" != "not found" ] && [ -n "$DNS_STATUS" ]; then + log debug "📋 DNSEndpoint status: $DNS_STATUS" + fi + + if nslookup "$SCOPE_DOMAIN" 8.8.8.8 >/dev/null 2>&1; then + log info " ✅ DNS record for $SCOPE_DOMAIN is now resolvable" + + RESOLVED_IP=$(nslookup "$SCOPE_DOMAIN" 8.8.8.8 | grep -A1 "Name:" | tail -1 | awk '{print $2}' 2>/dev/null || echo "unknown") + log info " ✅ Domain $SCOPE_DOMAIN resolves to: $RESOLVED_IP" - if [ -n "$OBSERVED_GEN" ] && [ "$OBSERVED_GEN" -ge 1 ] 2>/dev/null; then - log info " ✅ DNSEndpoint $DNS_ENDPOINT_NAME processed by external-dns (observedGeneration=$OBSERVED_GEN)" - log info " ✅ DNS record for $SCOPE_DOMAIN has been created in Route53" break fi - log debug "📋 DNSEndpoint not yet processed by external-dns (observedGeneration=${OBSERVED_GEN:-unset}), waiting 10s..." + log debug "📋 DNS record not yet available, waiting 10s..." 
sleep 10 done From ae40e108c90a68a44fbe9f4d3b3f1c638ecbed7e Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Mon, 4 May 2026 14:34:57 -0300 Subject: [PATCH 11/56] fix(wait_on_balancer): fix nslookup IP parsing to skip server address line Co-Authored-By: Claude Sonnet 4.6 --- k8s/scope/wait_on_balancer | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/scope/wait_on_balancer b/k8s/scope/wait_on_balancer index 972f4c02..bffa6394 100644 --- a/k8s/scope/wait_on_balancer +++ b/k8s/scope/wait_on_balancer @@ -44,7 +44,7 @@ case "$DNS_TYPE" in if nslookup "$SCOPE_DOMAIN" 8.8.8.8 >/dev/null 2>&1; then log info " ✅ DNS record for $SCOPE_DOMAIN is now resolvable" - RESOLVED_IP=$(nslookup "$SCOPE_DOMAIN" 8.8.8.8 | grep -A1 "Name:" | tail -1 | awk '{print $2}' 2>/dev/null || echo "unknown") + RESOLVED_IP=$(nslookup "$SCOPE_DOMAIN" 8.8.8.8 2>/dev/null | awk '/^Address/ && !/8\.8\.8\.8/ {print $2; exit}' || echo "unknown") log info " ✅ Domain $SCOPE_DOMAIN resolves to: $RESOLVED_IP" break From fb8ff48e321233b6fa73f48948c04c600aa6883c Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Mon, 4 May 2026 15:18:48 -0300 Subject: [PATCH 12/56] fix(dns): add dns/zone-type label to DNSEndpoints to prevent cross-zone record creation Both external-dns-public and external-dns-private controllers process all DNSEndpoints, causing public scope records to appear in the private hosted zone and vice versa. Add a dns/zone-type label (public|private) derived from SCOPE_VISIBILITY so each controller can filter only the records it owns via --label-filter. 
Co-Authored-By: Claude Sonnet 4.6 --- k8s/deployment/templates/dns-endpoint.yaml.tpl | 1 + k8s/scope/networking/dns/external_dns/manage_route | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/k8s/deployment/templates/dns-endpoint.yaml.tpl b/k8s/deployment/templates/dns-endpoint.yaml.tpl index 8b6112f2..1c98848c 100644 --- a/k8s/deployment/templates/dns-endpoint.yaml.tpl +++ b/k8s/deployment/templates/dns-endpoint.yaml.tpl @@ -13,6 +13,7 @@ metadata: application_id: "{{ .application.id }}" scope: {{ .scope.slug }} scope_id: "{{ .scope.id }}" + dns/zone-type: {{ .dns_zone_type | default "public" }} spec: endpoints: - dnsName: {{ .scope.domain }} diff --git a/k8s/scope/networking/dns/external_dns/manage_route b/k8s/scope/networking/dns/external_dns/manage_route index 65e05bed..b161123e 100644 --- a/k8s/scope/networking/dns/external_dns/manage_route +++ b/k8s/scope/networking/dns/external_dns/manage_route @@ -59,8 +59,15 @@ if [ "$ACTION" = "CREATE" ]; then DNS_ENDPOINT_FILE="$OUTPUT_DIR/dns-endpoint-$SCOPE_ID.yaml" CONTEXT_PATH="$OUTPUT_DIR/context-$SCOPE_ID-dns.json" + if [ "${SCOPE_VISIBILITY:-}" = "public" ]; then + DNS_ZONE_TYPE="public" + else + DNS_ZONE_TYPE="private" + fi + echo "$CONTEXT" | jq --arg gateway_ip "$GATEWAY_IP" --arg record_type "$RECORD_TYPE" \ - '. + {gateway_ip: $gateway_ip, record_type: $record_type}' > "$CONTEXT_PATH" + --arg dns_zone_type "$DNS_ZONE_TYPE" \ + '. 
+ {gateway_ip: $gateway_ip, record_type: $record_type, dns_zone_type: $dns_zone_type}' > "$CONTEXT_PATH" log debug "📝 Building DNSEndpoint from template: $DNS_ENDPOINT_TEMPLATE" From 6710a7d7c881bb279b4f1f6c402db8a960e03d72 Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Mon, 4 May 2026 16:25:37 -0300 Subject: [PATCH 13/56] fix(wait_on_balancer): use observedGeneration for private scopes instead of nslookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Private scopes use the internal Route53 hosted zone which is not resolvable via public DNS (8.8.8.8). Poll status.observedGeneration on the DNSEndpoint instead — set to >=1 by external-dns when the record is processed. Public scopes keep the existing nslookup-based check. Co-Authored-By: Claude Sonnet 4.6 --- k8s/scope/wait_on_balancer | 110 ++++++++++++++++++++++++------------- 1 file changed, 71 insertions(+), 39 deletions(-) diff --git a/k8s/scope/wait_on_balancer b/k8s/scope/wait_on_balancer index bffa6394..5aebcca1 100644 --- a/k8s/scope/wait_on_balancer +++ b/k8s/scope/wait_on_balancer @@ -11,48 +11,80 @@ case "$DNS_TYPE" in SCOPE_DOMAIN=$(echo "$CONTEXT" | jq -r '.scope.domain') SCOPE_SLUG=$(echo "$CONTEXT" | jq -r '.scope.slug') SCOPE_ID=$(echo "$CONTEXT" | jq -r '.scope.id') + DNS_ENDPOINT_NAME="k-8-s-${SCOPE_SLUG}-${SCOPE_ID}-dns" log debug "📋 Checking ExternalDNS record creation for domain: $SCOPE_DOMAIN" - while true; do - iteration=$((iteration + 1)) - if [ $iteration -gt $MAX_ITERATIONS ]; then - log error "" - log error " ❌ DNS record creation timeout after $((MAX_ITERATIONS * 10))s" - log error "" - log error "💡 Possible causes:" - log error " ExternalDNS may still be processing the DNSEndpoint resource" - log error "" - log error "🔧 How to fix:" - log error " • Check DNSEndpoint resources: kubectl get dnsendpoint -A" - log error " • Check ExternalDNS logs: kubectl logs -n external-dns -l app=external-dns --tail=50" - log error "" - exit 1 - fi - - log 
debug "🔍 Checking DNS resolution for $SCOPE_DOMAIN (attempt $iteration/$MAX_ITERATIONS)" - - DNS_ENDPOINT_NAME="k-8-s-${SCOPE_SLUG}-${SCOPE_ID}-dns" - log debug "📋 Checking DNSEndpoint status: $DNS_ENDPOINT_NAME" - - DNS_STATUS=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" -o jsonpath='{.status}' 2>/dev/null || echo "not found") - - if [ "$DNS_STATUS" != "not found" ] && [ -n "$DNS_STATUS" ]; then - log debug "📋 DNSEndpoint status: $DNS_STATUS" - fi - - if nslookup "$SCOPE_DOMAIN" 8.8.8.8 >/dev/null 2>&1; then - log info " ✅ DNS record for $SCOPE_DOMAIN is now resolvable" - - RESOLVED_IP=$(nslookup "$SCOPE_DOMAIN" 8.8.8.8 2>/dev/null | awk '/^Address/ && !/8\.8\.8\.8/ {print $2; exit}' || echo "unknown") - log info " ✅ Domain $SCOPE_DOMAIN resolves to: $RESOLVED_IP" - - break - fi - - log debug "📋 DNS record not yet available, waiting 10s..." - sleep 10 - done + # Private scopes use the internal hosted zone — not resolvable via public DNS. + # For those, wait for observedGeneration instead of DNS resolution. 
+ if [ "${SCOPE_VISIBILITY:-}" != "public" ]; then + log debug "📋 Private scope — waiting for DNSEndpoint observedGeneration (not resolvable via public DNS)" + + while true; do + iteration=$((iteration + 1)) + if [ $iteration -gt $MAX_ITERATIONS ]; then + log error "" + log error " ❌ DNSEndpoint processing timeout after $((MAX_ITERATIONS * 10))s" + log error "" + log error "💡 Possible causes:" + log error " ExternalDNS may still be processing the DNSEndpoint resource" + log error "" + log error "🔧 How to fix:" + log error " • Check DNSEndpoint resources: kubectl get dnsendpoint -A" + log error " • Check ExternalDNS logs: kubectl logs -n external-dns -l app=external-dns --tail=50" + log error "" + exit 1 + fi + + log debug "🔍 Checking DNSEndpoint status: $DNS_ENDPOINT_NAME (attempt $iteration/$MAX_ITERATIONS)" + + OBSERVED_GEN=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" \ + -o jsonpath='{.status.observedGeneration}' 2>/dev/null || echo "") + + if [ "${OBSERVED_GEN:-0}" -ge 1 ] 2>/dev/null; then + log info " ✅ DNSEndpoint $DNS_ENDPOINT_NAME processed by ExternalDNS (observedGeneration=$OBSERVED_GEN)" + break + fi + + log debug "📋 DNSEndpoint not yet processed, waiting 10s..." 
+ sleep 10 + done + else + while true; do + iteration=$((iteration + 1)) + if [ $iteration -gt $MAX_ITERATIONS ]; then + log error "" + log error " ❌ DNS record creation timeout after $((MAX_ITERATIONS * 10))s" + log error "" + log error "💡 Possible causes:" + log error " ExternalDNS may still be processing the DNSEndpoint resource" + log error "" + log error "🔧 How to fix:" + log error " • Check DNSEndpoint resources: kubectl get dnsendpoint -A" + log error " • Check ExternalDNS logs: kubectl logs -n external-dns -l app=external-dns --tail=50" + log error "" + exit 1 + fi + + log debug "🔍 Checking DNS resolution for $SCOPE_DOMAIN (attempt $iteration/$MAX_ITERATIONS)" + log debug "📋 Checking DNSEndpoint status: $DNS_ENDPOINT_NAME" + + DNS_STATUS=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" -o jsonpath='{.status}' 2>/dev/null || echo "not found") + if [ "$DNS_STATUS" != "not found" ] && [ -n "$DNS_STATUS" ]; then + log debug "📋 DNSEndpoint status: $DNS_STATUS" + fi + + if nslookup "$SCOPE_DOMAIN" 8.8.8.8 >/dev/null 2>&1; then + log info " ✅ DNS record for $SCOPE_DOMAIN is now resolvable" + RESOLVED_IP=$(nslookup "$SCOPE_DOMAIN" 8.8.8.8 2>/dev/null | awk '/^Address/ && !/8\.8\.8\.8/ {print $2; exit}' || echo "unknown") + log info " ✅ Domain $SCOPE_DOMAIN resolves to: $RESOLVED_IP" + break + fi + + log debug "📋 DNS record not yet available, waiting 10s..." + sleep 10 + done + fi log info "" log info "✨ ExternalDNS setup completed successfully" From d2a7eefa559e6a2dcb606d711373a7d00792f3fe Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Tue, 5 May 2026 10:00:39 -0300 Subject: [PATCH 14/56] refactor(wait_on_balancer): use observedGeneration for all external_dns scopes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace nslookup-based DNS resolution check with DNSEndpoint observedGeneration polling for all scopes (public and private). nslookup was unreliable due to high cluster DNS cache TTL. 
observedGeneration is set by external-dns when it processes the record — faster and works consistently regardless of zone visibility. Co-Authored-By: Claude Sonnet 4.6 --- k8s/scope/wait_on_balancer | 99 +++++++++++--------------------------- 1 file changed, 29 insertions(+), 70 deletions(-) diff --git a/k8s/scope/wait_on_balancer b/k8s/scope/wait_on_balancer index 5aebcca1..d3895725 100644 --- a/k8s/scope/wait_on_balancer +++ b/k8s/scope/wait_on_balancer @@ -15,76 +15,35 @@ case "$DNS_TYPE" in log debug "📋 Checking ExternalDNS record creation for domain: $SCOPE_DOMAIN" - # Private scopes use the internal hosted zone — not resolvable via public DNS. - # For those, wait for observedGeneration instead of DNS resolution. - if [ "${SCOPE_VISIBILITY:-}" != "public" ]; then - log debug "📋 Private scope — waiting for DNSEndpoint observedGeneration (not resolvable via public DNS)" - - while true; do - iteration=$((iteration + 1)) - if [ $iteration -gt $MAX_ITERATIONS ]; then - log error "" - log error " ❌ DNSEndpoint processing timeout after $((MAX_ITERATIONS * 10))s" - log error "" - log error "💡 Possible causes:" - log error " ExternalDNS may still be processing the DNSEndpoint resource" - log error "" - log error "🔧 How to fix:" - log error " • Check DNSEndpoint resources: kubectl get dnsendpoint -A" - log error " • Check ExternalDNS logs: kubectl logs -n external-dns -l app=external-dns --tail=50" - log error "" - exit 1 - fi - - log debug "🔍 Checking DNSEndpoint status: $DNS_ENDPOINT_NAME (attempt $iteration/$MAX_ITERATIONS)" - - OBSERVED_GEN=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" \ - -o jsonpath='{.status.observedGeneration}' 2>/dev/null || echo "") - - if [ "${OBSERVED_GEN:-0}" -ge 1 ] 2>/dev/null; then - log info " ✅ DNSEndpoint $DNS_ENDPOINT_NAME processed by ExternalDNS (observedGeneration=$OBSERVED_GEN)" - break - fi - - log debug "📋 DNSEndpoint not yet processed, waiting 10s..." 
- sleep 10 - done - else - while true; do - iteration=$((iteration + 1)) - if [ $iteration -gt $MAX_ITERATIONS ]; then - log error "" - log error " ❌ DNS record creation timeout after $((MAX_ITERATIONS * 10))s" - log error "" - log error "💡 Possible causes:" - log error " ExternalDNS may still be processing the DNSEndpoint resource" - log error "" - log error "🔧 How to fix:" - log error " • Check DNSEndpoint resources: kubectl get dnsendpoint -A" - log error " • Check ExternalDNS logs: kubectl logs -n external-dns -l app=external-dns --tail=50" - log error "" - exit 1 - fi - - log debug "🔍 Checking DNS resolution for $SCOPE_DOMAIN (attempt $iteration/$MAX_ITERATIONS)" - log debug "📋 Checking DNSEndpoint status: $DNS_ENDPOINT_NAME" - - DNS_STATUS=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" -o jsonpath='{.status}' 2>/dev/null || echo "not found") - if [ "$DNS_STATUS" != "not found" ] && [ -n "$DNS_STATUS" ]; then - log debug "📋 DNSEndpoint status: $DNS_STATUS" - fi - - if nslookup "$SCOPE_DOMAIN" 8.8.8.8 >/dev/null 2>&1; then - log info " ✅ DNS record for $SCOPE_DOMAIN is now resolvable" - RESOLVED_IP=$(nslookup "$SCOPE_DOMAIN" 8.8.8.8 2>/dev/null | awk '/^Address/ && !/8\.8\.8\.8/ {print $2; exit}' || echo "unknown") - log info " ✅ Domain $SCOPE_DOMAIN resolves to: $RESOLVED_IP" - break - fi - - log debug "📋 DNS record not yet available, waiting 10s..." 
- sleep 10 - done - fi + while true; do + iteration=$((iteration + 1)) + if [ $iteration -gt $MAX_ITERATIONS ]; then + log error "" + log error " ❌ DNSEndpoint processing timeout after $((MAX_ITERATIONS * 10))s" + log error "" + log error "💡 Possible causes:" + log error " ExternalDNS may still be processing the DNSEndpoint resource" + log error "" + log error "🔧 How to fix:" + log error " • Check DNSEndpoint resources: kubectl get dnsendpoint -A" + log error " • Check ExternalDNS logs: kubectl logs -n external-dns -l app=external-dns --tail=50" + log error "" + exit 1 + fi + + log debug "🔍 Checking DNSEndpoint status: $DNS_ENDPOINT_NAME (attempt $iteration/$MAX_ITERATIONS)" + + OBSERVED_GEN=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" \ + -o jsonpath='{.status.observedGeneration}' 2>/dev/null || echo "") + + if [ "${OBSERVED_GEN:-0}" -ge 1 ] 2>/dev/null; then + log info " ✅ DNSEndpoint $DNS_ENDPOINT_NAME processed by ExternalDNS (observedGeneration=$OBSERVED_GEN)" + break + fi + + log debug "📋 DNSEndpoint not yet processed, waiting 10s..." 
+ sleep 10 + done log info "" log info "✨ ExternalDNS setup completed successfully" From 83405ddbfff2175c58914a15f575a7a0d24160bf Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Wed, 6 May 2026 11:48:10 -0300 Subject: [PATCH 15/56] fix(dns): address PR review - remove DNS management from deployment verify step and improve endpoint naming - Remove manage_route call and kubectl apply from verify_networking_reconciliation; DNS record creation belongs to scope creation flow, not deployment verification - Include application.slug in DNSEndpoint name (k8s-{app}-{scope}-{id}-dns) to distinguish scopes with the same name across different apps - Truncate app/scope slugs to 20 chars each to respect K8s name length limits - Update dns-endpoint.yaml.tpl to use new naming via gomplate strings.Trunc - Fix wait_on_balancer.bats: rewrite tests to match observedGeneration logic (previous tests referenced removed nslookup checks) - Fix manage_route.bats: correct wrong log message assertions and update expected DNSEndpoint name to new format Co-Authored-By: Claude Sonnet 4.6 --- .../templates/dns-endpoint.yaml.tpl | 2 +- .../verify_networking_reconciliation.bats | 55 ++------ .../verify_networking_reconciliation | 15 -- .../networking/dns/external_dns/manage_route | 5 +- .../dns/external_dns/manage_route.bats | 14 +- k8s/scope/tests/wait_on_balancer.bats | 129 +++++++++--------- k8s/scope/wait_on_balancer | 5 +- 7 files changed, 89 insertions(+), 136 deletions(-) diff --git a/k8s/deployment/templates/dns-endpoint.yaml.tpl b/k8s/deployment/templates/dns-endpoint.yaml.tpl index 1c98848c..0e8ccf45 100644 --- a/k8s/deployment/templates/dns-endpoint.yaml.tpl +++ b/k8s/deployment/templates/dns-endpoint.yaml.tpl @@ -1,7 +1,7 @@ apiVersion: externaldns.k8s.io/v1alpha1 kind: DNSEndpoint metadata: - name: k-8-s-{{ .scope.slug }}-{{ .scope.id }}-dns + name: k8s-{{ .application.slug | strings.Trunc 20 | strings.TrimSuffix "-" }}-{{ .scope.slug | strings.Trunc 20 | strings.TrimSuffix "-" }}-{{ 
.scope.id }}-dns namespace: {{ .k8s_namespace }} labels: nullplatform: "true" diff --git a/k8s/deployment/tests/verify_networking_reconciliation.bats b/k8s/deployment/tests/verify_networking_reconciliation.bats index 93a7dd82..424a0e10 100644 --- a/k8s/deployment/tests/verify_networking_reconciliation.bats +++ b/k8s/deployment/tests/verify_networking_reconciliation.bats @@ -44,67 +44,30 @@ teardown() { assert_contains "$output" "⚠️ Skipping ALB verification (ALB access needed for blue-green traffic validation)" } -@test "verify_networking_reconciliation: creates DNSEndpoint and verifies HTTPRoute for external_dns" { +@test "verify_networking_reconciliation: verifies HTTPRoute for external_dns without managing DNS" { export DNS_TYPE="external_dns" - export SCOPE_VISIBILITY="internal" - export PRIVATE_GATEWAY_NAME="gateway-private" - export PUBLIC_GATEWAY_NAME="gateway-public" export SCOPE_ID="123" export K8S_NAMESPACE="nullplatform" - export OUTPUT_DIR="$(mktemp -d)" + export INGRESS_VISIBILITY="public" + export MAX_WAIT_SECONDS="10" + export CHECK_INTERVAL="10" export CONTEXT='{"scope":{"slug":"my-app","id":"123","domain":"app.example.com"}}' run bash -c " kubectl() { - if [ \"\$1\" = 'get' ]; then - echo '{\"status\":{\"addresses\":[{\"type\":\"Hostname\",\"value\":\"my-alb.us-east-1.elb.amazonaws.com\"}]}}' - return 0 - fi - if [ \"\$1\" = 'apply' ]; then - echo 'dnsendpoint.externaldns.k8s.io/k-8-s-my-app-123-dns applied' - return 0 - fi - if [ \"\$1\" = 'httproute' ] || [ \"\$2\" = 'httproute' ]; then - echo '{\"status\":{\"parents\":[{\"conditions\":[{\"type\":\"Accepted\",\"status\":\"True\",\"reason\":\"Accepted\"},{\"type\":\"ResolvedRefs\",\"status\":\"True\",\"reason\":\"ResolvedRefs\"}]}]}}' - return 0 - fi + echo '{\"status\":{\"parents\":[{\"conditions\":[{\"type\":\"Accepted\",\"status\":\"True\",\"reason\":\"Accepted\"},{\"type\":\"ResolvedRefs\",\"status\":\"True\",\"reason\":\"ResolvedRefs\"}]}]}}' return 0 } export -f kubectl - gomplate() { echo 
'rendered'; return 0; } - export -f gomplate + sleep() { return 0; } + export -f sleep source '$BATS_TEST_DIRNAME/../verify_networking_reconciliation' " [ "$status" -eq 0 ] assert_contains "$output" "🔍 Verifying networking reconciliation for DNS type: external_dns" - assert_contains "$output" "✅ DNSEndpoint applied to cluster" - - rm -rf "$OUTPUT_DIR" -} - -@test "verify_networking_reconciliation: uses public gateway when scope is not internal for external_dns" { - export DNS_TYPE="external_dns" - export SCOPE_VISIBILITY="public" - export PRIVATE_GATEWAY_NAME="gateway-private" - export PUBLIC_GATEWAY_NAME="gateway-public" - export SCOPE_ID="456" - export K8S_NAMESPACE="nullplatform" - export OUTPUT_DIR="$(mktemp -d)" - export CONTEXT='{"scope":{"slug":"my-app","id":"456","domain":"app.example.com"}}' - - run bash -c " - kubectl() { echo '{}'; return 0; } - export -f kubectl - gomplate() { echo 'rendered'; return 0; } - export -f gomplate - GATEWAY_NAME_USED='' - source '$BATS_TEST_DIRNAME/../verify_networking_reconciliation' - " - - assert_contains "$output" "gateway-public" - - rm -rf "$OUTPUT_DIR" + assert_contains "$output" "🔍 Verifying HTTPRoute reconciliation..." 
+ assert_contains "$output" "✅ HTTPRoute successfully reconciled" } @test "verify_networking_reconciliation: skips for unsupported DNS types" { diff --git a/k8s/deployment/verify_networking_reconciliation b/k8s/deployment/verify_networking_reconciliation index dd5927f3..506e57f5 100644 --- a/k8s/deployment/verify_networking_reconciliation +++ b/k8s/deployment/verify_networking_reconciliation @@ -8,21 +8,6 @@ case "$DNS_TYPE" in source "$SERVICE_PATH/deployment/verify_ingress_reconciliation" ;; external_dns) - if [ "${SCOPE_VISIBILITY:-}" = "internal" ]; then - GATEWAY_NAME="${PRIVATE_GATEWAY_NAME:-gateway-private}" - else - GATEWAY_NAME="${PUBLIC_GATEWAY_NAME:-gateway-public}" - fi - - export ACTION="CREATE" GATEWAY_NAME="$GATEWAY_NAME" - source "$SERVICE_PATH/scope/networking/dns/external_dns/manage_route" - - DNS_ENDPOINT_FILE="$OUTPUT_DIR/dns-endpoint-$SCOPE_ID.yaml" - if [ -f "$DNS_ENDPOINT_FILE" ]; then - kubectl apply -f "$DNS_ENDPOINT_FILE" - log info "✅ DNSEndpoint applied to cluster" - fi - source "$SERVICE_PATH/deployment/verify_http_route_reconciliation" ;; *) diff --git a/k8s/scope/networking/dns/external_dns/manage_route b/k8s/scope/networking/dns/external_dns/manage_route index b161123e..204f1dab 100644 --- a/k8s/scope/networking/dns/external_dns/manage_route +++ b/k8s/scope/networking/dns/external_dns/manage_route @@ -95,7 +95,10 @@ elif [ "$ACTION" = "DELETE" ]; then log debug "🔍 Deleting DNSEndpoint for external_dns..." 
SCOPE_SLUG=$(echo "$CONTEXT" | jq -r '.scope.slug') - DNS_ENDPOINT_NAME="k-8-s-${SCOPE_SLUG}-${SCOPE_ID}-dns" + APP_SLUG=$(echo "$CONTEXT" | jq -r '.application.slug') + APP_SLUG_SHORT="${APP_SLUG:0:20}"; APP_SLUG_SHORT="${APP_SLUG_SHORT%-}" + SCOPE_SLUG_SHORT="${SCOPE_SLUG:0:20}"; SCOPE_SLUG_SHORT="${SCOPE_SLUG_SHORT%-}" + DNS_ENDPOINT_NAME="k8s-${APP_SLUG_SHORT}-${SCOPE_SLUG_SHORT}-${SCOPE_ID}-dns" log debug "📝 Deleting DNSEndpoint: $DNS_ENDPOINT_NAME in namespace $K8S_NAMESPACE" kubectl delete dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" || { log warn "⚠️ DNSEndpoint '$DNS_ENDPOINT_NAME' may already be deleted" diff --git a/k8s/scope/tests/networking/dns/external_dns/manage_route.bats b/k8s/scope/tests/networking/dns/external_dns/manage_route.bats index db1563b4..e3db72b0 100644 --- a/k8s/scope/tests/networking/dns/external_dns/manage_route.bats +++ b/k8s/scope/tests/networking/dns/external_dns/manage_route.bats @@ -17,7 +17,7 @@ setup() { export SCOPE_ID="scope-123" export SCOPE_DOMAIN="myapp.example.com" export K8S_NAMESPACE="test-ns" - export CONTEXT='{"scope":{"slug":"my-app"}}' + export CONTEXT='{"scope":{"slug":"my-scope"},"application":{"slug":"my-app"}}' export OUTPUT_DIR="$(mktemp -d)" # Mock kubectl - default: gateway returns IP @@ -70,7 +70,7 @@ teardown() { [ "$status" -eq 0 ] assert_contains "$output" "🔍 Building DNSEndpoint manifest for ExternalDNS..." assert_contains "$output" "📡 Getting IP for gateway: gw-public" - assert_contains "$output" "✅ Gateway IP: 10.0.0.1" + assert_contains "$output" "✅ Gateway address: 10.0.0.1 (recordType: A)" assert_contains "$output" "📝 Building DNSEndpoint from template:" assert_contains "$output" "✅ DNSEndpoint manifest created:" } @@ -98,8 +98,8 @@ teardown() { run bash "$SCRIPT" [ "$status" -eq 0 ] - assert_contains "$output" "⚠️ Gateway IP not found, trying service fallback..." 
- assert_contains "$output" "✅ Gateway IP: 10.0.0.2" + assert_contains "$output" "⚠️ Gateway hostname not found, trying service fallback..." + assert_contains "$output" "✅ Gateway address: 10.0.0.2 (recordType: A)" } # ============================================================================= @@ -158,7 +158,7 @@ teardown() { [ "$status" -eq 0 ] assert_contains "$output" "🔍 Deleting DNSEndpoint for external_dns..." - assert_contains "$output" "📝 Deleting DNSEndpoint: k-8-s-my-app-scope-123-dns in namespace test-ns" + assert_contains "$output" "📝 Deleting DNSEndpoint: k8s-my-app-my-scope-scope-123-dns in namespace test-ns" assert_contains "$output" "✅ DNSEndpoint deletion completed" } @@ -180,7 +180,7 @@ teardown() { run bash "$SCRIPT" [ "$status" -eq 0 ] - assert_contains "$output" "📝 Deleting DNSEndpoint: k-8-s-my-app-scope-123-dns in namespace test-ns" - assert_contains "$output" "⚠️ DNSEndpoint 'k-8-s-my-app-scope-123-dns' may already be deleted" + assert_contains "$output" "📝 Deleting DNSEndpoint: k8s-my-app-my-scope-scope-123-dns in namespace test-ns" + assert_contains "$output" "⚠️ DNSEndpoint 'k8s-my-app-my-scope-scope-123-dns' may already be deleted" assert_contains "$output" "✅ DNSEndpoint deletion completed" } diff --git a/k8s/scope/tests/wait_on_balancer.bats b/k8s/scope/tests/wait_on_balancer.bats index 4d111db8..83d384d4 100644 --- a/k8s/scope/tests/wait_on_balancer.bats +++ b/k8s/scope/tests/wait_on_balancer.bats @@ -22,6 +22,9 @@ setup() { "id": "scope-123", "slug": "my-scope", "domain": "my-scope.example.com" + }, + "application": { + "slug": "my-app" } }' @@ -31,11 +34,11 @@ setup() { } export -f sleep - # Mock kubectl: DNS endpoint found with status by default + # Mock kubectl: DNSEndpoint found with observedGeneration=1 by default kubectl() { case "$*" in - "get dnsendpoint k-8-s-my-scope-scope-123-dns -n default-namespace -o jsonpath={.status}") - echo '{"observedGeneration":1}' + "get dnsendpoint k8s-my-app-my-scope-scope-123-dns -n 
default-namespace -o jsonpath={.status.observedGeneration}") + echo "1" return 0 ;; *) @@ -44,29 +47,10 @@ setup() { esac } export -f kubectl - - # Mock nslookup: resolves on first attempt by default - nslookup() { - case "$1" in - "my-scope.example.com") - if [ "$2" = "8.8.8.8" ]; then - echo "Server: 8.8.8.8" - echo "Address: 8.8.8.8#53" - echo "" - echo "Name: my-scope.example.com" - echo "Address: 10.0.0.1" - return 0 - fi - ;; - esac - return 1 - } - export -f nslookup } teardown() { unset -f kubectl - unset -f nslookup unset -f sleep } @@ -79,11 +63,8 @@ teardown() { [ "$status" -eq 0 ] assert_contains "$output" "🔍 Waiting for balancer/DNS setup to complete..." assert_contains "$output" "📋 Checking ExternalDNS record creation for domain: my-scope.example.com" - assert_contains "$output" "🔍 Checking DNS resolution for my-scope.example.com (attempt 1/" - assert_contains "$output" "📋 Checking DNSEndpoint status: k-8-s-my-scope-scope-123-dns" - assert_contains "$output" "📋 DNSEndpoint status:" - assert_contains "$output" "✅ DNS record for my-scope.example.com is now resolvable" - assert_contains "$output" "✅ Domain my-scope.example.com resolves to:" + assert_contains "$output" "🔍 Checking DNSEndpoint status: k8s-my-app-my-scope-scope-123-dns (attempt 1/" + assert_contains "$output" "✅ DNSEndpoint k8s-my-app-my-scope-scope-123-dns processed by ExternalDNS (observedGeneration=1)" assert_contains "$output" "✨ ExternalDNS setup completed successfully" } @@ -91,28 +72,32 @@ teardown() { # external_dns: Success after retries # ============================================================================= @test "wait_on_balancer: external_dns success after retries" { - local attempt=0 - nslookup() { - attempt=$((attempt + 1)) - if [ "$attempt" -ge 2 ] && [ "$1" = "my-scope.example.com" ] && [ "$2" = "8.8.8.8" ]; then - echo "Server: 8.8.8.8" - echo "Address: 8.8.8.8#53" - echo "" - echo "Name: my-scope.example.com" - echo "Address: 10.0.0.1" - return 0 - fi - return 1 + 
local call_count=0 + kubectl() { + call_count=$((call_count + 1)) + case "$*" in + "get dnsendpoint k8s-my-app-my-scope-scope-123-dns -n default-namespace -o jsonpath={.status.observedGeneration}") + if [ "$call_count" -ge 2 ]; then + echo "1" + return 0 + fi + echo "" + return 0 + ;; + *) + return 0 + ;; + esac } - export -f nslookup + export -f kubectl run bash "$BATS_TEST_DIRNAME/../wait_on_balancer" [ "$status" -eq 0 ] - assert_contains "$output" "🔍 Checking DNS resolution for my-scope.example.com (attempt 1/" - assert_contains "$output" "📋 DNS record not yet available, waiting 10s..." - assert_contains "$output" "🔍 Checking DNS resolution for my-scope.example.com (attempt 2/" - assert_contains "$output" "✅ DNS record for my-scope.example.com is now resolvable" + assert_contains "$output" "🔍 Checking DNSEndpoint status: k8s-my-app-my-scope-scope-123-dns (attempt 1/" + assert_contains "$output" "📋 DNSEndpoint not yet processed, waiting 10s..." + assert_contains "$output" "🔍 Checking DNSEndpoint status: k8s-my-app-my-scope-scope-123-dns (attempt 2/" + assert_contains "$output" "✅ DNSEndpoint k8s-my-app-my-scope-scope-123-dns processed by ExternalDNS (observedGeneration=1)" assert_contains "$output" "✨ ExternalDNS setup completed successfully" } @@ -122,15 +107,16 @@ teardown() { @test "wait_on_balancer: external_dns timeout after MAX_ITERATIONS" { export MAX_ITERATIONS=2 - nslookup() { - return 1 + kubectl() { + echo "" + return 0 } - export -f nslookup + export -f kubectl run bash "$BATS_TEST_DIRNAME/../wait_on_balancer" [ "$status" -eq 1 ] - assert_contains "$output" "❌ DNS record creation timeout after 20s" + assert_contains "$output" "❌ DNSEndpoint processing timeout after 20s" assert_contains "$output" "💡 Possible causes:" assert_contains "$output" "ExternalDNS may still be processing the DNSEndpoint resource" assert_contains "$output" "🔧 How to fix:" @@ -139,35 +125,47 @@ teardown() { } # 
============================================================================= -# external_dns: DNS endpoint not found but keeps trying +# external_dns: DNSEndpoint not found - keeps trying until timeout # ============================================================================= -@test "wait_on_balancer: external_dns DNS endpoint not found but keeps trying until resolved" { +@test "wait_on_balancer: external_dns DNS endpoint not found keeps retrying until timeout" { + export MAX_ITERATIONS=2 + kubectl() { - case "$*" in - "get dnsendpoint k-8-s-my-scope-scope-123-dns -n default-namespace -o jsonpath={.status}") - echo "not found" - return 1 - ;; - esac + return 1 } export -f kubectl run bash "$BATS_TEST_DIRNAME/../wait_on_balancer" - [ "$status" -eq 0 ] - assert_contains "$output" "📋 Checking DNSEndpoint status: k-8-s-my-scope-scope-123-dns" - assert_contains "$output" "✅ DNS record for my-scope.example.com is now resolvable" - assert_contains "$output" "✨ ExternalDNS setup completed successfully" + [ "$status" -eq 1 ] + assert_contains "$output" "🔍 Checking DNSEndpoint status: k8s-my-app-my-scope-scope-123-dns" + assert_contains "$output" "📋 DNSEndpoint not yet processed, waiting 10s..." 
+ assert_contains "$output" "❌ DNSEndpoint processing timeout after 20s" } # ============================================================================= -# external_dns: DNS endpoint found with status +# external_dns: APP_SLUG truncated to 20 chars in endpoint name # ============================================================================= -@test "wait_on_balancer: external_dns DNS endpoint found with status is displayed" { +@test "wait_on_balancer: external_dns truncates APP_SLUG to 20 chars in endpoint name" { + export CONTEXT='{ + "scope": { + "id": "123", + "slug": "qa", + "domain": "qa.example.com" + }, + "application": { + "slug": "very-long-application-name-that-exceeds-limit" + } + }' + kubectl() { case "$*" in - "get dnsendpoint k-8-s-my-scope-scope-123-dns -n default-namespace -o jsonpath={.status}") - echo '{"observedGeneration":2}' + "get dnsendpoint k8s-very-long-applicatio-qa-123-dns -n default-namespace -o jsonpath={.status.observedGeneration}") + echo "1" + return 0 + ;; + *) + echo "" return 0 ;; esac @@ -177,7 +175,8 @@ teardown() { run bash "$BATS_TEST_DIRNAME/../wait_on_balancer" [ "$status" -eq 0 ] - assert_contains "$output" '📋 DNSEndpoint status: {"observedGeneration":2}' + assert_contains "$output" "k8s-very-long-applicatio-qa-123-dns" + assert_contains "$output" "✨ ExternalDNS setup completed successfully" } # ============================================================================= diff --git a/k8s/scope/wait_on_balancer b/k8s/scope/wait_on_balancer index d3895725..bde5cfec 100644 --- a/k8s/scope/wait_on_balancer +++ b/k8s/scope/wait_on_balancer @@ -11,7 +11,10 @@ case "$DNS_TYPE" in SCOPE_DOMAIN=$(echo "$CONTEXT" | jq -r '.scope.domain') SCOPE_SLUG=$(echo "$CONTEXT" | jq -r '.scope.slug') SCOPE_ID=$(echo "$CONTEXT" | jq -r '.scope.id') - DNS_ENDPOINT_NAME="k-8-s-${SCOPE_SLUG}-${SCOPE_ID}-dns" + APP_SLUG=$(echo "$CONTEXT" | jq -r '.application.slug') + APP_SLUG_SHORT="${APP_SLUG:0:20}"; APP_SLUG_SHORT="${APP_SLUG_SHORT%-}" + 
SCOPE_SLUG_SHORT="${SCOPE_SLUG:0:20}"; SCOPE_SLUG_SHORT="${SCOPE_SLUG_SHORT%-}" + DNS_ENDPOINT_NAME="k8s-${APP_SLUG_SHORT}-${SCOPE_SLUG_SHORT}-${SCOPE_ID}-dns" log debug "📋 Checking ExternalDNS record creation for domain: $SCOPE_DOMAIN" From 354b47ce2e98ebca0e16b33dc93dbe4e7e2e9e44 Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Fri, 8 May 2026 11:51:57 -0300 Subject: [PATCH 16/56] chore(changelog): add 1.12.0 entry for external-dns DNSEndpoint support Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e0f4f23..9aa332d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.11.1] - 2026-05-08 +- Fix external-dns DNSEndpoint creation when `DNS_TYPE=external_dns` +- Add `dns/zone-type` label to DNSEndpoints to prevent cross-zone record creation +- Fix ALB hostname resolution for external-dns on AWS +- Fix DNS propagation detection using `observedGeneration` for external-dns scopes + ## [1.11.0] - 2026-04-16 - Add unit testing support - Add scope configuration From d6c24ade9cf15b9307b6796a7e5b5ba973aade74 Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Mon, 11 May 2026 10:32:35 -0300 Subject: [PATCH 17/56] chore(changelog): condense 1.11.1 entry into single functional description Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9aa332d6..eadb747a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [1.11.1] - 2026-05-08 -- Fix external-dns DNSEndpoint creation when `DNS_TYPE=external_dns` -- Add `dns/zone-type` label to DNSEndpoints to prevent cross-zone record creation -- Fix ALB hostname resolution for external-dns on AWS -- Fix DNS propagation detection using `observedGeneration` for external-dns scopes +- Use DNSEndpoint CRD for DNS management when `DNS_TYPE=external_dns`, with zone-type label filtering to isolate public/private Route53 records ## [1.11.0] - 2026-04-16 - Add unit testing support From fa6b7bad3611bd3f8a7fd7a61472fce774c3615f Mon Sep 17 00:00:00 2001 From: sebas_correa Date: Mon, 11 May 2026 13:57:52 -0300 Subject: [PATCH 18/56] chore(changelog): reframe 1.11.1 entry around user benefit Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eadb747a..e658b28d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [1.11.1] - 2026-05-08 -- Use DNSEndpoint CRD for DNS management when `DNS_TYPE=external_dns`, with zone-type label filtering to isolate public/private Route53 records +- Public and private scopes now register DNS records in their correct Route53 hosted zone when using `DNS_TYPE=external_dns`, preventing cross-zone record leakage ## [1.11.0] - 2026-04-16 - Add unit testing support From c0df99c191b4bdd2ba2f334ebf41ceb3d96ad823 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Tue, 5 May 2026 10:40:21 -0300 Subject: [PATCH 19/56] feat(k8s): configurable main_http_port and HTTP support for additional_ports --- CHANGELOG.md | 3 +- k8s/deployment/build_context | 8 +- .../templates/blue-green-ingress.yaml.tpl | 4 +- k8s/deployment/templates/deployment.yaml.tpl | 79 ++++++++++++++++--- .../templates/initial-ingress.yaml.tpl | 4 +- .../istio/blue-green-httproute.yaml.tpl | 4 +- .../istio/initial-httproute.yaml.tpl | 2 +- .../templates/istio/service.yaml.tpl | 2 +- k8s/deployment/templates/service.yaml.tpl | 2 +- k8s/deployment/tests/build_context.bats | 28 +++++++ k8s/docs/configurable-http-ports.md | 59 ++++++++++++++ k8s/specs/service-spec.json.tpl | 19 ++++- 12 files changed, 191 insertions(+), 23 deletions(-) create mode 100644 k8s/docs/configurable-http-ports.md diff --git a/CHANGELOG.md b/CHANGELOG.md index e658b28d..0299d06b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,9 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [1.11.1] - 2026-05-08 +## [Unreleased] - Public and private scopes now register DNS records in their correct Route53 hosted zone when using `DNS_TYPE=external_dns`, preventing cross-zone record leakage +- Add configurable main HTTP port for k8s scopes (default 8080) and HTTP support for additional ports ## [1.11.0] - 2026-04-16 - Add unit testing support diff --git a/k8s/deployment/build_context b/k8s/deployment/build_context index 0808681b..e15054c3 100755 --- a/k8s/deployment/build_context +++ b/k8s/deployment/build_context @@ -245,6 +245,10 @@ if [[ -n "$TRAFFIC_MANAGER_CONFIG_MAP" ]]; then log info "✨ ConfigMap '$TRAFFIC_MANAGER_CONFIG_MAP' validation successful" fi +# Main HTTP port — defaults to 8080 if not set on the scope (CLIEN-739) +MAIN_HTTP_PORT=$(echo "$CONTEXT" | jq -r '.scope.capabilities.main_http_port // 8080') +log debug "🔍 main_http_port resolved to ${MAIN_HTTP_PORT}" + # Check if blue deployment has K8s services for additional ports BLUE_ADDITIONAL_PORT_SERVICES="{}" if [ -n "$BLUE_DEPLOYMENT_ID" ] && [ "$BLUE_DEPLOYMENT_ID" != "null" ]; then @@ -280,6 +284,7 @@ CONTEXT=$(echo "$CONTEXT" | jq \ --arg container_memory_in_memory "$CONTAINER_MEMORY_IN_MEMORY" \ --arg container_cpu_in_millicores "$CONTAINER_CPU_IN_MILLICORES" \ --argjson blue_additional_port_services "$BLUE_ADDITIONAL_PORT_SERVICES" \ + --arg main_http_port "$MAIN_HTTP_PORT" \ '. 
+ {blue_deployment_id: $blue_deployment_id, blue_replicas: $blue_replicas, green_replicas: $green_replicas, @@ -292,7 +297,8 @@ CONTEXT=$(echo "$CONTEXT" | jq \ traffic_manager_config_map: $traffic_manager_config_map, container_memory_in_memory: $container_memory_in_memory, container_cpu_in_millicores: $container_cpu_in_millicores, - blue_additional_port_services: $blue_additional_port_services + blue_additional_port_services: $blue_additional_port_services, + main_http_port: ($main_http_port | tonumber) }') DEPLOYMENT_ID=$(echo "$CONTEXT" | jq -r '.deployment.id') diff --git a/k8s/deployment/templates/blue-green-ingress.yaml.tpl b/k8s/deployment/templates/blue-green-ingress.yaml.tpl index 20a0a5b0..18504151 100644 --- a/k8s/deployment/templates/blue-green-ingress.yaml.tpl +++ b/k8s/deployment/templates/blue-green-ingress.yaml.tpl @@ -30,8 +30,8 @@ metadata: annotations: alb.ingress.kubernetes.io/actions.bg-deployment: >- {"type":"forward","forwardConfig":{"targetGroups":[ - {"serviceName":"d-{{ .scope.id }}-{{ .blue_deployment_id }}","servicePort":8080,"weight":{{ sub 100 .deployment.strategy_data.desired_switched_traffic }}}, - {"serviceName":"d-{{ .scope.id }}-{{ .deployment.id }}","servicePort":8080,"weight":{{ .deployment.strategy_data.desired_switched_traffic }}} + {"serviceName":"d-{{ .scope.id }}-{{ .blue_deployment_id }}","servicePort":{{ .main_http_port }},"weight":{{ sub 100 .deployment.strategy_data.desired_switched_traffic }}}, + {"serviceName":"d-{{ .scope.id }}-{{ .deployment.id }}","servicePort":{{ .main_http_port }},"weight":{{ .deployment.strategy_data.desired_switched_traffic }}} ]}} alb.ingress.kubernetes.io/actions.response-404: '{"type":"fixed-response","fixedResponseConfig":{"contentType":"text/plain","statusCode":"404","messageBody":"404 scope not found or has not been deployed yet"}}' alb.ingress.kubernetes.io/group.name: {{ .alb_name }} diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl 
index 6acf1c95..c13d6cb3 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -135,6 +135,8 @@ spec: - containerPort: 80 protocol: TCP env: + - name: UPSTREAM_PORT + value: '{{ .main_http_port }}' - name: HEALTH_CHECK_TYPE value: http - name: GRACE_PERIOD @@ -151,7 +153,7 @@ spec: cpu: 31m livenessProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" 8080 }} + {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" .main_http_port }} {{- else }} {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 80 }} {{- end }} @@ -159,7 +161,7 @@ spec: failureThreshold: 9 readinessProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" 8080 }} + {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" .main_http_port }} {{- else }} {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 80 }} {{- end }} @@ -167,7 +169,7 @@ spec: failureThreshold: 3 startupProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" 8080 }} + {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" .main_http_port }} {{- else }} {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 80 }} {{- end }} @@ -229,6 +231,63 @@ spec: terminationMessagePath: /dev/termination-log terminationMessagePolicy: File 
imagePullPolicy: Always + {{ else if eq .type "HTTP" }} + - name: http-{{ .port }} + securityContext: + runAsUser: 0 + image: {{ $.traffic_image }} + ports: + - containerPort: {{ .port }} + protocol: TCP + env: + - name: UPSTREAM_PORT + value: '{{ .port }}' + - name: HEALTH_CHECK_TYPE + value: http + - name: GRACE_PERIOD + value: '15' + - name: LISTENER_PROTOCOL + value: http + - name: LISTENER_PORT + value: '{{ .port }}' + - name: HEALTH_CHECK_PATH + value: {{ $.scope.capabilities.health_check.path }} + resources: + limits: + cpu: {{ $.container_cpu_in_millicores }}m + memory: {{ $.container_memory_in_memory }}Mi + requests: + cpu: 31m + livenessProbe: + httpGet: + path: {{ $.scope.capabilities.health_check.path }} + port: {{ .port }} + timeoutSeconds: 5 + periodSeconds: 10 + initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} + successThreshold: 1 + failureThreshold: 9 + readinessProbe: + httpGet: + path: {{ $.scope.capabilities.health_check.path }} + port: {{ .port }} + timeoutSeconds: 5 + periodSeconds: 10 + initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} + successThreshold: 1 + failureThreshold: 3 + startupProbe: + httpGet: + path: {{ $.scope.capabilities.health_check.path }} + port: {{ .port }} + timeoutSeconds: 5 + periodSeconds: 10 + initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} + successThreshold: 1 + failureThreshold: 90 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + imagePullPolicy: Always {{ end }} {{ end }} {{ end }} @@ -241,7 +300,7 @@ spec: securityContext: runAsUser: 0 ports: - - containerPort: 8080 + - containerPort: {{ .main_http_port }} protocol: TCP {{ if .scope.capabilities.additional_ports }} {{ range .scope.capabilities.additional_ports }} @@ -258,25 +317,25 @@ spec: memory: {{ .scope.capabilities.ram_memory }}Mi livenessProbe: {{- if and (has .scope.capabilities.health_check "type") (eq 
.scope.capabilities.health_check.type "TCP") }} - {{- template "probe.app_tcp" dict "port" 8080 }} + {{- template "probe.app_tcp" dict "port" .main_http_port }} {{- else }} - {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 8080 }} + {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" .main_http_port }} {{- end }} {{- template "probe.base" dict "healthCheck" .scope.capabilities.health_check }} failureThreshold: 6 readinessProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.app_tcp" dict "port" 8080 }} + {{- template "probe.app_tcp" dict "port" .main_http_port }} {{- else }} - {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 8080 }} + {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" .main_http_port }} {{- end }} {{- template "probe.base" dict "healthCheck" .scope.capabilities.health_check }} failureThreshold: 3 startupProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.app_tcp" dict "port" 8080 }} + {{- template "probe.app_tcp" dict "port" .main_http_port }} {{- else }} - {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 8080 }} + {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" .main_http_port }} {{- end }} {{- template "probe.base" dict "healthCheck" .scope.capabilities.health_check }} failureThreshold: 90 diff --git a/k8s/deployment/templates/initial-ingress.yaml.tpl b/k8s/deployment/templates/initial-ingress.yaml.tpl index d2d099ac..d68e7e36 100644 --- a/k8s/deployment/templates/initial-ingress.yaml.tpl +++ b/k8s/deployment/templates/initial-ingress.yaml.tpl @@ -62,7 +62,7 @@ spec: service: name: d-{{ .scope.id }}-{{ .deployment.id }} port: - number: 8080 + number: {{ .main_http_port 
}} {{- range .scope.domains }} - host: {{ .name }} http: @@ -73,7 +73,7 @@ spec: service: name: d-{{ $.scope.id }}-{{ $.deployment.id }} port: - number: 8080 + number: {{ $.main_http_port }} {{- end }} {{ if .scope.capabilities.additional_ports }} {{ range .scope.capabilities.additional_ports }} diff --git a/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl b/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl index 5f45ad58..d3beef1e 100644 --- a/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl +++ b/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl @@ -59,13 +59,13 @@ spec: - group: "" kind: Service name: d-{{ .scope.id }}-{{ .blue_deployment_id }} - port: 8080 + port: {{ .main_http_port }} weight: {{ sub 100 .deployment.strategy_data.desired_switched_traffic }} # Green deployment (new version) - group: "" kind: Service name: d-{{ .scope.id }}-{{ .deployment.id }} - port: 8080 + port: {{ .main_http_port }} weight: {{ .deployment.strategy_data.desired_switched_traffic }} matches: - path: diff --git a/k8s/deployment/templates/istio/initial-httproute.yaml.tpl b/k8s/deployment/templates/istio/initial-httproute.yaml.tpl index f300a5d3..245e414e 100644 --- a/k8s/deployment/templates/istio/initial-httproute.yaml.tpl +++ b/k8s/deployment/templates/istio/initial-httproute.yaml.tpl @@ -58,7 +58,7 @@ spec: - group: "" kind: Service name: d-{{ .scope.id }}-{{ .deployment.id }} - port: 8080 + port: {{ .main_http_port }} weight: 1 matches: - path: diff --git a/k8s/deployment/templates/istio/service.yaml.tpl b/k8s/deployment/templates/istio/service.yaml.tpl index 051579e4..5a055581 100644 --- a/k8s/deployment/templates/istio/service.yaml.tpl +++ b/k8s/deployment/templates/istio/service.yaml.tpl @@ -41,7 +41,7 @@ metadata: spec: ports: - protocol: TCP - port: 8080 + port: {{ .main_http_port }} targetPort: 80 selector: nullplatform: "true" diff --git a/k8s/deployment/templates/service.yaml.tpl b/k8s/deployment/templates/service.yaml.tpl 
index a9299fb3..4c7c3af1 100644 --- a/k8s/deployment/templates/service.yaml.tpl +++ b/k8s/deployment/templates/service.yaml.tpl @@ -57,7 +57,7 @@ metadata: spec: ports: - protocol: TCP - port: 8080 + port: {{ .main_http_port }} targetPort: 80 selector: nullplatform: "true" diff --git a/k8s/deployment/tests/build_context.bats b/k8s/deployment/tests/build_context.bats index ce8aa579..dfd57700 100644 --- a/k8s/deployment/tests/build_context.bats +++ b/k8s/deployment/tests/build_context.bats @@ -800,3 +800,31 @@ SCRIPT assert_equal "$grpc_exists" "true" assert_equal "$http_exists" "false" } + +# ============================================================================= +# main_http_port extraction tests (CLIEN-739) +# ============================================================================= + +@test "main_http_port: defaults to 8080 when capability missing" { + CONTEXT='{"scope":{"capabilities":{}}}' + result=$(echo "$CONTEXT" | jq -r '.scope.capabilities.main_http_port // 8080') + assert_equal "$result" "8080" +} + +@test "main_http_port: defaults to 8080 when capability is null" { + CONTEXT='{"scope":{"capabilities":{"main_http_port":null}}}' + result=$(echo "$CONTEXT" | jq -r '.scope.capabilities.main_http_port // 8080') + assert_equal "$result" "8080" +} + +@test "main_http_port: respects explicit value when set" { + CONTEXT='{"scope":{"capabilities":{"main_http_port":9090}}}' + result=$(echo "$CONTEXT" | jq -r '.scope.capabilities.main_http_port // 8080') + assert_equal "$result" "9090" +} + +@test "main_http_port: jq cast to number preserves integer type for templates" { + MAIN_HTTP_PORT="9090" + result=$(echo '{}' | jq --arg main_http_port "$MAIN_HTTP_PORT" '. 
+ {main_http_port: ($main_http_port | tonumber)} | .main_http_port') + assert_equal "$result" "9090" +} diff --git a/k8s/docs/configurable-http-ports.md b/k8s/docs/configurable-http-ports.md new file mode 100644 index 00000000..699092a3 --- /dev/null +++ b/k8s/docs/configurable-http-ports.md @@ -0,0 +1,59 @@ +# Configurable HTTP Ports + +The k8s scope supports configuring the port on which the application's main HTTP listener binds, and exposing additional HTTP ports as siblings of the main listener. + +## Capabilities + +### `main_http_port` + +- **Type:** integer +- **Default:** `8080` +- **Range:** 1024 – 65535 +- **Required:** yes (with default — the form pre-fills 8080) + +The port your application binds to inside the container. When set, the following are derived from it automatically: + +| Resource | Field | Value | +|---|---|---| +| `Deployment` (application container) | `containerPort` | `main_http_port` | +| `Deployment` (application container) | livenessProbe / readinessProbe / startupProbe port | `main_http_port` | +| `Deployment` (http traffic-manager sidecar) | `UPSTREAM_PORT` env | `main_http_port` | +| `Deployment` (http traffic-manager sidecar) | TCP probe `app_port` | `main_http_port` | +| `Service` | `port` (cluster-public) | `main_http_port` | +| `Ingress` (initial and blue-green) | backend service port | `main_http_port` | +| Istio `Service` and `HTTPRoute` | port | `main_http_port` | + +`Service.targetPort` stays `80` because that is the sidecar's port, not the app's. + +### `additional_ports[].type = "HTTP"` + +`additional_ports` is a list of extra ports the scope exposes alongside the main HTTP listener. Each item has: + +- `port`: integer 1024–65535 +- `type`: `"GRPC"` or `"HTTP"` + +For `HTTP` ports, the deployment generates: + +- A traffic-manager sidecar named `http-{port}` listening on `{port}` and proxying to the application on the same `{port}`. +- A `Service` named `d-{scope_id}-{deployment_id}-http-{port}` exposing `{port}`. 
+- An `Ingress` for the additional HTTP listener. + +For `GRPC` ports, the existing gRPC sidecar pattern is unchanged. + +## Backward Compatibility + +- Existing scopes that do not set `main_http_port` get `8080` automatically via the JSON Schema default and the `// 8080` jq fallback in `build_context`. No migration is required. +- The `traffic-manager` image's `start.sh` defaults `UPSTREAM_PORT` to `8080` when the env is not provided, so an upgraded image with un-upgraded scope templates continues to behave like the old image. +- Adding `HTTP` to the `additional_ports.type` enum is strictly additive — existing entries with `"GRPC"` remain valid. + +## Implementation Map + +- JSON Schema and UI Schema: `k8s/specs/service-spec.json.tpl` +- Build context extraction: `k8s/deployment/build_context` (look for `MAIN_HTTP_PORT`) +- Templates that consume `main_http_port`: `k8s/deployment/templates/{service,deployment,initial-ingress,blue-green-ingress}.yaml.tpl` and `k8s/deployment/templates/istio/*.tpl` +- HTTP additional_ports sidecar: `k8s/deployment/templates/deployment.yaml.tpl` (look for `else if eq .type "HTTP"`) +- traffic-manager image: `nullplatform/k8s-tools/traffic-manager` — `UPSTREAM_PORT` env handled in `start.sh` + +## Tests + +- `k8s/deployment/tests/build_context.bats` covers `main_http_port` extraction with present, absent, and `null` cases, and verifies the `tonumber` cast. 
diff --git a/k8s/specs/service-spec.json.tpl b/k8s/specs/service-spec.json.tpl index f331df10..9da67107 100644 --- a/k8s/specs/service-spec.json.tpl +++ b/k8s/specs/service-spec.json.tpl @@ -13,6 +13,7 @@ "fixed_instances", "scheduled_stop", "additional_ports", + "main_http_port", "protocol", "continuous_delivery" ], @@ -125,8 +126,13 @@ }, { "type":"Category", - "label":"Additional Ports", + "label":"Exposed Ports", "elements":[ + { + "type":"Control", + "label":"Main HTTP Port", + "scope":"#/properties/main_http_port" + }, { "type":"Control", "scope":"#/properties/additional_ports", @@ -537,6 +543,14 @@ "minimum":1, "description":"Fixed number of instances to run" }, + "main_http_port":{ + "type":"integer", + "title":"Main HTTP Port", + "default":8080, + "minimum":1024, + "maximum":65535, + "description":"Port where your application's main HTTP listener binds. Default 8080." + }, "additional_ports":{ "type":"array", "items":{ @@ -555,7 +569,8 @@ }, "type":{ "enum":[ - "GRPC" + "GRPC", + "HTTP" ], "type":"string", "title":"Port Type", From 7bd2a5732bfde99887a781551608114164f906be Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 6 May 2026 00:59:11 -0300 Subject: [PATCH 20/56] Additional port default is http --- k8s/specs/service-spec.json.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/specs/service-spec.json.tpl b/k8s/specs/service-spec.json.tpl index 9da67107..656e641d 100644 --- a/k8s/specs/service-spec.json.tpl +++ b/k8s/specs/service-spec.json.tpl @@ -574,7 +574,7 @@ ], "type":"string", "title":"Port Type", - "default": "GRPC", + "default": "HTTP", "description":"The protocol type for this port" } } From 420c39cb9c7b9ad8b267f9170298a6158a9737a1 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 6 May 2026 01:06:09 -0300 Subject: [PATCH 21/56] fix(k8s/templates): use root context for k8s_modifiers in HTTP additional_port service --- k8s/deployment/templates/service.yaml.tpl | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/k8s/deployment/templates/service.yaml.tpl b/k8s/deployment/templates/service.yaml.tpl index 4c7c3af1..ec3bbb99 100644 --- a/k8s/deployment/templates/service.yaml.tpl +++ b/k8s/deployment/templates/service.yaml.tpl @@ -101,14 +101,14 @@ metadata: scope: {{ $.scope.slug }} scope_id: "{{ $.scope.id }}" deployment_id: "{{ $.deployment.id }}" -{{- $global := index .k8s_modifiers "global" }} +{{- $global := index $.k8s_modifiers "global" }} {{- if $global }} {{- $labels := index $global "labels" }} {{- if $labels }} {{ data.ToYAML $labels | indent 4 }} {{- end }} {{- end }} -{{- $service := index .k8s_modifiers "service" }} +{{- $service := index $.k8s_modifiers "service" }} {{- if $service }} {{- $labels := index $service "labels" }} {{- if $labels }} @@ -124,14 +124,14 @@ metadata: alb.ingress.kubernetes.io/success-codes: 200-299 alb.ingress.kubernetes.io/unhealthy-threshold-count: '3' alb.ingress.kubernetes.io/backend-protocol: HTTP -{{- $global := index .k8s_modifiers "global" }} +{{- $global := index $.k8s_modifiers "global" }} {{- if $global }} {{- $annotations := index $global "annotations" }} {{- if $annotations }} {{ data.ToYAML $annotations | indent 4 }} {{- end }} {{- end }} -{{- $service := index .k8s_modifiers "service" }} +{{- $service := index $.k8s_modifiers "service" }} {{- if $service }} {{- $annotations := index $service "annotations" }} {{- if $annotations }} From 080827f880b646fbb01b3ce8db8423f3bf5c1598 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 6 May 2026 11:53:56 -0300 Subject: [PATCH 22/56] fix(k8s/templates): remove HTTP sidecar; HTTP additional ports are app-bound directly --- k8s/deployment/templates/deployment.yaml.tpl | 59 +------------------- k8s/docs/configurable-http-ports.md | 13 ++++- 2 files changed, 12 insertions(+), 60 deletions(-) diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index c13d6cb3..0d58812a 100644 --- 
a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -231,63 +231,6 @@ spec: terminationMessagePath: /dev/termination-log terminationMessagePolicy: File imagePullPolicy: Always - {{ else if eq .type "HTTP" }} - - name: http-{{ .port }} - securityContext: - runAsUser: 0 - image: {{ $.traffic_image }} - ports: - - containerPort: {{ .port }} - protocol: TCP - env: - - name: UPSTREAM_PORT - value: '{{ .port }}' - - name: HEALTH_CHECK_TYPE - value: http - - name: GRACE_PERIOD - value: '15' - - name: LISTENER_PROTOCOL - value: http - - name: LISTENER_PORT - value: '{{ .port }}' - - name: HEALTH_CHECK_PATH - value: {{ $.scope.capabilities.health_check.path }} - resources: - limits: - cpu: {{ $.container_cpu_in_millicores }}m - memory: {{ $.container_memory_in_memory }}Mi - requests: - cpu: 31m - livenessProbe: - httpGet: - path: {{ $.scope.capabilities.health_check.path }} - port: {{ .port }} - timeoutSeconds: 5 - periodSeconds: 10 - initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} - successThreshold: 1 - failureThreshold: 9 - readinessProbe: - httpGet: - path: {{ $.scope.capabilities.health_check.path }} - port: {{ .port }} - timeoutSeconds: 5 - periodSeconds: 10 - initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} - successThreshold: 1 - failureThreshold: 3 - startupProbe: - httpGet: - path: {{ $.scope.capabilities.health_check.path }} - port: {{ .port }} - timeoutSeconds: 5 - periodSeconds: 10 - initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} - successThreshold: 1 - failureThreshold: 90 - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - imagePullPolicy: Always {{ end }} {{ end }} {{ end }} @@ -304,10 +247,12 @@ spec: protocol: TCP {{ if .scope.capabilities.additional_ports }} {{ range .scope.capabilities.additional_ports }} + {{ if eq .type "HTTP" }} - containerPort: {{ .port }} protocol: TCP 
{{ end }} {{ end }} + {{ end }} resources: limits: cpu: {{ .scope.capabilities.cpu_millicores }}m diff --git a/k8s/docs/configurable-http-ports.md b/k8s/docs/configurable-http-ports.md index 699092a3..4db4baa8 100644 --- a/k8s/docs/configurable-http-ports.md +++ b/k8s/docs/configurable-http-ports.md @@ -34,11 +34,18 @@ The port your application binds to inside the container. When set, the following For `HTTP` ports, the deployment generates: -- A traffic-manager sidecar named `http-{port}` listening on `{port}` and proxying to the application on the same `{port}`. -- A `Service` named `d-{scope_id}-{deployment_id}-http-{port}` exposing `{port}`. +- A `containerPort: {port}` declaration on the application container — **the application is expected to bind this port directly**. No sidecar is involved. +- A `Service` named `d-{scope_id}-{deployment_id}-http-{port}` with `targetPort: {port}` that routes external traffic to the application's port. - An `Ingress` for the additional HTTP listener. -For `GRPC` ports, the existing gRPC sidecar pattern is unchanged. +For `GRPC` ports, the existing gRPC sidecar pattern is unchanged: a `grpc-{port}` traffic-manager sidecar terminates gRPC on `{port}` and proxies HTTP to the application's `main_http_port`. The application does NOT bind gRPC additional ports — the sidecar does — which is why the protocol distinction matters. 
+ +| | HTTP additional port | GRPC additional port | +|---|---|---| +| App binds the port | yes | no (sidecar binds it) | +| Sidecar created | no | yes (`grpc-{port}` traffic-manager) | +| Service `targetPort` | `{port}` (the app) | `{port}` (the sidecar) | +| Protocol translation | none | gRPC → HTTP to app on `main_http_port` | ## Backward Compatibility From 492841aa5fba1b75fad083a21af127eb47288f13 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 6 May 2026 13:59:34 -0300 Subject: [PATCH 23/56] fix(k8s/templates): restore HTTP additional-port sidecar with UPSTREAM_PORT pointing to main_http_port --- k8s/deployment/templates/deployment.yaml.tpl | 65 +++++++++++++++++--- k8s/docs/configurable-http-ports.md | 23 ++++--- 2 files changed, 71 insertions(+), 17 deletions(-) diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index 0d58812a..660e899f 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -231,6 +231,63 @@ spec: terminationMessagePath: /dev/termination-log terminationMessagePolicy: File imagePullPolicy: Always + {{ else if eq .type "HTTP" }} + - name: http-{{ .port }} + securityContext: + runAsUser: 0 + image: {{ $.traffic_image }} + ports: + - containerPort: {{ .port }} + protocol: TCP + env: + - name: UPSTREAM_PORT + value: '{{ $.main_http_port }}' + - name: HEALTH_CHECK_TYPE + value: http + - name: GRACE_PERIOD + value: '15' + - name: LISTENER_PROTOCOL + value: http + - name: LISTENER_PORT + value: '{{ .port }}' + - name: HEALTH_CHECK_PATH + value: {{ $.scope.capabilities.health_check.path }} + resources: + limits: + cpu: {{ $.container_cpu_in_millicores }}m + memory: {{ $.container_memory_in_memory }}Mi + requests: + cpu: 31m + livenessProbe: + httpGet: + path: {{ $.scope.capabilities.health_check.path }} + port: {{ .port }} + timeoutSeconds: 5 + periodSeconds: 10 + initialDelaySeconds: {{ 
$.scope.capabilities.health_check.initial_delay_seconds }} + successThreshold: 1 + failureThreshold: 9 + readinessProbe: + httpGet: + path: {{ $.scope.capabilities.health_check.path }} + port: {{ .port }} + timeoutSeconds: 5 + periodSeconds: 10 + initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} + successThreshold: 1 + failureThreshold: 3 + startupProbe: + httpGet: + path: {{ $.scope.capabilities.health_check.path }} + port: {{ .port }} + timeoutSeconds: 5 + periodSeconds: 10 + initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} + successThreshold: 1 + failureThreshold: 90 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + imagePullPolicy: Always {{ end }} {{ end }} {{ end }} @@ -245,14 +302,6 @@ spec: ports: - containerPort: {{ .main_http_port }} protocol: TCP - {{ if .scope.capabilities.additional_ports }} - {{ range .scope.capabilities.additional_ports }} - {{ if eq .type "HTTP" }} - - containerPort: {{ .port }} - protocol: TCP - {{ end }} - {{ end }} - {{ end }} resources: limits: cpu: {{ .scope.capabilities.cpu_millicores }}m diff --git a/k8s/docs/configurable-http-ports.md b/k8s/docs/configurable-http-ports.md index 4db4baa8..6b9a80f5 100644 --- a/k8s/docs/configurable-http-ports.md +++ b/k8s/docs/configurable-http-ports.md @@ -32,20 +32,25 @@ The port your application binds to inside the container. When set, the following - `port`: integer 1024–65535 - `type`: `"GRPC"` or `"HTTP"` -For `HTTP` ports, the deployment generates: +For both `HTTP` and `GRPC` additional ports, the deployment generates: -- A `containerPort: {port}` declaration on the application container — **the application is expected to bind this port directly**. No sidecar is involved. -- A `Service` named `d-{scope_id}-{deployment_id}-http-{port}` with `targetPort: {port}` that routes external traffic to the application's port. -- An `Ingress` for the additional HTTP listener. 
+- A traffic-manager sidecar that binds the additional port externally and proxies traffic to the application on its `main_http_port`. The container is named `http-{port}` for HTTP and `grpc-{port}` for GRPC. +- A `Service` named `d-{scope_id}-{deployment_id}-{http|grpc}-{port}` with `targetPort: {port}` that routes external traffic to the sidecar. +- An `Ingress` for the additional listener. -For `GRPC` ports, the existing gRPC sidecar pattern is unchanged: a `grpc-{port}` traffic-manager sidecar terminates gRPC on `{port}` and proxies HTTP to the application's `main_http_port`. The application does NOT bind gRPC additional ports — the sidecar does — which is why the protocol distinction matters. +**Important contract:** the application **must NOT bind additional ports** itself. The application binds only `main_http_port`. The sidecar at `{port}` proxies all traffic to `localhost:main_http_port`, where the application serves requests. This is identical to the existing gRPC pattern, just extended to HTTP. + +The sidecar is not a no-op pass-through — it provides nginx-level metrics, graceful-shutdown handling, body-size limits, and protocol translation (for gRPC). Removing it would lose those features. 
| | HTTP additional port | GRPC additional port | |---|---|---| -| App binds the port | yes | no (sidecar binds it) | -| Sidecar created | no | yes (`grpc-{port}` traffic-manager) | -| Service `targetPort` | `{port}` (the app) | `{port}` (the sidecar) | -| Protocol translation | none | gRPC → HTTP to app on `main_http_port` | +| App binds the port | no (sidecar binds it) | no (sidecar binds it) | +| Sidecar created | yes (`http-{port}` traffic-manager) | yes (`grpc-{port}` traffic-manager) | +| Service `targetPort` | `{port}` (the sidecar) | `{port}` (the sidecar) | +| Sidecar `UPSTREAM_PORT` | `main_http_port` | `main_http_port` (default in image) | +| Protocol translation | none (HTTP→HTTP) | gRPC → HTTP | + +If your application code currently binds an additional port directly (e.g., `app.listen(9090)`), remove that listener — nullplatform's sidecar handles the external binding. Your app will receive requests for the additional port on its `main_http_port` listener. ## Backward Compatibility From 39d4856a369ceb5f199bf271d68eb44f0aadd940 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 8 May 2026 00:03:34 -0300 Subject: [PATCH 24/56] fix(k8s/templates): HTTP sidecar binds port+10000 so app can bind port directly --- k8s/deployment/templates/deployment.yaml.tpl | 20 +++++--- k8s/deployment/templates/service.yaml.tpl | 2 +- k8s/docs/configurable-http-ports.md | 49 ++++++++++++++------ 3 files changed, 51 insertions(+), 20 deletions(-) diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index 660e899f..5e1f6368 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -237,11 +237,11 @@ spec: runAsUser: 0 image: {{ $.traffic_image }} ports: - - containerPort: {{ .port }} + - containerPort: {{ add .port 10000 }} protocol: TCP env: - name: UPSTREAM_PORT - value: '{{ $.main_http_port }}' + value: '{{ .port }}' - name: HEALTH_CHECK_TYPE value: http - name: 
GRACE_PERIOD @@ -249,7 +249,7 @@ spec: - name: LISTENER_PROTOCOL value: http - name: LISTENER_PORT - value: '{{ .port }}' + value: '{{ add .port 10000 }}' - name: HEALTH_CHECK_PATH value: {{ $.scope.capabilities.health_check.path }} resources: @@ -261,7 +261,7 @@ spec: livenessProbe: httpGet: path: {{ $.scope.capabilities.health_check.path }} - port: {{ .port }} + port: {{ add .port 10000 }} timeoutSeconds: 5 periodSeconds: 10 initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} @@ -270,7 +270,7 @@ spec: readinessProbe: httpGet: path: {{ $.scope.capabilities.health_check.path }} - port: {{ .port }} + port: {{ add .port 10000 }} timeoutSeconds: 5 periodSeconds: 10 initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} @@ -279,7 +279,7 @@ spec: startupProbe: httpGet: path: {{ $.scope.capabilities.health_check.path }} - port: {{ .port }} + port: {{ add .port 10000 }} timeoutSeconds: 5 periodSeconds: 10 initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} @@ -302,6 +302,14 @@ spec: ports: - containerPort: {{ .main_http_port }} protocol: TCP + {{ if .scope.capabilities.additional_ports }} + {{ range .scope.capabilities.additional_ports }} + {{ if eq .type "HTTP" }} + - containerPort: {{ .port }} + protocol: TCP + {{ end }} + {{ end }} + {{ end }} resources: limits: cpu: {{ .scope.capabilities.cpu_millicores }}m diff --git a/k8s/deployment/templates/service.yaml.tpl b/k8s/deployment/templates/service.yaml.tpl index ec3bbb99..0bb36b44 100644 --- a/k8s/deployment/templates/service.yaml.tpl +++ b/k8s/deployment/templates/service.yaml.tpl @@ -142,7 +142,7 @@ spec: ports: - protocol: TCP port: {{ .port }} - targetPort: {{ .port }} + targetPort: {{ add .port 10000 }} selector: nullplatform: "true" account: {{ $.account.slug }} diff --git a/k8s/docs/configurable-http-ports.md b/k8s/docs/configurable-http-ports.md index 6b9a80f5..c09b9e98 100644 --- a/k8s/docs/configurable-http-ports.md +++ 
b/k8s/docs/configurable-http-ports.md @@ -32,25 +32,48 @@ The port your application binds to inside the container. When set, the following - `port`: integer 1024–65535 - `type`: `"GRPC"` or `"HTTP"` -For both `HTTP` and `GRPC` additional ports, the deployment generates: +For each additional port (HTTP or GRPC), the deployment generates a traffic-manager sidecar that handles external traffic. The sidecar is **always** in the request path: it adds nginx-level metrics, graceful-shutdown handling, and body-size limits. -- A traffic-manager sidecar that binds the additional port externally and proxies traffic to the application on its `main_http_port`. The container is named `http-{port}` for HTTP and `grpc-{port}` for GRPC. -- A `Service` named `d-{scope_id}-{deployment_id}-{http|grpc}-{port}` with `targetPort: {port}` that routes external traffic to the sidecar. -- An `Ingress` for the additional listener. +The architecture differs slightly between HTTP and GRPC because of how the application is expected to bind ports: -**Important contract:** the application **must NOT bind additional ports** itself. The application binds only `main_http_port`. The sidecar at `{port}` proxies all traffic to `localhost:main_http_port`, where the application serves requests. This is identical to the existing gRPC pattern, just extended to HTTP. +### HTTP additional port — same model as `main_http_port` -The sidecar is not a no-op pass-through — it provides nginx-level metrics, graceful-shutdown handling, body-size limits, and protocol translation (for gRPC). Removing it would lose those features. +The application **binds the additional port directly** (e.g., `app.listen(9090)`), exactly the way it binds `main_http_port`. The sidecar binds a different *internal* port, `port + 10000`, to avoid colliding with the application. K8s `Service` exposes `port` externally and routes to the sidecar's internal port; the sidecar then proxies to the application on `port`.
+ +For example, with `main_http_port=8081` and `additional_port: {port: 9090, type: HTTP}`: + +``` +External client + │ http://service:9090 + ▼ +K8s Service "d-{scope}-{deploy}-http-9090" port: 9090, targetPort: 19090 + │ + ▼ +Sidecar container "http-9090" listens on 19090 → proxies to localhost:9090 + │ + ▼ +Application container binds 9090 (and also 8081 for the main listener) +``` + +The application sees two real listeners: `8081` (main) and `9090` (additional). External traffic to either flows through its respective sidecar (the main `http` sidecar for `8081`, the `http-9090` sidecar for `9090`). + +**Constraint:** because the sidecar uses `port + 10000`, the additional port must be `≤ 55535` for HTTP. Above that the offset overflows the 65535 max TCP port. + +### GRPC additional port — sidecar terminates protocol + +The application does **NOT** bind GRPC additional ports. The `grpc-{port}` sidecar binds `{port}` directly and translates gRPC into HTTP, proxying to `localhost:main_http_port`. The application speaks only HTTP on `main_http_port` and serves both main HTTP traffic and any incoming gRPC requests (received already translated to HTTP). + +### Summary | | HTTP additional port | GRPC additional port | |---|---|---| -| App binds the port | no (sidecar binds it) | no (sidecar binds it) | -| Sidecar created | yes (`http-{port}` traffic-manager) | yes (`grpc-{port}` traffic-manager) | -| Service `targetPort` | `{port}` (the sidecar) | `{port}` (the sidecar) | -| Sidecar `UPSTREAM_PORT` | `main_http_port` | `main_http_port` (default in image) | -| Protocol translation | none (HTTP→HTTP) | gRPC → HTTP | - -If your application code currently binds an additional port directly (e.g., `app.listen(9090)`), remove that listener — nullplatform's sidecar handles the external binding. Your app will receive requests for the additional port on its `main_http_port` listener. 
+| App binds the port | yes, directly | no (sidecar binds it) | +| Sidecar internal port | `port + 10000` | `port` | +| Service `port` (external) | `port` | `port` | +| Service `targetPort` | `port + 10000` (sidecar) | `port` (sidecar) | +| Sidecar `UPSTREAM_PORT` | `port` (the app's same port) | `main_http_port` (default in image) | +| Protocol translation | none | gRPC → HTTP | +| Max valid `port` | 55535 | 65535 | ## Backward Compatibility From 9e6b43da9acffa06232ac569d1c5e903ed59fec8 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 8 May 2026 00:34:16 -0300 Subject: [PATCH 25/56] fix(k8s/verify_ingress): dedupe weights for multi-ingress listeners + tests --- .../tests/verify_ingress_reconciliation.bats | 78 ++++++++++++++++++- k8s/deployment/verify_ingress_reconciliation | 10 ++- 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/k8s/deployment/tests/verify_ingress_reconciliation.bats b/k8s/deployment/tests/verify_ingress_reconciliation.bats index 1e216f96..7371dda6 100644 --- a/k8s/deployment/tests/verify_ingress_reconciliation.bats +++ b/k8s/deployment/tests/verify_ingress_reconciliation.bats @@ -244,7 +244,7 @@ teardown() { assert_contains "$output" "📋 ALB validation enabled: k8s-test-alb for domain app.example.com" assert_contains "$output" "📝 Checking domain: app.example.com" assert_contains "$output" "✅ Found rule for domain: app.example.com" - assert_contains "$output" "❌ Weights mismatch on listener port 443: expected=50/50 actual=20/80" + assert_contains "$output" "❌ Weights mismatch on listener port 443: expected=50 actual=20/80" } @test "verify_ingress_reconciliation: skips weight check on additional port listener when blue has no service" { @@ -291,6 +291,82 @@ teardown() { assert_contains "$output" "✅ ALB configuration validated successfully" } +@test "verify_ingress_reconciliation: passes when multiple rules on same listener share expected weights (CLIEN-739)" { + # Scenario: scope has main + additional HTTP port ingresses 
sharing the ALB listener. + # Both rules match the same host-header and each carries blue/green target groups with + # the same blue-green split (90/10). The pre-dedupe extractor returned "10/10/90/90" + # and falsely failed against expected "10/90". Dedupe makes the comparison correct. + local ctx='{"scope":{"slug":"my-app","domain":"app.example.com","current_active_deployment":"deploy-old"},"alb_name":"k8s-test-alb","deployment":{"strategy":"blue_green","strategy_data":{"desired_switched_traffic":10}}}' + + run bash -c " + kubectl() { + echo '{\"metadata\": {\"resourceVersion\": \"12345\"}}' + return 0 + } + aws() { + case \"\$2\" in + describe-load-balancers) + echo 'arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/test-alb/abc123' + ;; + describe-listeners) + echo '{\"Listeners\":[{\"ListenerArn\":\"arn:aws:listener/443\",\"Port\":443}]}' + ;; + describe-rules) + echo '{\"Rules\":[{\"Conditions\":[{\"Field\":\"host-header\",\"Values\":[\"app.example.com\"]}],\"Actions\":[{\"Type\":\"forward\",\"ForwardConfig\":{\"TargetGroups\":[{\"Weight\":90},{\"Weight\":10}]}}]},{\"Conditions\":[{\"Field\":\"host-header\",\"Values\":[\"app.example.com\"]}],\"Actions\":[{\"Type\":\"forward\",\"ForwardConfig\":{\"TargetGroups\":[{\"Weight\":90},{\"Weight\":10}]}}]}]}' + ;; + esac + return 0 + } + export -f kubectl aws + export K8S_NAMESPACE='$K8S_NAMESPACE' SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export MAX_WAIT_SECONDS='1' CHECK_INTERVAL='1' + export ALB_RECONCILIATION_ENABLED='true' VERIFY_WEIGHTS='true' REGION='$REGION' + export CONTEXT='$ctx' + source '$BATS_TEST_DIRNAME/../verify_ingress_reconciliation' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "✅ Weights match on listener port 443" + assert_contains "$output" "✅ ALB configuration validated successfully" +} + +@test "verify_ingress_reconciliation: detects mismatch when one rule diverges from expected (CLIEN-739)" { + # Scenario: main rule has correct 90/10 split, 
additional port rule has wrong 50/50 split. + # After dedupe the unique values become 10/50/90, which does not match expected 10/90. + # Confirms that dedupe still surfaces real misconfigurations across multiple rules. + local ctx='{"scope":{"slug":"my-app","domain":"app.example.com","current_active_deployment":"deploy-old"},"alb_name":"k8s-test-alb","deployment":{"strategy":"blue_green","strategy_data":{"desired_switched_traffic":10}}}' + + run bash -c " + kubectl() { + echo '{\"metadata\": {\"resourceVersion\": \"12345\"}}' + return 0 + } + aws() { + case \"\$2\" in + describe-load-balancers) + echo 'arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/test-alb/abc123' + ;; + describe-listeners) + echo '{\"Listeners\":[{\"ListenerArn\":\"arn:aws:listener/443\",\"Port\":443}]}' + ;; + describe-rules) + echo '{\"Rules\":[{\"Conditions\":[{\"Field\":\"host-header\",\"Values\":[\"app.example.com\"]}],\"Actions\":[{\"Type\":\"forward\",\"ForwardConfig\":{\"TargetGroups\":[{\"Weight\":90},{\"Weight\":10}]}}]},{\"Conditions\":[{\"Field\":\"host-header\",\"Values\":[\"app.example.com\"]}],\"Actions\":[{\"Type\":\"forward\",\"ForwardConfig\":{\"TargetGroups\":[{\"Weight\":50},{\"Weight\":50}]}}]}]}' + ;; + esac + return 0 + } + export -f kubectl aws + export K8S_NAMESPACE='$K8S_NAMESPACE' SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export MAX_WAIT_SECONDS='1' CHECK_INTERVAL='1' + export ALB_RECONCILIATION_ENABLED='true' VERIFY_WEIGHTS='true' REGION='$REGION' + export CONTEXT='$ctx' + source '$BATS_TEST_DIRNAME/../verify_ingress_reconciliation' + " + + [ "$status" -eq 1 ] + assert_contains "$output" "❌ Weights mismatch on listener port 443: expected=10/90 actual=10/50/90" +} + @test "verify_ingress_reconciliation: detects domain not found in ALB rules" { run bash -c " kubectl() { diff --git a/k8s/deployment/verify_ingress_reconciliation b/k8s/deployment/verify_ingress_reconciliation index ee9f3221..12814400 100644 --- 
a/k8s/deployment/verify_ingress_reconciliation +++ b/k8s/deployment/verify_ingress_reconciliation @@ -156,8 +156,14 @@ validate_alb_config() { GREEN_WEIGHT=$SWITCH_TRAFFIC BLUE_DEPLOYMENT_ID=$(echo "$CONTEXT" | jq -r '.scope.current_active_deployment // empty') + # Dedupe: when a scope has multiple ingresses on the same ALB listener + # (main + additional HTTP ports), the host-header select returns multiple + # rules and the weight extraction concatenates pairs (e.g. 10/10/90/90). + # We compare the sets of unique weights: this avoids false mismatches when + # every rule carries the expected pair, while a diverging rule still surfaces + # whenever it introduces a weight value outside the expected set. if [ -n "$BLUE_DEPLOYMENT_ID" ]; then - EXPECTED_WEIGHTS=$(printf "%s\n%s" "$BLUE_WEIGHT" "$GREEN_WEIGHT" | sort -n) + EXPECTED_WEIGHTS=$(printf "%s\n%s" "$BLUE_WEIGHT" "$GREEN_WEIGHT" | sort -un) else EXPECTED_WEIGHTS="$GREEN_WEIGHT" fi @@ -167,7 +173,7 @@ validate_alb_config() { select(.Type == "forward") | .ForwardConfig.TargetGroups[]?
| "\(.Weight // 1)" - ' 2>/dev/null | sort -n) + ' 2>/dev/null | sort -un) if [ -n "$EXPECTED_WEIGHTS" ] && [ -n "$ACTUAL_WEIGHTS" ]; then if [ "$EXPECTED_WEIGHTS" == "$ACTUAL_WEIGHTS" ]; then From 1a52f3c5140829f443d26be0a6325d528294af2d Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 8 May 2026 11:48:13 -0300 Subject: [PATCH 26/56] feat(k8s/templates): HTTP additional ports listen on dedicated HTTPS port --- .../templates/blue-green-ingress.yaml.tpl | 3 +- .../templates/initial-ingress.yaml.tpl | 2 +- .../tests/ingress_template_shape.bats | 77 +++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 k8s/deployment/tests/ingress_template_shape.bats diff --git a/k8s/deployment/templates/blue-green-ingress.yaml.tpl b/k8s/deployment/templates/blue-green-ingress.yaml.tpl index 18504151..c35f53fe 100644 --- a/k8s/deployment/templates/blue-green-ingress.yaml.tpl +++ b/k8s/deployment/templates/blue-green-ingress.yaml.tpl @@ -145,8 +145,7 @@ metadata: alb.ingress.kubernetes.io/target-node-labels: account={{ $.account.slug }},namespace={{ $.namespace.slug }},application={{ $.application.slug }},account_id={{ $.account.id }},namespace_id={{ $.namespace.id }},application_id={{ $.application.id }},scope={{ $.scope.slug }},scope_id={{ $.scope.id }},nullplatform=true alb.ingress.kubernetes.io/target-type: ip {{ if eq .type "HTTP" }} - alb.ingress.kubernetes.io/listen-ports: '[{"HTTP":80},{"HTTPS":443}]' - alb.ingress.kubernetes.io/ssl-redirect: "443" + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":{{ .port }}}]' {{ else if eq .type "GRPC" }} alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":{{ .port }}}]' alb.ingress.kubernetes.io/backend-protocol-version: GRPC diff --git a/k8s/deployment/templates/initial-ingress.yaml.tpl b/k8s/deployment/templates/initial-ingress.yaml.tpl index d68e7e36..088a1eaf 100644 --- a/k8s/deployment/templates/initial-ingress.yaml.tpl +++ b/k8s/deployment/templates/initial-ingress.yaml.tpl @@ -110,7 +110,7 
@@ metadata: annotations: alb.ingress.kubernetes.io/group.name: {{ $.alb_name }} {{ if eq .type "HTTP" }} - alb.ingress.kubernetes.io/listen-ports: '[{"HTTP":80},{"HTTPS":443}]' + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":{{ .port }}}]' {{ else if eq .type "GRPC" }} alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":{{ .port }}}]' alb.ingress.kubernetes.io/backend-protocol-version: GRPC diff --git a/k8s/deployment/tests/ingress_template_shape.bats b/k8s/deployment/tests/ingress_template_shape.bats new file mode 100644 index 00000000..9f4ea7ed --- /dev/null +++ b/k8s/deployment/tests/ingress_template_shape.bats @@ -0,0 +1,77 @@ +#!/usr/bin/env bats +# ============================================================================= +# Structural tests for the ingress templates. +# Verifies the listen-ports annotation shape per port type without rendering +# templates. Catches regressions like accidentally restoring a hardcoded +# [{"HTTP":80},{"HTTPS":443}] for HTTP additional ports (which would re-shadow +# the main ingress on the same listener). +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + export INITIAL="$PROJECT_ROOT/k8s/deployment/templates/initial-ingress.yaml.tpl" + export BLUE_GREEN="$PROJECT_ROOT/k8s/deployment/templates/blue-green-ingress.yaml.tpl" +} + +# ----------------------------------------------------------------------------- +# Main ingress (the top-level ingress, NOT inside additional_ports loop) +# ----------------------------------------------------------------------------- + +@test "initial-ingress: main ingress listens on HTTP:80 + HTTPS:443" { + # First listen-ports occurrence in the file is the main ingress. 
+ first_listen=$(grep -m 1 "listen-ports" "$INITIAL") + [[ "$first_listen" == *'[{"HTTP":80},{"HTTPS":443}]'* ]] +} + +@test "blue-green-ingress: main ingress listens on HTTP:80 + HTTPS:443 with ssl-redirect" { + first_listen=$(grep -m 1 "listen-ports" "$BLUE_GREEN") + [[ "$first_listen" == *'[{"HTTP":80},{"HTTPS":443}]'* ]] + # ssl-redirect is on the main ingress (only HTTP+HTTPS listeners use it). + grep -q 'ssl-redirect: "443"' "$BLUE_GREEN" +} + +# ----------------------------------------------------------------------------- +# Additional ports — both HTTP and GRPC use HTTPS on their own port +# (CLIEN-739: HTTP additional ports moved from sharing listener 443 to +# opening their own HTTPS listener at .port, matching the GRPC pattern.) +# ----------------------------------------------------------------------------- + +@test "initial-ingress: HTTP additional port branch uses per-port HTTPS listener" { + # Inside the additional_ports loop, the HTTP branch must use [{"HTTPS":{{ .port }}}]. + # The string '[{"HTTPS":{{ .port }}}]' must appear in the file. The string + # '"HTTP":80' must NOT appear inside the additional_ports range — only on + # the main ingress (which is outside the range). + grep -F '[{"HTTPS":{{ .port }}}]' "$INITIAL" | head -1 >/dev/null + # Sanity: there should be exactly two occurrences of [{"HTTPS":{{ .port }}}] + # (one for HTTP branch, one for GRPC branch). + count=$(grep -cF '[{"HTTPS":{{ .port }}}]' "$INITIAL") + [ "$count" -eq 2 ] + # Sanity: there should be exactly one occurrence of [{"HTTP":80},{"HTTPS":443}] + # (the main ingress only — additional ports must not use it). 
+ shared_count=$(grep -cF '[{"HTTP":80},{"HTTPS":443}]' "$INITIAL") + [ "$shared_count" -eq 1 ] +} + +@test "initial-ingress: GRPC additional port uses backend-protocol-version GRPC" { + grep -q 'backend-protocol-version: GRPC' "$INITIAL" +} + +@test "blue-green-ingress: HTTP additional port branch uses per-port HTTPS listener" { + count=$(grep -cF '[{"HTTPS":{{ .port }}}]' "$BLUE_GREEN") + [ "$count" -eq 2 ] + shared_count=$(grep -cF '[{"HTTP":80},{"HTTPS":443}]' "$BLUE_GREEN") + [ "$shared_count" -eq 1 ] +} + +@test "blue-green-ingress: ssl-redirect only present on main ingress (one occurrence)" { + # ssl-redirect: "443" only makes sense when the listener has both HTTP and HTTPS, + # which is the main ingress. Additional HTTP ports use HTTPS-only listeners, + # so they must not carry ssl-redirect. + count=$(grep -cF 'ssl-redirect: "443"' "$BLUE_GREEN") + [ "$count" -eq 1 ] +} + +@test "blue-green-ingress: GRPC additional port uses backend-protocol-version GRPC" { + grep -q 'backend-protocol-version: GRPC' "$BLUE_GREEN" +} From 604a1feff98e19e1c1581d7a18bd38f9b0db7701 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 8 May 2026 11:54:46 -0300 Subject: [PATCH 27/56] Fix tests --- k8s/kubectl_get | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/kubectl_get b/k8s/kubectl_get index 7f55e687..8a05f061 100755 --- a/k8s/kubectl_get +++ b/k8s/kubectl_get @@ -72,7 +72,7 @@ is_blocked() { involves_secrets() { local arg lower token res for arg in "$@"; do - lower="${arg,,}" + lower=$(echo "$arg" | tr '[:upper:]' '[:lower:]') local IFS=, for token in $lower; do res="${token%%/*}" From 1e13f95be6c34f4200be5623fc0eaeddfbfe8256 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Mon, 11 May 2026 11:54:02 -0300 Subject: [PATCH 28/56] docs: document ALB listener lifecycle and 50-listener capacity limit --- .../validate_alb_target_group_capacity.bats | 232 +++++++++++++++++- .../validate_alb_target_group_capacity | 72 ++++++ 
k8s/docs/configurable-http-ports.md | 41 ++++ k8s/values.yaml | 1 + 4 files changed, 345 insertions(+), 1 deletion(-) diff --git a/k8s/deployment/tests/validate_alb_target_group_capacity.bats b/k8s/deployment/tests/validate_alb_target_group_capacity.bats index 08d1f28c..3ecd2e89 100644 --- a/k8s/deployment/tests/validate_alb_target_group_capacity.bats +++ b/k8s/deployment/tests/validate_alb_target_group_capacity.bats @@ -15,6 +15,7 @@ setup() { export ALB_NAME="k8s-nullplatform-internet-facing" export REGION="us-east-1" export ALB_MAX_TARGET_GROUPS="98" + export ALB_MAX_LISTENERS="48" export DNS_TYPE="route53" # Base CONTEXT @@ -22,7 +23,7 @@ setup() { "providers": {} }' - # Mock aws - default: ALB with 40 target groups + # Mock aws - default: ALB with 40 target groups and 10 listeners aws() { case "$*" in *"describe-load-balancers"*) @@ -33,6 +34,10 @@ setup() { echo "40" return 0 ;; + *"describe-listeners"*) + echo "10" + return 0 + ;; esac } export -f aws @@ -258,6 +263,10 @@ teardown() { echo "0" return 0 ;; + *"describe-listeners"*) + echo "10" + return 0 + ;; esac } export -f aws @@ -280,6 +289,10 @@ teardown() { echo "97" return 0 ;; + *"describe-listeners"*) + echo "10" + return 0 + ;; esac } export -f aws @@ -382,3 +395,220 @@ teardown() { assert_equal "$status" "0" assert_contains "$output" "🔍 Validating ALB target group capacity for 'k8s-nullplatform-internet-facing'..." 
} + +# ============================================================================= +# Listener capacity (CLIEN-739) +# ============================================================================= +@test "validate_alb_target_group_capacity: success message includes listener capacity" { + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "0" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 10 listeners (max: 48)" + assert_contains "$output" "✅ ALB listener capacity validated: 10/48" +} + +@test "validate_alb_target_group_capacity: fails when listener count is at capacity" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "48" + return 0 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached listener capacity: 48/48" + assert_contains "$output" "💡 Possible causes:" + assert_contains "$output" "Too many scopes with additional_ports are attached to this ALB" + assert_contains "$output" "🔧 How to fix:" + assert_contains "$output" "Reduce additional_ports across scopes sharing this ALB" + assert_contains "$output" "Increase ALB_MAX_LISTENERS in values.yaml or scope-configurations provider (AWS limit is 50)" + assert_contains "$output" "Request an AWS service quota increase for listeners per ALB" +} + +@test "validate_alb_target_group_capacity: fails when listener count is over capacity" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "50" + return 0 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + 
+ assert_equal "$status" "1" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached listener capacity: 50/48" +} + +@test "validate_alb_target_group_capacity: passes at exactly one below listener capacity" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "47" + return 0 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "0" + assert_contains "$output" "✅ ALB listener capacity validated: 47/48" +} + +@test "validate_alb_target_group_capacity: handles zero listeners" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "0" + return 0 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "0" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 0 listeners (max: 48)" + assert_contains "$output" "✅ ALB listener capacity validated: 0/48" +} + +@test "validate_alb_target_group_capacity: uses default ALB_MAX_LISTENERS of 48" { + unset ALB_MAX_LISTENERS + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "0" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 10 listeners (max: 48)" +} + +@test "validate_alb_target_group_capacity: ALB_MAX_LISTENERS from env var" { + export ALB_MAX_LISTENERS="5" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached listener capacity: 10/5" +} + +@test "validate_alb_target_group_capacity: ALB_MAX_LISTENERS from scope-configurations provider" { + export 
CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_listeners":"5"}}}}' + export ALB_MAX_LISTENERS="48" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached listener capacity: 10/5" +} + +@test "validate_alb_target_group_capacity: ALB_MAX_LISTENERS from container-orchestration provider" { + export CONTEXT='{"providers":{"container-orchestration":{"balancer":{"alb_max_listeners":"5"}}}}' + export ALB_MAX_LISTENERS="48" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached listener capacity: 10/5" +} + +@test "validate_alb_target_group_capacity: fails when describe-listeners fails" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "Access Denied" >&2 + return 1 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ Failed to describe listeners for ALB 'k8s-nullplatform-internet-facing'" + assert_contains "$output" "Check IAM permissions for elbv2:DescribeListeners" +} + +@test "validate_alb_target_group_capacity: fails when listener count is non-numeric" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "WARNING: unexpected" + return 0 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ Unexpected non-numeric listener count from ALB" + assert_contains "$output" "📋 Received value: WARNING: unexpected" +} + +@test 
"validate_alb_target_group_capacity: fails when ALB_MAX_LISTENERS is non-numeric" { + export ALB_MAX_LISTENERS="abc" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB_MAX_LISTENERS must be a numeric value, got: 'abc'" +} diff --git a/k8s/deployment/validate_alb_target_group_capacity b/k8s/deployment/validate_alb_target_group_capacity index 9b3fc8de..71d01d9e 100755 --- a/k8s/deployment/validate_alb_target_group_capacity +++ b/k8s/deployment/validate_alb_target_group_capacity @@ -112,3 +112,75 @@ if [[ "$TARGET_GROUP_COUNT" -ge "$ALB_MAX_TARGET_GROUPS" ]]; then fi log info "✅ ALB target group capacity validated: $TARGET_GROUP_COUNT/$ALB_MAX_TARGET_GROUPS" + +# Listener capacity validation (CLIEN-739): each additional_port HTTP/GRPC +# opens its own ALB listener, so this constraint can hit before the target +# group quota in scopes with many additional ports. +ALB_MAX_LISTENERS=$(get_config_value \ + --env ALB_MAX_LISTENERS \ + --provider '.providers["scope-configurations"].networking.alb_max_listeners' \ + --provider '.providers["container-orchestration"].balancer.alb_max_listeners' \ + --default "48" +) + +if ! 
[[ "$ALB_MAX_LISTENERS" =~ ^[0-9]+$ ]]; then + log error "❌ ALB_MAX_LISTENERS must be a numeric value, got: '$ALB_MAX_LISTENERS'" + log error "" + log error "🔧 How to fix:" + log error " • Set a numeric value in values.yaml or scope-configurations provider" + log error "" + exit 1 +fi + +log debug "📋 ALB: $ALB_NAME | Max listeners: $ALB_MAX_LISTENERS" + +LISTENER_COUNT=$(aws elbv2 describe-listeners \ + --load-balancer-arn "$ALB_ARN" \ + --region "$REGION" \ + --query 'length(Listeners)' \ + --output text \ + --no-paginate 2>&1) || { + log error "❌ Failed to describe listeners for ALB '$ALB_NAME'" + log error "" + log error "💡 Possible causes:" + log error " The agent may lack permissions to describe listeners" + log error "" + log error "🔧 How to fix:" + log error " • Check IAM permissions for elbv2:DescribeListeners" + log error "" + exit 1 +} + +if ! [[ "$LISTENER_COUNT" =~ ^[0-9]+$ ]]; then + log error "❌ Unexpected non-numeric listener count from ALB" + log error "📋 ALB ARN: $ALB_ARN" + log error "📋 Received value: $LISTENER_COUNT" + log error "" + log error "💡 Possible causes:" + log error " The AWS CLI returned an unexpected response format" + log error "" + log error "🔧 How to fix:" + log error " • Verify AWS CLI version and credentials are correct" + log error " • Run manually: aws elbv2 describe-listeners --load-balancer-arn $ALB_ARN --region $REGION --query 'length(Listeners)'" + log error "" + exit 1 +fi + +log info "📋 ALB '$ALB_NAME' has $LISTENER_COUNT listeners (max: $ALB_MAX_LISTENERS)" + +if [[ "$LISTENER_COUNT" -ge "$ALB_MAX_LISTENERS" ]]; then + log error "❌ ALB '$ALB_NAME' has reached listener capacity: $LISTENER_COUNT/$ALB_MAX_LISTENERS" + log error "" + log error "💡 Possible causes:" + log error " Too many scopes with additional_ports are attached to this ALB. Each HTTP/GRPC additional port opens its own listener." 
+ log error "" + log error "🔧 How to fix:" + log error " • Reduce additional_ports across scopes sharing this ALB" + log error " • Increase ALB_MAX_LISTENERS in values.yaml or scope-configurations provider (AWS limit is 50)" + log error " • Request an AWS service quota increase for listeners per ALB" + log error " • Consider using a separate ALB for additional scopes" + log error "" + exit 1 +fi + +log info "✅ ALB listener capacity validated: $LISTENER_COUNT/$ALB_MAX_LISTENERS" diff --git a/k8s/docs/configurable-http-ports.md b/k8s/docs/configurable-http-ports.md index c09b9e98..a3a0b2f8 100644 --- a/k8s/docs/configurable-http-ports.md +++ b/k8s/docs/configurable-http-ports.md @@ -75,6 +75,44 @@ The application does **NOT** bind GRPC additional ports. The `grpc-{port}` sidec | Protocol translation | none | gRPC → HTTP | | Max valid `port` | 55535 | 65535 | +## ALB capacity and listener lifecycle + +### Each additional port opens its own ALB listener + +The Ingress generated for each additional port (HTTP or GRPC) declares `alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":{port}}]'`. This means **every additional port translates into a dedicated listener on the shared ALB** (`spec.ports[].port == {scope additional port}`). The main scope ingress keeps its standard `[{"HTTP":80},{"HTTPS":443}]` listener pair. 
+ +Concrete example for an ALB shared by three scopes, each with `main_http_port=8081` plus one HTTP additional port `9090`, `9091`, and `9092` respectively: + +| ALB listener | Source ingress | Backend | +|---|---|---| +| `:80` | All scopes (main) | Main sidecar `http` | +| `:443` | All scopes (main) | Main sidecar `http` | +| `:9090` | scope A `http-9090` ingress | Sidecar `http-9090` of scope A | +| `:9091` | scope B `http-9091` ingress | Sidecar `http-9091` of scope B | +| `:9092` | scope C `http-9092` ingress | Sidecar `http-9092` of scope C | + +The main listeners (80/443) are shared across all scopes via the IngressGroup; one listener serves many ingress rules (one per scope host). Additional ports are NOT shared by default — each port is a separate listener. + +### AWS limit: 50 listeners per ALB + +This is the default AWS quota per ALB. With many scopes using additional ports on the same ALB, the listener count climbs quickly: each scope adds 1 listener per HTTP/GRPC additional port. A pre-flight check in `k8s/deployment/validate_alb_target_group_capacity` rejects deployments when the ALB's current listener count has reached `ALB_MAX_LISTENERS` (default `48`, leaves 2 slots of headroom before the AWS limit). The threshold is configurable in `values.yaml` or via the `scope-configurations`/`container-orchestration` provider. + +If a deployment fails with `❌ ALB 'NAME' has reached listener capacity: X/48`, the operator options are: +- Reduce `additional_ports` across the scopes sharing the ALB +- Increase `ALB_MAX_LISTENERS` (only safe up to 49 — at 50 the next deploy will hit the AWS quota itself) +- Request an AWS service-quota increase for listeners per ALB (the limit is technically adjustable, although AWS tends to deny large increases) +- Move some scopes to a separate ALB (the recommended path) + +### Listeners are cleaned up automatically + +Operators do not need to manage ALB listeners by hand.
The AWS Load Balancer Controller owns listener lifecycle through the IngressGroup mechanism: + +- When the **first** Ingress with `alb.ingress.kubernetes.io/listen-ports` referencing a given port is created, the controller adds that listener to the shared ALB. +- When the **last** Ingress referencing that port is deleted, the controller removes the listener. +- In between, multiple Ingresses on the same port coexist as different rules on a single listener; the controller never duplicates the listener itself. + +This means deleting a deployment (which deletes its Ingresses) is sufficient to reclaim listener capacity — no manual cleanup of the ALB is required. If a scope is the only consumer of a particular additional port across the ALB, deleting that scope returns the listener to the pool and frees an `ALB_MAX_LISTENERS` slot for the next deployment. + ## Backward Compatibility - Existing scopes that do not set `main_http_port` get `8080` automatically via the JSON Schema default and the `// 8080` jq fallback in `build_context`. No migration is required. @@ -92,3 +130,6 @@ The application does **NOT** bind GRPC additional ports. The `grpc-{port}` sidec ## Tests - `k8s/deployment/tests/build_context.bats` covers `main_http_port` extraction with present, absent, and `null` cases, and verifies the `tonumber` cast. +- `k8s/deployment/tests/ingress_template_shape.bats` verifies the per-port HTTPS listener annotation on each ingress branch and pins the absence of `ssl-redirect` on additional-port ingresses. +- `k8s/deployment/tests/verify_ingress_reconciliation.bats` covers the weight-dedupe behavior introduced because a shared ALB listener used to surface multiple matching rules (the multi-rule scenario is no longer reachable now that each additional port has its own listener, but the dedupe is kept defensively). +- `k8s/deployment/tests/validate_alb_target_group_capacity.bats` covers both target-group capacity and the listener-capacity validation (`ALB_MAX_LISTENERS`). 
diff --git a/k8s/values.yaml b/k8s/values.yaml index 020b6059..ec9042fa 100644 --- a/k8s/values.yaml +++ b/k8s/values.yaml @@ -12,6 +12,7 @@ configuration: ALB_RECONCILIATION_ENABLED: false ALB_MAX_CAPACITY: 75 ALB_MAX_TARGET_GROUPS: 98 + ALB_MAX_LISTENERS: 48 ALB_METRICS_PUBLISH_ENABLED: false # ALB_METRICS_PUBLISH_TARGET: cloudwatch # Available values: cloudwatch | datadog DEPLOYMENT_MAX_WAIT_IN_SECONDS: 600 From 296c410946a777b9f380598204452bb82a7b5d05 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Mon, 11 May 2026 14:55:40 -0300 Subject: [PATCH 29/56] Review changes --- k8s/deployment/build_context | 14 ++- k8s/deployment/templates/deployment.yaml.tpl | 10 +- k8s/deployment/templates/service.yaml.tpl | 2 +- k8s/deployment/tests/build_context.bats | 118 +++++++++++++++++++ k8s/values.yaml | 4 + 5 files changed, 141 insertions(+), 7 deletions(-) diff --git a/k8s/deployment/build_context b/k8s/deployment/build_context index e15054c3..1f357deb 100755 --- a/k8s/deployment/build_context +++ b/k8s/deployment/build_context @@ -245,10 +245,22 @@ if [[ -n "$TRAFFIC_MANAGER_CONFIG_MAP" ]]; then log info "✨ ConfigMap '$TRAFFIC_MANAGER_CONFIG_MAP' validation successful" fi -# Main HTTP port — defaults to 8080 if not set on the scope (CLIEN-739) MAIN_HTTP_PORT=$(echo "$CONTEXT" | jq -r '.scope.capabilities.main_http_port // 8080') log debug "🔍 main_http_port resolved to ${MAIN_HTTP_PORT}" +# Enrich each additional_ports entry with traffic_manager_port = port + 10000. +# Convention: the traffic-manager sidecar that fronts an additional port binds +# {port}+10000 inside the pod so the application can bind {port} directly. The +# fixed +10000 offset makes it trivial to identify which sidecar belongs to +# which application port at a glance (e.g. app 8081 -> sidecar 18081). Keeping +# the math here (instead of in every template) means consumers just read +# .traffic_manager_port and never re-derive it.
+CONTEXT=$(echo "$CONTEXT" | jq ' + if (.scope.capabilities.additional_ports | type) == "array" then + .scope.capabilities.additional_ports |= map(. + {traffic_manager_port: (.port + 10000)}) + else . end +') + # Check if blue deployment has K8s services for additional ports BLUE_ADDITIONAL_PORT_SERVICES="{}" if [ -n "$BLUE_DEPLOYMENT_ID" ] && [ "$BLUE_DEPLOYMENT_ID" != "null" ]; then diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index 5e1f6368..3552c483 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -237,7 +237,7 @@ spec: runAsUser: 0 image: {{ $.traffic_image }} ports: - - containerPort: {{ add .port 10000 }} + - containerPort: {{ .traffic_manager_port }} protocol: TCP env: - name: UPSTREAM_PORT @@ -249,7 +249,7 @@ spec: - name: LISTENER_PROTOCOL value: http - name: LISTENER_PORT - value: '{{ add .port 10000 }}' + value: '{{ .traffic_manager_port }}' - name: HEALTH_CHECK_PATH value: {{ $.scope.capabilities.health_check.path }} resources: @@ -261,7 +261,7 @@ spec: livenessProbe: httpGet: path: {{ $.scope.capabilities.health_check.path }} - port: {{ add .port 10000 }} + port: {{ .traffic_manager_port }} timeoutSeconds: 5 periodSeconds: 10 initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} @@ -270,7 +270,7 @@ spec: readinessProbe: httpGet: path: {{ $.scope.capabilities.health_check.path }} - port: {{ add .port 10000 }} + port: {{ .traffic_manager_port }} timeoutSeconds: 5 periodSeconds: 10 initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} @@ -279,7 +279,7 @@ spec: startupProbe: httpGet: path: {{ $.scope.capabilities.health_check.path }} - port: {{ add .port 10000 }} + port: {{ .traffic_manager_port }} timeoutSeconds: 5 periodSeconds: 10 initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} diff --git a/k8s/deployment/templates/service.yaml.tpl 
b/k8s/deployment/templates/service.yaml.tpl index 0bb36b44..ab74d346 100644 --- a/k8s/deployment/templates/service.yaml.tpl +++ b/k8s/deployment/templates/service.yaml.tpl @@ -142,7 +142,7 @@ spec: ports: - protocol: TCP port: {{ .port }} - targetPort: {{ add .port 10000 }} + targetPort: {{ .traffic_manager_port }} selector: nullplatform: "true" account: {{ $.account.slug }} diff --git a/k8s/deployment/tests/build_context.bats b/k8s/deployment/tests/build_context.bats index dfd57700..690a8ab4 100644 --- a/k8s/deployment/tests/build_context.bats +++ b/k8s/deployment/tests/build_context.bats @@ -828,3 +828,121 @@ SCRIPT result=$(echo '{}' | jq --arg main_http_port "$MAIN_HTTP_PORT" '. + {main_http_port: ($main_http_port | tonumber)} | .main_http_port') assert_equal "$result" "9090" } + +# ============================================================================= +# additional_ports enrichment: traffic_manager_port = port + 10000 +# These tests source the real deployment/build_context and assert on the +# resulting CONTEXT, so the entire pipeline (scope/build_context -> deployment +# enrichment) is exercised. +# ============================================================================= + +# Stages the full environment needed to source deployment/build_context: +# external commands (kubectl, aws) mocked, required env vars set, and CONTEXT +# pre-populated with a deployment that satisfies validate_status. The caller +# patches CONTEXT.scope.capabilities.additional_ports for the case under test. +setup_full_build_context() { + export SERVICE_PATH="$PROJECT_ROOT/k8s" + export SCRIPT="$PROJECT_ROOT/k8s/deployment/build_context" + export NP_OUTPUT_DIR="$(mktemp -d)" + export SERVICE_ACTION="start-initial" + # Skip the route53 / additional-balancer code paths that would call `aws`. 
+ export DNS_TYPE="external_dns" + + kubectl() { + case "$1 $2" in + "get namespace") return 0 ;; + "get service") return 1 ;; # no blue services -> empty map, harmless + *) return 0 ;; + esac + } + export -f kubectl + + export CONTEXT='{ + "scope": { + "id": "test-scope-123", + "nrn": "nrn:organization=100:account=200:namespace=300:application=400", + "domain": "test.nullapps.io", + "capabilities": { + "visibility": "public", + "scaling_type": "fixed", + "fixed_instances": 2, + "protocol": "http" + } + }, + "namespace": {"slug": "test-namespace"}, + "application": {"slug": "test-app"}, + "deployment": {"id": "deploy-123", "status": "creating"}, + "providers": { + "cloud-providers": {"account": {"region": "us-east-1"}}, + "container-orchestration": { + "cluster": {"namespace": "default-namespace"}, + "gateway": {"public_name": "gw-pub", "private_name": "gw-priv"}, + "balancer": {"public_name": "alb-pub", "private_name": "alb-priv"} + } + } + }' +} + +# Patches CONTEXT.scope.capabilities.additional_ports with the given JSON +# fragment (raw jq value, e.g. '[{"port":8081,"type":"HTTP"}]' or 'null'). 
+set_additional_ports() { + CONTEXT=$(echo "$CONTEXT" | jq --argjson v "$1" '.scope.capabilities.additional_ports = $v') +} + +@test "traffic_manager_port: derived as port + 10000 for every additional_ports entry" { + setup_full_build_context + set_additional_ports '[{"port":8081,"type":"HTTP"},{"port":9014,"type":"GRPC"}]' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports[0].traffic_manager_port')" "18081" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports[1].traffic_manager_port')" "19014" +} + +@test "traffic_manager_port: preserves original port and type fields" { + setup_full_build_context + set_additional_ports '[{"port":8081,"type":"HTTP"}]' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports[0].port')" "8081" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports[0].type')" "HTTP" +} + +@test "traffic_manager_port: emitted as JSON number (not string) for Go template consumption" { + setup_full_build_context + set_additional_ports '[{"port":8081,"type":"HTTP"}]' + + source "$SCRIPT" + + local jq_type + jq_type=$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports[0].traffic_manager_port | type') + assert_equal "$jq_type" "number" +} + +@test "traffic_manager_port: noop when additional_ports is absent" { + setup_full_build_context + CONTEXT=$(echo "$CONTEXT" | jq 'del(.scope.capabilities.additional_ports)') + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports')" "null" +} + +@test "traffic_manager_port: noop when additional_ports is null" { + setup_full_build_context + set_additional_ports 'null' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports')" "null" +} + +@test "traffic_manager_port: noop when additional_ports is empty array" { + setup_full_build_context + set_additional_ports '[]' + + 
source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -c '.scope.capabilities.additional_ports')" "[]" +} diff --git a/k8s/values.yaml b/k8s/values.yaml index ec9042fa..d053bc0a 100644 --- a/k8s/values.yaml +++ b/k8s/values.yaml @@ -11,7 +11,11 @@ configuration: DNS_TYPE: route53 # Available values route53 | azure | external_dns ALB_RECONCILIATION_ENABLED: false ALB_MAX_CAPACITY: 75 + # 100 is the max target groups for ALB. Keeps 2 free for emergencies + # Ref: https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-limits.html ALB_MAX_TARGET_GROUPS: 98 + # 50 is the max listeners for ALB. Keeps 2 free for emergencies + # Ref: https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-limits.html ALB_MAX_LISTENERS: 48 ALB_METRICS_PUBLISH_ENABLED: false # ALB_METRICS_PUBLISH_TARGET: cloudwatch # Available values: cloudwatch | datadog From 849f97ce7f4269cfb61ed3f24e4b88364471c8cd Mon Sep 17 00:00:00 2001 From: Ignacio Boudgouste <73672747+ignacioboud@users.noreply.github.com> Date: Wed, 20 May 2026 15:52:27 -0300 Subject: [PATCH 30/56] Surface user-friendly failure reasons on deployment timeout (#184) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Surface user-friendly failure reasons on deployment timeout Inspect pod containerStatuses and aggregate Warning events to derive a specific reason (ImagePullBackOff, OOMKilled, FailedScheduling, etc.), then map it to a human message and a targeted suggested fix. The generic checklist now only shows when no specific reason can be pinpointed. Co-Authored-By: Claude Opus 4.7 (1M context) * chore: move code to print failed deployments * chore: clean wait deployment * Fix CONTEXT/ALL_EVENTS jq fallback escapes and expand hint test coverage The ${CONTEXT:-{\}} and ${ALL_EVENTS:-{...\}} defaults expanded to invalid JSON, so jq failed silently when those vars were unset and hint messages came out malformed. 
The OOMKilled message also left "(Mi)" dangling when the scope had no ram_memory capability. Tests now cover all 10 FAILURE_REASON branches, assert wait_deployment_active invokes the hint script on timeout and non-running status, and guard against both regressions. Co-Authored-By: Claude Opus 4.7 (1M context) * Translate Unhealthy probe events and add waiting heartbeat - Add translate_probe_message helper that parses K8s probe failure messages into a human line (probe kind + path + failure mode: connection-refused / HTTP statuscode / timeout). - wait_deployment_active: source the helper and translate Unhealthy events in real time during the polling loop. Initialize LATEST_TIMESTAMP to now() so action retries do not reprint stale events. Emit an info-level heartbeat every 10% of TIMEOUT (clamped to >=1 iteration) so the operator sees progress when no new K8s events fire. - print_failed_deployment_hints: source the helper, extract the latest Unhealthy event message, enrich HUMAN_MESSAGE with the translated detail, and switch SUGGESTED_FIX based on failure mode (port not bound vs HTTP non-2xx vs probe timeout). - Tests: +8 for the helper, +4 for the hint enrichment (incl. latest event tiebreak), +5 for wait_deployment_active (heartbeat cadence, short-timeout clamp, success suppression, event-translation, stale-event filter). Full suite: 241/241 green. Co-Authored-By: Claude Opus 4.7 (1M context) * Consolidate Unhealthy events per pod and add raw-message fallback - translate_probe_message: add parse_probe_message (pipe-separated kind|path|mode for composition) and short_pod_name (strips the K8S_DEPLOYMENT_NAME- prefix and marks truncation with leading "..."). - wait_deployment_active: split the polling event loop into Unhealthy vs. other. For Unhealthy, group_by(pod) via jq and emit a single line per pod with every failure mode joined by ", ". Sentence form: "Startup probe failing on /health — not yet listening, responded HTTP 502 (expected 2xx)". 
- Defensive fallback: log_unhealthy_group returns non-zero if any message in the group cannot be parsed; the caller then invokes log_unhealthy_raw to emit one raw warning line per original message. Same shape in print_failed_deployment_hints — if translate_probe_message fails the hint surfaces the raw text. - Tests: +6 for parse_probe_message / short_pod_name, +1 for the consolidated single-line assertion, +1 for the raw-fallback path in wait_deployment_active, +1 for raw-fallback in the hint. Full suite: 250/250 green. Co-Authored-By: Claude Opus 4.7 (1M context) * Inline probe helpers into print_failed_deployment_hints Drop the standalone k8s/deployment/translate_probe_message helper file and move parse_probe_message, short_pod_name, and translate_probe_message to the top of print_failed_deployment_hints. The hint script now doubles as a sourceable library: when sourced with PRINT_HINTS_LIB_ONLY=true only the function definitions load, otherwise the diagnostic main also runs. - wait_deployment_active loads the helpers with PRINT_HINTS_LIB_ONLY=true at startup; the existing timeout/non-running sources stay unchanged so they still trigger the diagnostic main. - tests/translate_probe_message.bats sources the unified file with the same flag. - Full suite stays at 250/250 green. 
Co-Authored-By: Claude Opus 4.7 (1M context) * chore(changelog): note wait deployment active failure logging improvements Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + k8s/deployment/print_failed_deployment_hints | 327 ++++++++++++- .../tests/print_failed_deployment_hints.bats | 443 +++++++++++++++++- .../tests/translate_probe_message.bats | 142 ++++++ .../tests/wait_deployment_active.bats | 358 ++++++++++++++ k8s/deployment/wait_deployment_active | 103 +++- 6 files changed, 1341 insertions(+), 33 deletions(-) create mode 100644 k8s/deployment/tests/translate_probe_message.bats diff --git a/CHANGELOG.md b/CHANGELOG.md index 0299d06b..14165264 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - Public and private scopes now register DNS records in their correct Route53 hosted zone when using `DNS_TYPE=external_dns`, preventing cross-zone record leakage - Add configurable main HTTP port for k8s scopes (default 8080) and HTTP support for additional ports +- Improve **wait deployment active** failure logging: consolidate repeated `Unhealthy` probe events per pod into a single human-readable line, emit a progress heartbeat every 10% of timeout, and surface a targeted suggested fix based on the probe failure mode (port not open / HTTP non-2xx / probe timeout) ## [1.11.0] - 2026-04-16 - Add unit testing support diff --git a/k8s/deployment/print_failed_deployment_hints b/k8s/deployment/print_failed_deployment_hints index 66ce5d51..33b08ff8 100644 --- a/k8s/deployment/print_failed_deployment_hints +++ b/k8s/deployment/print_failed_deployment_hints @@ -1,23 +1,310 @@ #!/bin/bash +# This file doubles as a sourceable library: +# - Sourced with PRINT_HINTS_LIB_ONLY=true → only function definitions load +# (used by wait_deployment_active to access the probe helpers during polling). 
+# - Sourced or executed without that flag → also runs the diagnostic main +# below (used by wait_deployment_active in the timeout/non-running paths). -HEALTH_CHECK_PATH=$(echo "$CONTEXT" | jq -r .scope.capabilities.health_check.path) -REQUESTED_MEMORY=$(echo "$CONTEXT" | jq -r .scope.capabilities.ram_memory) -SCOPE_NAME=$(echo "$CONTEXT" | jq -r .scope.name) -SCOPE_DIMENSIONS=$(echo "$CONTEXT" | jq -r .scope.dimensions) - -log error "" -log error "⚠️ Application Startup Issue Detected" -log error "" -log error "💡 Possible causes:" -log error " Your application was unable to start within the expected timeframe" -log error "" -log error "🔧 How to fix:" -log error " 1. Port Configuration: Ensure your application listens on port 8080" -log error " 2. Health Check Endpoint: Verify your app responds to: $HEALTH_CHECK_PATH" -log error " 3. Application Logs: Review logs for startup errors (database connections," -log error " missing dependencies, or initialization errors)" -log error " 4. Memory Allocation: Current allocation is ${REQUESTED_MEMORY}Mi - increase if needed" -log error " 5. Environment Variables: Verify all required variables are configured in" -log error " parameters for scope '$SCOPE_NAME' or dimensions: $SCOPE_DIMENSIONS" -log error "" +# ----------------------------------------------------------------------------- +# Probe event helpers +# ----------------------------------------------------------------------------- + +# Outputs the probe failure components as pipe-separated fields: kind|path|mode. +# Pipe (not whitespace) is used because `read -r a b c` with IFS containing +# whitespace would collapse consecutive separators and swallow empty fields. +# Mode is one of: "not yet listening", "responded HTTP {code} (expected 2xx)", +# "request timed out", or "failed". Returns non-zero on non-probe messages.
+parse_probe_message() { + local msg="$1" + [[ -z "$msg" ]] && return 1 + + local probe_kind="" + if [[ "$msg" == *"Startup probe failed"* ]]; then + probe_kind="Startup" + elif [[ "$msg" == *"Liveness probe failed"* ]]; then + probe_kind="Liveness" + elif [[ "$msg" == *"Readiness probe failed"* ]]; then + probe_kind="Readiness" + else + return 1 + fi + + local probe_path="" + if [[ "$msg" =~ http://[0-9.]+:[0-9]+([^\"[:space:]]+) ]]; then + probe_path="${BASH_REMATCH[1]}" + fi + + local mode_text="" + if [[ "$msg" == *"connection refused"* ]]; then + mode_text="not yet listening" + elif [[ "$msg" =~ statuscode:[[:space:]]*([0-9]+) ]]; then + mode_text="responded HTTP ${BASH_REMATCH[1]} (expected 2xx)" + elif [[ "$msg" == *"context deadline exceeded"* || "$msg" == *"Client.Timeout"* || "$msg" == *"i/o timeout"* ]]; then + mode_text="request timed out" + else + mode_text="failed" + fi + + printf "%s|%s|%s\n" "$probe_kind" "$probe_path" "$mode_text" +} + +# Strips the deployment-name prefix ("$K8S_DEPLOYMENT_NAME-") from a pod name, leaving +# the replicaset hash and pod suffix (the parts that distinguish replicas). +# A leading "..." marks the truncation so the operator can tell the displayed +# name is a tail, not the real pod name. Falls back to the full name when the +# prefix does not match. +short_pod_name() { + local name="$1" + local prefix="${K8S_DEPLOYMENT_NAME:-}" + if [[ -n "$prefix" && "$name" == "$prefix"-* ]]; then + echo "...${name#${prefix}-}" + else + echo "$name" + fi +} + +# Translates a Kubernetes probe failure message into a single user-friendly +# line. Echoes the translation on stdout; returns non-zero if the input is not +# a probe message so callers can fall back to the raw text.
+translate_probe_message() { + local msg="$1" + [[ -z "$msg" ]] && return 1 + + local probe_kind="" + if [[ "$msg" == *"Startup probe failed"* ]]; then + probe_kind="Startup" + elif [[ "$msg" == *"Liveness probe failed"* ]]; then + probe_kind="Liveness" + elif [[ "$msg" == *"Readiness probe failed"* ]]; then + probe_kind="Readiness" + else + return 1 + fi + + local probe_path="" + if [[ "$msg" =~ http://[0-9.]+:[0-9]+([^\"[:space:]]+) ]]; then + probe_path="${BASH_REMATCH[1]}" + fi + local path_suffix="" + [[ -n "$probe_path" ]] && path_suffix=" on $probe_path" + + if [[ "$msg" == *"connection refused"* ]]; then + echo "${probe_kind} probe — app is not yet listening${path_suffix}" + elif [[ "$msg" =~ statuscode:[[:space:]]*([0-9]+) ]]; then + echo "${probe_kind} probe — app responded with HTTP ${BASH_REMATCH[1]}${path_suffix} (expected 2xx)" + elif [[ "$msg" == *"context deadline exceeded"* || "$msg" == *"Client.Timeout"* || "$msg" == *"i/o timeout"* ]]; then + echo "${probe_kind} probe — request timed out${path_suffix}" + else + echo "${probe_kind} probe failed${path_suffix}" + fi +} + +# ----------------------------------------------------------------------------- +# Diagnostic state and functions +# ----------------------------------------------------------------------------- + +ALL_EVENTS="${ALL_EVENTS:-{\"items\":[]}}" + +FAILURE_REASON="" +FAILURE_MESSAGE="" +FAILURE_CONTAINER="" +FAILURE_EXIT_CODE="" +TOP_EVENT_REASONS="" +UNHEALTHY_MESSAGE="" +HUMAN_MESSAGE="" +SUGGESTED_FIX="" + +diagnose_failure() { + local pods_json="" + if [[ -n "${K8S_NAMESPACE:-}" && -n "${DEPLOYMENT_ID:-}" ]] && command -v kubectl >/dev/null 2>&1; then + pods_json=$(kubectl get pods -n "$K8S_NAMESPACE" -l "deployment_id=${DEPLOYMENT_ID}" -o json 2>/dev/null) + fi + + if [[ -n "$pods_json" ]] && echo "$pods_json" | jq -e . >/dev/null 2>&1; then + FAILURE_REASON=$(echo "$pods_json" | jq -r ' + [.items[].status.containerStatuses[]? 
+ | (.state.waiting.reason // .lastState.terminated.reason // empty) + ] | map(select(. != "" and . != "Completed")) | + group_by(.) | max_by(length) | .[0] // empty' 2>/dev/null) + + if [[ -n "$FAILURE_REASON" ]]; then + FAILURE_MESSAGE=$(echo "$pods_json" | jq -r --arg r "$FAILURE_REASON" ' + [.items[].status.containerStatuses[]? + | select(.state.waiting.reason == $r or .lastState.terminated.reason == $r) + | (.state.waiting.message // .lastState.terminated.message // empty) + ] | map(select(. != "")) | .[0] // empty' 2>/dev/null) + + FAILURE_CONTAINER=$(echo "$pods_json" | jq -r --arg r "$FAILURE_REASON" ' + [.items[].status.containerStatuses[]? + | select(.state.waiting.reason == $r or .lastState.terminated.reason == $r) + | .name + ] | .[0] // empty' 2>/dev/null) + + FAILURE_EXIT_CODE=$(echo "$pods_json" | jq -r --arg r "$FAILURE_REASON" ' + [.items[].status.containerStatuses[]? + | select(.lastState.terminated.reason == $r) + | .lastState.terminated.exitCode + ] | map(select(. != null)) | .[0] // empty' 2>/dev/null) + fi + fi + + TOP_EVENT_REASONS=$(echo "$ALL_EVENTS" | jq -r ' + .items | map(select(.type == "Warning")) | + group_by(.reason) | + map({reason: .[0].reason, count: length}) | + sort_by(-.count) | .[0:3][] | + " • \(.reason) (×\(.count))"' 2>/dev/null) + + UNHEALTHY_MESSAGE=$(echo "$ALL_EVENTS" | jq -r ' + .items + | map(select(.type == "Warning" and .reason == "Unhealthy")) + | sort_by(.eventTime // .lastTimestamp // .firstTimestamp // .metadata.creationTimestamp // "") + | last + | .message // empty' 2>/dev/null) + + if [[ -z "$FAILURE_REASON" ]]; then + FAILURE_REASON=$(echo "$ALL_EVENTS" | jq -r ' + .items | map(select(.type == "Warning")) | + group_by(.reason) | max_by(length) | .[0].reason // empty' 2>/dev/null) + fi + + local req_memory scope_name health_check_path + req_memory=$(echo "${CONTEXT:-{}}" | jq -r '.scope.capabilities.ram_memory // empty' 2>/dev/null) + scope_name=$(echo "${CONTEXT:-{}}" | jq -r '.scope.name // empty' 
2>/dev/null) + health_check_path=$(echo "${CONTEXT:-{}}" | jq -r '.scope.capabilities.health_check.path // "/"' 2>/dev/null) + + case "$FAILURE_REASON" in + ImagePullBackOff|ErrImagePull) + HUMAN_MESSAGE="The container image could not be pulled." + SUGGESTED_FIX="Verify the image name, tag, and registry credentials are correct." ;; + CrashLoopBackOff|BackOff) + HUMAN_MESSAGE="The container started and crashed repeatedly." + SUGGESTED_FIX="Review application logs for startup errors (failed dependencies, bad config, panics)." ;; + OOMKilled) + if [[ -n "$req_memory" ]]; then + HUMAN_MESSAGE="The container exceeded its memory limit (${req_memory}Mi) and was terminated." + else + HUMAN_MESSAGE="The container exceeded its memory limit and was terminated." + fi + SUGGESTED_FIX="Increase ram_memory for scope '$scope_name' or reduce application memory usage." ;; + CreateContainerConfigError) + HUMAN_MESSAGE="The container configuration is invalid." + SUGGESTED_FIX="Check for missing secrets or configmaps referenced by the deployment." ;; + CreateContainerError) + HUMAN_MESSAGE="Kubernetes could not create the container." + SUGGESTED_FIX="Check volumes, permissions, and the pod spec for errors." ;; + RunContainerError) + HUMAN_MESSAGE="The container failed to run its entrypoint." + SUGGESTED_FIX="Verify the start command and that required binaries exist in the image." ;; + ContainerCannotRun) + HUMAN_MESSAGE="The startup binary is missing or not executable inside the image." + SUGGESTED_FIX="Rebuild the image ensuring the entrypoint exists and has execute permissions." ;; + FailedScheduling) + HUMAN_MESSAGE="No node has enough resources or matches the pod's scheduling constraints." + SUGGESTED_FIX="Reduce requested resources, free cluster capacity, or review nodeSelector/affinity rules." ;; + FailedMount|FailedAttachVolume) + HUMAN_MESSAGE="A volume could not be mounted onto the pod." 
+ SUGGESTED_FIX="Check that the referenced PVC, secret, or configmap exists and is accessible." ;; + Unhealthy) + HUMAN_MESSAGE="The application did not pass its health check at $health_check_path." + if [[ -n "$UNHEALTHY_MESSAGE" ]]; then + local translated="" + translated=$(translate_probe_message "$UNHEALTHY_MESSAGE" 2>/dev/null) || translated="" + if [[ -n "$translated" ]]; then + HUMAN_MESSAGE="$HUMAN_MESSAGE Detected: $translated." + else + # Fallback: surface the raw K8s message so context is not lost + HUMAN_MESSAGE="$HUMAN_MESSAGE Detected: $UNHEALTHY_MESSAGE" + fi + fi + if [[ "$UNHEALTHY_MESSAGE" == *"connection refused"* ]]; then + SUGGESTED_FIX="The container is not listening on port 8080 — verify the start command runs, the process binds to 0.0.0.0:8080, and nothing is crashing before it accepts connections." + elif [[ "$UNHEALTHY_MESSAGE" =~ statuscode:[[:space:]]*([0-9]+) ]]; then + SUGGESTED_FIX="The app responded with HTTP ${BASH_REMATCH[1]} on $health_check_path — inspect application logs for startup errors; the process is running but $health_check_path is not returning 2xx." + elif [[ "$UNHEALTHY_MESSAGE" == *"context deadline exceeded"* || "$UNHEALTHY_MESSAGE" == *"Client.Timeout"* || "$UNHEALTHY_MESSAGE" == *"i/o timeout"* ]]; then + SUGGESTED_FIX="The probe timed out — the app may be slow to start or $health_check_path is blocking. Consider increasing startup probe initialDelaySeconds/timeoutSeconds, or making $health_check_path lighter." + else + SUGGESTED_FIX="Ensure the app listens on port 8080 and returns 2xx on $health_check_path within the readiness window." + fi ;; + FailedCreate|FailedCreatePodSandBox) + HUMAN_MESSAGE="Kubernetes could not create the pod sandbox." + SUGGESTED_FIX="Check node health, CNI configuration, and pod security policies." 
;; + "") + HUMAN_MESSAGE="" + SUGGESTED_FIX="" ;; + *) + HUMAN_MESSAGE="Pods are failing with reason: $FAILURE_REASON" + SUGGESTED_FIX="" ;; + esac +} + +print_specific_diagnostics() { + if [[ -n "$HUMAN_MESSAGE" ]]; then + log error "" + log error "📋 Reason: $HUMAN_MESSAGE" + fi + + if [[ -n "$FAILURE_REASON" && -n "$FAILURE_CONTAINER" ]]; then + local detail="📋 Detected: $FAILURE_REASON on container $FAILURE_CONTAINER" + if [[ -n "$FAILURE_EXIT_CODE" ]]; then + detail="$detail (exit $FAILURE_EXIT_CODE)" + fi + log error "$detail" + elif [[ -n "$FAILURE_REASON" ]]; then + log error "📋 Detected: $FAILURE_REASON" + fi + + if [[ -n "$FAILURE_MESSAGE" ]]; then + log error "📋 Details: $FAILURE_MESSAGE" + fi + + if [[ -n "${desired:-}" ]]; then + log error "📊 Progress at failure: ${ready:-0}/${desired} ready, ${current:-0}/${desired} available" + fi + + if [[ -n "$TOP_EVENT_REASONS" ]]; then + log error "📋 Recent warnings:" + while IFS= read -r line; do + [[ -n "$line" ]] && log error "$line" + done <<< "$TOP_EVENT_REASONS" + fi + + if [[ -n "$SUGGESTED_FIX" ]]; then + log error "💡 Suggested fix: $SUGGESTED_FIX" + fi +} + +print_generic_hints() { + local health_check_path requested_memory scope_name scope_dimensions + health_check_path=$(echo "$CONTEXT" | jq -r .scope.capabilities.health_check.path) + requested_memory=$(echo "$CONTEXT" | jq -r .scope.capabilities.ram_memory) + scope_name=$(echo "$CONTEXT" | jq -r .scope.name) + scope_dimensions=$(echo "$CONTEXT" | jq -r .scope.dimensions) + + log error "" + log error "⚠️ Application Startup Issue Detected" + log error "" + log error "💡 Possible causes:" + log error " Your application was unable to start within the expected timeframe" + log error "" + log error "🔧 How to fix:" + log error " 1. Port Configuration: Ensure your application listens on port 8080" + log error " 2. Health Check Endpoint: Verify your app responds to: $health_check_path" + log error " 3. 
# Fail (return 1) when the text in $1 contains the literal substring $2.
# On failure the forbidden substring and the full text are printed so the
# bats report shows what was found; on success nothing is printed.
assert_not_contains() {
  local text="$1" forbidden="$2"

  case "$text" in
    *"$forbidden"*)
      printf '%s\n' \
        "Expected output to NOT contain: '$forbidden'" \
        "Actual: '$text'"
      return 1
      ;;
  esac
}
============================================================================= +# Pod-derived Diagnostics +# ============================================================================= +@test "print_failed_deployment_hints: identifies OOMKilled and skips generic hints" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"running":{}},"lastState":{"terminated":{"reason":"OOMKilled","exitCode":137,"message":"out of memory"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The container exceeded its memory limit (512Mi)" + assert_contains "$output" "📋 Detected: OOMKilled on container app (exit 137)" + assert_contains "$output" "📋 Details: out of memory" + assert_contains "$output" "💡 Suggested fix: Increase ram_memory for scope 'my-app'" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies ImagePullBackOff from waiting state without exit code" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"web","state":{"waiting":{"reason":"ImagePullBackOff","message":"manifest unknown"}},"lastState":{}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The container image could not be pulled." 
+ assert_contains "$output" "📋 Detected: ImagePullBackOff on container web" + assert_not_contains "$output" "exit " + assert_contains "$output" "📋 Details: manifest unknown" + assert_contains "$output" "💡 Suggested fix: Verify the image name, tag, and registry credentials" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies CrashLoopBackOff and skips generic hints" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"worker","state":{"waiting":{"reason":"CrashLoopBackOff","message":"back-off 5m0s restarting failed container"}},"lastState":{"terminated":{"exitCode":1}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The container started and crashed repeatedly." + assert_contains "$output" "📋 Detected: CrashLoopBackOff on container worker" + assert_contains "$output" "💡 Suggested fix: Review application logs for startup errors" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies CreateContainerConfigError and points to secrets/configmaps" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"CreateContainerConfigError","message":"secret \"db-creds\" not found"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The container configuration is invalid." 
+ assert_contains "$output" "💡 Suggested fix: Check for missing secrets or configmaps" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies RunContainerError as entrypoint failure" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"waiting":{"reason":"RunContainerError"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The container failed to run its entrypoint." + assert_contains "$output" "💡 Suggested fix: Verify the start command" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies ContainerCannotRun as missing binary" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"running":{}},"lastState":{"terminated":{"reason":"ContainerCannotRun","exitCode":127,"message":"exec: \"/app\": no such file"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The startup binary is missing or not executable" + assert_contains "$output" "📋 Detected: ContainerCannotRun on container app (exit 127)" + assert_contains "$output" "💡 Suggested fix: Rebuild the image" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies FailedMount from ALL_EVENTS" { + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"FailedMount","message":"MountVolume.SetUp failed"}]}' + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: A 
volume could not be mounted onto the pod." + assert_contains "$output" "💡 Suggested fix: Check that the referenced PVC, secret, or configmap exists" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies FailedCreatePodSandBox from ALL_EVENTS" { + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"FailedCreatePodSandBox","message":"failed to create pod sandbox"}]}' + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: Kubernetes could not create the pod sandbox." + assert_contains "$output" "💡 Suggested fix: Check node health, CNI configuration" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies Unhealthy and references the configured health check path" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "did not pass its health check at /health" + assert_contains "$output" "💡 Suggested fix: Ensure the app listens on port 8080 and returns 2xx on /health" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: enriches Unhealthy with connection-refused detail and targeted fix" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:13:42Z","message":"Startup probe failed: Get \"http://10.0.0.1:8080/health\": dial tcp 10.0.0.1:8080: connect: connection refused"}]}' + + kubectl() { + case "$*" in + "get pods"*) + echo 
'{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + # HUMAN_MESSAGE retains the base sentence and appends the translated probe failure + assert_contains "$output" "did not pass its health check at /health" + assert_contains "$output" "Detected: Startup probe" + assert_contains "$output" "not yet listening" + # SUGGESTED_FIX is targeted: tells the user the app is not binding the port + assert_contains "$output" "not listening on port 8080" + # Generic fallback fix must NOT appear + assert_not_contains "$output" "returns 2xx on /health within the readiness window" +} + +@test "print_failed_deployment_hints: enriches Unhealthy with HTTP statuscode detail and targeted fix" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:13:42Z","message":"Startup probe failed: HTTP probe failed with statuscode: 502"}]}' + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "Detected: Startup probe" + assert_contains "$output" "HTTP 502" + # SUGGESTED_FIX cites the status code and points to app logs + assert_contains "$output" "responded with HTTP 502" + assert_contains "$output" "inspect application logs" +} + +@test "print_failed_deployment_hints: enriches Unhealthy with timeout detail and targeted fix" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:13:42Z","message":"Startup probe failed: Get \"http://10.0.0.1:8080/health\": context deadline 
exceeded (Client.Timeout exceeded while awaiting headers)"}]}' + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "Detected: Startup probe" + assert_contains "$output" "timed out" + # SUGGESTED_FIX mentions timing knobs + assert_contains "$output" "initialDelaySeconds" +} + +@test "print_failed_deployment_hints: falls back to raw Unhealthy message when translation is impossible" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + # Message does not match any known probe pattern → translate_probe_message returns non-zero. + # The raw text must still be surfaced in the hint instead of being silently dropped. + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:13:42Z","message":"completely unknown probe failure format from a future K8s"}]}' + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + # Raw message appears verbatim in the reason line + assert_contains "$output" "completely unknown probe failure format from a future K8s" + # Base sentence is still there + assert_contains "$output" "did not pass its health check at /health" +} + +@test "print_failed_deployment_hints: Unhealthy picks the latest event when multiple are present" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + # Two Warnings: an older 502 and a newer connection-refused. The fix must reflect the newer one. 
+ export ALL_EVENTS='{"items":[ + {"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:10:00Z","message":"Startup probe failed: HTTP probe failed with statuscode: 502"}, + {"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:13:42Z","message":"Startup probe failed: Get \"http://10.0.0.1:8080/health\": dial tcp: connect: connection refused"} + ]}' + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + # Latest event wins → connection-refused remediation, not the older HTTP 502 one + assert_contains "$output" "not listening on port 8080" + assert_not_contains "$output" "responded with HTTP 502" +} + +# ============================================================================= +# CONTEXT fallback handling +# ============================================================================= +@test "print_failed_deployment_hints: OOMKilled without ram_memory does not leave dangling (Mi)" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + # CONTEXT present but no ram_memory capability — plausible if the scope did not define memory. + export CONTEXT='{"scope":{"name":"my-app","dimensions":"prod","capabilities":{"health_check":{"path":"/health"}}}}' + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","lastState":{"terminated":{"reason":"OOMKilled","exitCode":137}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "exceeded its memory limit" + # The (Mi) parenthetical must not appear empty when ram_memory is missing. 
+ assert_not_contains "$output" "(Mi)" +} + +@test "print_failed_deployment_hints: applies CONTEXT defaults gracefully when CONTEXT is unset" { + # Drop the bats-provided CONTEXT so we exercise the ${CONTEXT:-{}} fallback. + unset CONTEXT + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + # health_check_path default "/" must apply when CONTEXT is unset. + assert_contains "$output" "health check at /." + assert_contains "$output" "returns 2xx on /" + # Guard against the previous escape bug: a literal backslash in the message + # would indicate jq received {\} instead of {} and silently failed. + assert_not_contains "$output" "{\\" +} + +# ============================================================================= +# Unknown Reason → falls through to generic checklist +# ============================================================================= +@test "print_failed_deployment_hints: unknown reason still prints generic hints alongside specific reason" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"waiting":{"reason":"WeirdNewError"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: Pods are failing with reason: WeirdNewError" + assert_contains "$output" "📋 Detected: WeirdNewError on container app" + # No suggested fix → fall through to generic checklist. 
+ assert_not_contains "$output" "💡 Suggested fix:" + assert_contains "$output" "⚠️ Application Startup Issue Detected" + assert_contains "$output" "🔧 How to fix:" +} + +# ============================================================================= +# Event-derived Diagnostics (no pods to inspect) +# ============================================================================= +@test "print_failed_deployment_hints: derives FailedScheduling from ALL_EVENTS when pods unavailable" { + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"FailedScheduling"},{"type":"Warning","reason":"FailedScheduling"}]}' + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: No node has enough resources" + assert_contains "$output" "📋 Detected: FailedScheduling" + assert_contains "$output" "💡 Suggested fix: Reduce requested resources" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: shows top warning event reasons summary" { + export ALL_EVENTS='{"items":[ + {"type":"Warning","reason":"BackOff"}, + {"type":"Warning","reason":"BackOff"}, + {"type":"Warning","reason":"BackOff"}, + {"type":"Warning","reason":"FailedMount"}, + {"type":"Warning","reason":"FailedMount"}, + {"type":"Warning","reason":"Unhealthy"}, + {"type":"Normal","reason":"Pulled"} + ]}' + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Recent warnings:" + assert_contains "$output" "BackOff (×3)" + assert_contains "$output" "FailedMount (×2)" + assert_contains "$output" "Unhealthy (×1)" + # Normal events should not be summarized + assert_not_contains "$output" "Pulled (×" +} + +# ============================================================================= +# Replica progress reporting +# ============================================================================= +@test "print_failed_deployment_hints: 
includes replica progress when desired/ready/current are set" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + export desired=3 ready=1 current=2 + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📊 Progress at failure: 1/3 ready, 2/3 available" +} diff --git a/k8s/deployment/tests/translate_probe_message.bats b/k8s/deployment/tests/translate_probe_message.bats new file mode 100644 index 00000000..2ff9de51 --- /dev/null +++ b/k8s/deployment/tests/translate_probe_message.bats @@ -0,0 +1,142 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for deployment/translate_probe_message - K8s probe message parser +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + # Load helpers only (skip the diagnostic main inside the hints script) + PRINT_HINTS_LIB_ONLY=true source "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + unset PRINT_HINTS_LIB_ONLY +} + +# ----------------------------------------------------------------------------- +# Connection refused +# ----------------------------------------------------------------------------- +@test "translate_probe_message: startup probe connection refused with path" { + run translate_probe_message 'Startup probe failed: Get "http://10.15.28.102:8080/health": dial tcp 10.15.28.102:8080: connect: connection refused' + + [ "$status" -eq 0 ] + assert_contains "$output" "Startup probe" + assert_contains "$output" "not yet listening" + assert_contains "$output" "/health" +} + +@test "translate_probe_message: liveness probe connection refused" { + run translate_probe_message 'Liveness probe failed: Get "http://10.0.0.5:8080/ping": dial tcp: connect: connection refused' + + [ "$status" -eq 0 ] + assert_contains "$output" "Liveness probe" + assert_contains "$output" "not yet listening" + assert_contains "$output" "/ping" +} + +# ----------------------------------------------------------------------------- +# HTTP status codes +# ----------------------------------------------------------------------------- +@test "translate_probe_message: startup probe HTTP 502" { + run translate_probe_message 'Startup probe failed: HTTP probe failed with statuscode: 502' + + [ "$status" -eq 0 ] + assert_contains "$output" "Startup probe" + assert_contains "$output" "HTTP 502" +} + +@test "translate_probe_message: readiness probe HTTP 404" { + run translate_probe_message 'Readiness probe failed: HTTP probe failed with statuscode: 404' + + [ "$status" -eq 0 ] + assert_contains "$output" "Readiness probe" + assert_contains "$output" "HTTP 404" +} + +# ----------------------------------------------------------------------------- +# Timeout +# 
----------------------------------------------------------------------------- +@test "translate_probe_message: startup probe timeout" { + run translate_probe_message 'Startup probe failed: Get "http://10.0.0.5:8080/health": context deadline exceeded (Client.Timeout exceeded while awaiting headers)' + + [ "$status" -eq 0 ] + assert_contains "$output" "Startup probe" + assert_contains "$output" "timed out" + assert_contains "$output" "/health" +} + +# ----------------------------------------------------------------------------- +# Non-probe messages +# ----------------------------------------------------------------------------- +@test "translate_probe_message: returns non-zero for non-probe messages" { + run translate_probe_message 'Failed to pull image "nginx:latest"' + + [ "$status" -ne 0 ] + [ -z "$output" ] +} + +@test "translate_probe_message: returns non-zero for empty input" { + run translate_probe_message '' + + [ "$status" -ne 0 ] +} + +# ----------------------------------------------------------------------------- +# Fallback for unknown probe failure shapes +# ----------------------------------------------------------------------------- +@test "translate_probe_message: generic fallback when probe failure mode is unrecognized" { + run translate_probe_message 'Startup probe failed: some weird new error format' + + [ "$status" -eq 0 ] + assert_contains "$output" "Startup probe" +} + +# ----------------------------------------------------------------------------- +# parse_probe_message — structured output for consolidation +# ----------------------------------------------------------------------------- +@test "parse_probe_message: emits pipe-separated kind, path, mode for connection refused" { + run parse_probe_message 'Startup probe failed: Get "http://10.0.0.1:8080/health": dial tcp: connect: connection refused' + + [ "$status" -eq 0 ] + [ "$output" = "Startup|/health|not yet listening" ] +} + +@test "parse_probe_message: emits 'responded HTTP ' mode with 
empty path field preserved" { + run parse_probe_message 'Startup probe failed: HTTP probe failed with statuscode: 502' + + [ "$status" -eq 0 ] + # Empty path between two pipes must be preserved so callers can read 3 fields. + # Mode reads as a verb so it composes inline with other modes in one sentence. + [ "$output" = "Startup||responded HTTP 502 (expected 2xx)" ] +} + +@test "parse_probe_message: returns non-zero for non-probe input" { + run parse_probe_message 'Failed to pull image' + [ "$status" -ne 0 ] +} + +# ----------------------------------------------------------------------------- +# short_pod_name — strip K8S_DEPLOYMENT_NAME prefix +# ----------------------------------------------------------------------------- +@test "short_pod_name: strips deployment prefix and marks truncation with '...'" { + K8S_DEPLOYMENT_NAME="d-326230662-1916903584" + run short_pod_name "d-326230662-1916903584-8578df9b4c-hhshq" + + [ "$status" -eq 0 ] + # Leading '...' tells the operator the name was shortened + [ "$output" = "...8578df9b4c-hhshq" ] +} + +@test "short_pod_name: returns full name when prefix env is unset" { + unset K8S_DEPLOYMENT_NAME + run short_pod_name "some-pod-name-abc" + + [ "$status" -eq 0 ] + [ "$output" = "some-pod-name-abc" ] +} + +@test "short_pod_name: returns full name when pod does not match the prefix" { + K8S_DEPLOYMENT_NAME="d-1-2" + run short_pod_name "unrelated-pod-xyz" + + [ "$status" -eq 0 ] + [ "$output" = "unrelated-pod-xyz" ] +} diff --git a/k8s/deployment/tests/wait_deployment_active.bats b/k8s/deployment/tests/wait_deployment_active.bats index 5983ec19..c1061ac2 100644 --- a/k8s/deployment/tests/wait_deployment_active.bats +++ b/k8s/deployment/tests/wait_deployment_active.bats @@ -114,6 +114,42 @@ teardown() { assert_contains "$output" "📋 Timeout: 5s (max 0 iterations)" assert_contains "$output" "❌ Timeout waiting for deployment" assert_contains "$output" "📋 Maximum iterations (0) reached" + # Timeout path must source 
print_failed_deployment_hints; with no pod info + # and no events, it falls through to the generic checklist. + assert_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "wait_deployment_active: surfaces specific failure reason on timeout when pod info is available" { + export TIMEOUT=5 + + kubectl() { + case "$*" in + "get deployment d-scope-123-deploy-456 -n test-namespace -o json") + echo '{"spec":{"replicas":3},"status":{"availableReplicas":0,"updatedReplicas":0,"readyReplicas":0}}' + ;; + "get pods -n test-namespace -l deployment_id=deploy-456 -o json") + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"running":{}},"lastState":{"terminated":{"reason":"OOMKilled","exitCode":137,"message":"out of memory"}}}]}}]}' + ;; + "get pods"*) + echo "" + ;; + "get events"*) + echo '{"items":[]}' + ;; + esac + } + export -f kubectl + + export CONTEXT='{"scope":{"name":"my-app","dimensions":"prod","capabilities":{"health_check":{"path":"/health"},"ram_memory":512}}}' + + run bash "$BATS_TEST_DIRNAME/../wait_deployment_active" + + [ "$status" -eq 1 ] + assert_contains "$output" "❌ Timeout waiting for deployment" + # The hint script must read pod state and surface the user-friendly reason + assert_contains "$output" "📋 Reason: The container exceeded its memory limit" + assert_contains "$output" "📋 Detected: OOMKilled on container app (exit 137)" + assert_contains "$output" "💡 Suggested fix: Increase ram_memory for scope 'my-app'" } # ============================================================================= @@ -159,6 +195,8 @@ teardown() { [ "$status" -eq 1 ] assert_contains "$output" "❌ Deployment is no longer running (status: failed)" + # Non-running status path must also source print_failed_deployment_hints + assert_contains "$output" "⚠️ Application Startup Issue Detected" } # ============================================================================= @@ -221,6 +259,7 @@ teardown() { [ "$status" -eq 1 ] assert_contains 
# Runs wait_deployment_active against a deployment that never becomes ready
# and checks that a "Still waiting" heartbeat line is logged with the
# attempt counter and elapsed time.
#
# The whole scenario is passed to `bash -c` as ONE double-quoted string:
#   - \" and \$ are escaped so they reach the inner shell literally;
#   - '$SERVICE_PATH', '$K8S_NAMESPACE', … are expanded by the OUTER (bats)
#     shell before the inner shell starts, so they carry the values from
#     the test environment (presumably set in setup() — confirm there).
# Inside the inner shell:
#   - sleep is replaced by a no-op so the polling loop does not wait;
#   - kubectl reports 0 of 2 replicas ready and no pods/events;
#   - np always prints 'running'.
@test "wait_deployment_active: logs heartbeat every 10% of timeout with progress info" {
  # TIMEOUT=100 -> MAX_ITERATIONS=10 -> HEARTBEAT_INTERVAL=1 (every iteration)
  run bash -c "
    sleep() { :; }
    export -f sleep

    kubectl() {
      case \"\$*\" in
        \"get deployment\"*\"-o json\"*)
          echo '{\"spec\":{\"replicas\":2},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}'
          ;;
        \"get pods\"*) echo '' ;;
        \"get events\"*) echo '{\"items\":[]}' ;;
      esac
    }
    export -f kubectl

    np() { echo 'running'; }
    export -f np

    export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE'
    export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID'
    export TIMEOUT=100 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false'
    bash '$BATS_TEST_DIRNAME/../wait_deployment_active'
  "

  # The deployment never becomes ready, so the script must end in failure.
  [ "$status" -eq 1 ]
  # First heartbeat always at iteration 1
  assert_contains "$output" "⏳ Still waiting — Ready: 0/2, Available: 0/2 (attempt 1/10, 10s elapsed)"
  # Mid-progress
  assert_contains "$output" "(attempt 5/10, 50s elapsed)"
  # Last iteration before timeout
  assert_contains "$output" "(attempt 10/10, 100s elapsed)"
}
'{\"spec\":{\"replicas\":1},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods\"*) echo '' ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=30 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash '$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + # All three iterations should emit a heartbeat (interval clamped to 1) + assert_contains "$output" "(attempt 1/3, 10s elapsed)" + assert_contains "$output" "(attempt 2/3, 20s elapsed)" + assert_contains "$output" "(attempt 3/3, 30s elapsed)" +} + +@test "wait_deployment_active: heartbeat is suppressed when deployment is ready on iteration 1" { + # Default mocks: deployment is ready immediately, so heartbeat should NOT fire. + run bash "$BATS_TEST_DIRNAME/../wait_deployment_active" + + [ "$status" -eq 0 ] + assert_contains "$output" "✅ All pods in deployment 'd-scope-123-deploy-456' are available and ready!" + # No heartbeat emitted because the ready-check breaks before it + if [[ "$output" == *"Still waiting"* ]]; then + echo "Expected output to NOT contain 'Still waiting' on success path" + echo "Actual: $output" + return 1 + fi +} + +# ============================================================================= +# Unhealthy Translation Tests +# ============================================================================= +@test "wait_deployment_active: translates Unhealthy connection-refused into human line during polling" { + # Use a far-future timestamp so the event is not filtered out by the now() initialization. 
+ run bash -c " + sleep() { :; } + export -f sleep + + kubectl() { + case \"\$*\" in + \"get deployment\"*\"-o json\"*) + echo '{\"spec\":{\"replicas\":1},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods -n test-namespace -l deployment_id=deploy-456 -o jsonpath\"*) + echo 'd-scope-123-deploy-456-abc' + ;; + \"get events\"*\"Pod\"*) + echo '{\"items\":[{\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc\"},\"reason\":\"Unhealthy\",\"message\":\"Startup probe failed: Get \\\"http://10.0.0.1:8080/health\\\": dial tcp 10.0.0.1:8080: connect: connection refused\"}]}' + ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=10 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash '$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + # Translated form must appear + assert_contains "$output" "Startup probe" + assert_contains "$output" "not yet listening" + assert_contains "$output" "/health" + # Raw connection-refused text must NOT leak through + if [[ "$output" == *"connection refused"* ]]; then + echo "Expected output to NOT contain raw 'connection refused' (should be translated)" + echo "Actual: $output" + return 1 + fi +} + +@test "wait_deployment_active: consolidates multiple Unhealthy events for same pod into a single line" { + # The kubelet often emits two Unhealthy events per probe round (connection refused + # + HTTP 502 from a sidecar). The polling loop must merge them into one log line + # with both failure modes combined, using the short pod name. 
+ run bash -c " + sleep() { :; } + export -f sleep + + kubectl() { + case \"\$*\" in + \"get deployment\"*\"-o json\"*) + echo '{\"spec\":{\"replicas\":1},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods -n test-namespace -l deployment_id=deploy-456 -o jsonpath\"*) + echo 'd-scope-123-deploy-456-abc-hhshq' + ;; + \"get events\"*\"Pod\"*) + echo '{\"items\":[ + {\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc-hhshq\"},\"reason\":\"Unhealthy\",\"message\":\"Startup probe failed: Get \\\"http://10.0.0.1:8080/health\\\": dial tcp: connect: connection refused\"}, + {\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc-hhshq\"},\"reason\":\"Unhealthy\",\"message\":\"Startup probe failed: HTTP probe failed with statuscode: 502\"} + ]}' + ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=10 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash '$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + # One consolidated line with both modes joined by ', ' for natural reading + assert_contains "$output" "Startup probe failing on /health — not yet listening, responded HTTP 502 (expected 2xx)" + # Pod name must be the short form with '...' 
prefix marking truncation + assert_contains "$output" "Pod/...abc-hhshq" + # The long prefix must NOT appear in any logged event line + if [[ "$output" == *"Pod/d-scope-123-deploy-456-abc-hhshq"* ]]; then + echo "Expected output to use short pod name, not the full prefix" + echo "Actual: $output" + return 1 + fi + # And we must see only ONE consolidated line, not one per mode. + local lines + lines=$(printf '%s\n' "$output" | grep -c "Startup probe failing" || true) + if [ "$lines" -ne 1 ]; then + echo "Expected exactly 1 consolidated 'Startup probe failing' line, got $lines" + echo "Actual: $output" + return 1 + fi +} + +@test "wait_deployment_active: falls back to raw messages when parse_probe_message cannot translate" { + # Two Unhealthy events whose messages do NOT match any known probe pattern. + # Consolidation must fail and the raw text must be preserved for the operator. + run bash -c " + sleep() { :; } + export -f sleep + + kubectl() { + case \"\$*\" in + \"get deployment\"*\"-o json\"*) + echo '{\"spec\":{\"replicas\":1},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods -n test-namespace -l deployment_id=deploy-456 -o jsonpath\"*) + echo 'd-scope-123-deploy-456-abc-hhshq' + ;; + \"get events\"*\"Pod\"*) + echo '{\"items\":[ + {\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc-hhshq\"},\"reason\":\"Unhealthy\",\"message\":\"some brand-new K8s probe format we cannot parse 1\"}, + {\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc-hhshq\"},\"reason\":\"Unhealthy\",\"message\":\"another unknown probe format 2\"} + ]}' + ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export 
SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=10 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash '$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + # Both original messages must appear verbatim + assert_contains "$output" "some brand-new K8s probe format we cannot parse 1" + assert_contains "$output" "another unknown probe format 2" + # The consolidated header must NOT appear because parsing failed + if [[ "$output" == *"probe failing"* ]]; then + echo "Expected fallback path to NOT emit the 'probe failing' header" + echo "Actual: $output" + return 1 + fi +} + +@test "wait_deployment_active: translates Unhealthy HTTP statuscode into human line during polling" { + run bash -c " + sleep() { :; } + export -f sleep + + kubectl() { + case \"\$*\" in + \"get deployment\"*\"-o json\"*) + echo '{\"spec\":{\"replicas\":1},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods -n test-namespace -l deployment_id=deploy-456 -o jsonpath\"*) + echo 'd-scope-123-deploy-456-abc' + ;; + \"get events\"*\"Pod\"*) + echo '{\"items\":[{\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc\"},\"reason\":\"Unhealthy\",\"message\":\"Startup probe failed: HTTP probe failed with statuscode: 502\"}]}' + ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=10 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash '$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + assert_contains "$output" "Startup probe" + assert_contains "$output" "HTTP 502" +} + +# ============================================================================= +# Latest 
Timestamp Initialization +# ============================================================================= +@test "wait_deployment_active: skips K8s events older than script start time" { + # An event from 2020 must be filtered out because LATEST_TIMESTAMP is initialized + # to now() — prevents stale events from previous workflow retries leaking through. + kubectl() { + case "$*" in + "get deployment"*"-o json"*) + echo '{ + "spec": {"replicas": 3}, + "status": { + "availableReplicas": 3, + "updatedReplicas": 3, + "readyReplicas": 3 + } + }' + ;; + "get pods"*) + echo "" + ;; + "get events"*"Deployment"*) + # A very old event that should be suppressed + echo '{"items":[{"effectiveTimestamp":"2020-01-01T00:00:00Z","type":"Warning","involvedObject":{"kind":"Pod","name":"d-scope-123-deploy-456-abc"},"reason":"Unhealthy","message":"old stale warning"}]}' + ;; + "get events"*) + echo '{"items":[]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../wait_deployment_active" + + [ "$status" -eq 0 ] + # The 2020 event must not appear in output + if [[ "$output" == *"old stale warning"* ]]; then + echo "Expected output to NOT contain stale 2020 warning" + echo "Actual: $output" + return 1 + fi + if [[ "$output" == *"2020-01-01T00:00:00Z"* ]]; then + echo "Expected output to NOT contain stale 2020 timestamp" + echo "Actual: $output" + return 1 + fi +} diff --git a/k8s/deployment/wait_deployment_active b/k8s/deployment/wait_deployment_active index c242b03f..478e4215 100755 --- a/k8s/deployment/wait_deployment_active +++ b/k8s/deployment/wait_deployment_active @@ -1,12 +1,63 @@ #!/bin/bash +# Load probe helpers without firing the diagnostic main of the hints script. +PRINT_HINTS_LIB_ONLY=true source "$SERVICE_PATH/deployment/print_failed_deployment_hints" +unset PRINT_HINTS_LIB_ONLY + +# Try to print one consolidated line for a pod's grouped Unhealthy events. 
+# Returns non-zero if any message cannot be parsed or if no probe kind was +# detected — callers fall back to log_unhealthy_raw to preserve the original +# text instead of silently dropping events. +log_unhealthy_group() { + local ts="$1" pod_name="$2" messages_concat="$3" + local kind="" path="" modes="" + local msg parsed k p m short path_suffix + + while IFS= read -r msg; do + [ -z "$msg" ] && continue + parsed=$(parse_probe_message "$msg" 2>/dev/null) || return 1 + [ -z "$parsed" ] && return 1 + IFS='|' read -r k p m <<< "$parsed" + [ -n "$k" ] && kind="$k" + [ -n "$p" ] && path="$p" + if [[ "$modes" != *"$m"* ]]; then + if [ -z "$modes" ]; then + modes="$m" + else + modes="$modes, $m" + fi + fi + done < <(printf '%s\n' "$messages_concat" | tr '\001' '\n') + + [ -z "$kind" ] && return 1 + + short=$(short_pod_name "$pod_name" 2>/dev/null) || short="$pod_name" + path_suffix="" + [ -n "$path" ] && path_suffix=" on $path" + log warn "$ts [Warning] Pod/$short ${kind} probe failing${path_suffix} — ${modes}" + return 0 +} + +# Fallback: emit one raw warning line per original message in the group. +log_unhealthy_raw() { + local ts="$1" pod_name="$2" messages_concat="$3" + local msg + while IFS= read -r msg; do + [ -z "$msg" ] && continue + log warn "$ts [Warning] Pod/$pod_name: Unhealthy - $msg" + done < <(printf '%s\n' "$messages_concat" | tr '\001' '\n') +} + MAX_ITERATIONS=$(( TIMEOUT / 10 )) K8S_DEPLOYMENT_NAME="d-$SCOPE_ID-$DEPLOYMENT_ID" iteration=0 -LATEST_TIMESTAMP="" +LATEST_TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ") SKIP_DEPLOYMENT_STATUS_CHECK="${SKIP_DEPLOYMENT_STATUS_CHECK:=false}" +HEARTBEAT_INTERVAL=$(( MAX_ITERATIONS / 10 )) +[ "$HEARTBEAT_INTERVAL" -lt 1 ] && HEARTBEAT_INTERVAL=1 + log debug "🔍 Waiting for deployment '$K8S_DEPLOYMENT_NAME' to become active..." 
log debug "📋 Namespace: $K8S_NAMESPACE" log debug "📋 Timeout: ${TIMEOUT}s (max $MAX_ITERATIONS iterations)" @@ -25,7 +76,7 @@ while true; do fi log debug "📡 Checking deployment status (attempt $iteration/$MAX_ITERATIONS)..." - D_STATUS=$(np deployment read --id $DEPLOYMENT_ID --api-key $NP_API_KEY --query .status 2>&1) || { + D_STATUS=$(np deployment read --id "$DEPLOYMENT_ID" --api-key "$NP_API_KEY" --query .status 2>&1) || { log error " ❌ Failed to read deployment status" log error "📋 NP CLI error: $D_STATUS" exit 1 @@ -39,6 +90,7 @@ while true; do if [ "$SKIP_DEPLOYMENT_STATUS_CHECK" != true ]; then if [[ $D_STATUS != "running" && $D_STATUS != "waiting_for_instances" ]]; then log error " ❌ Deployment is no longer running (status: $D_STATUS)" + source "$SERVICE_PATH/deployment/print_failed_deployment_hints" exit 1 fi fi @@ -60,6 +112,11 @@ while true; do break fi + if [ "$iteration" -eq 1 ] || [ $(( iteration % HEARTBEAT_INTERVAL )) -eq 0 ]; then + elapsed_s=$(( iteration * 10 )) + log info "⏳ Still waiting — Ready: $ready/$desired, Available: $current/$desired (attempt $iteration/$MAX_ITERATIONS, ${elapsed_s}s elapsed)" + fi + POD_SELECTOR="deployment_id=${DEPLOYMENT_ID}" POD_NAMES=$(kubectl get pods -n $K8S_NAMESPACE -l $POD_SELECTOR -o jsonpath='{.items[*].metadata.name}') # Get events for the deployment first @@ -89,26 +146,50 @@ while true; do # Find the newest timestamp in all events NEWEST_TIMESTAMP=$(echo "$PROCESSED_EVENTS" | jq -r '.items | map(.effectiveTimestamp) | max // empty') - # Process events with jq, showing only events newer than what we've seen - # Output format: TYPEmessage — so we can route Warning events to log warn - NEW_EVENTS=$(echo "$PROCESSED_EVENTS" | jq -r --arg timestamp "$LATEST_TIMESTAMP" ' + # Non-Unhealthy events: emit one line each (current behavior). 
+ OTHER_EVENTS=$(echo "$PROCESSED_EVENTS" | jq -r --arg timestamp "$LATEST_TIMESTAMP" ' .items | sort_by(.effectiveTimestamp) | .[] | select($timestamp == "" or (.effectiveTimestamp > $timestamp)) | - "\(.type)\t\(.effectiveTimestamp) [\(.type)] \(.involvedObject.kind)/\(.involvedObject.name): \(.reason) - \(.message)" + select(.reason != "Unhealthy") | + "\(.type)\t\(.effectiveTimestamp)\t\(.involvedObject.kind)\t\(.involvedObject.name)\t\(.reason)\t\((.message // "") | gsub("[\t\n]"; " "))" ') - # If we have new events, show them and update the timestamp - if [ ! -z "$NEW_EVENTS" ]; then - while IFS=$'\t' read -r event_type event_line; do + if [ -n "$OTHER_EVENTS" ]; then + while IFS=$'\t' read -r event_type ts kind name reason message; do + short=$(short_pod_name "$name") + event_line="$ts [$event_type] $kind/$short: $reason - $message" if [ "$event_type" = "Warning" ]; then log warn "$event_line" else log debug "$event_line" fi - done <<< "$NEW_EVENTS" - # Store the newest timestamp for next iteration + done <<< "$OTHER_EVENTS" + fi + + # Unhealthy events: group by pod and consolidate every failure mode for that + # pod into a single line. Messages within a group are joined by U+0001 (SOH), + # a byte that cannot appear in K8s event text. 
+ UNHEALTHY_GROUPS=$(echo "$PROCESSED_EVENTS" | jq -r --arg timestamp "$LATEST_TIMESTAMP" ' + [.items[] + | select($timestamp == "" or (.effectiveTimestamp > $timestamp)) + | select(.reason == "Unhealthy")] + | group_by(.involvedObject.name) + | .[] + | "\((max_by(.effectiveTimestamp)).effectiveTimestamp)\t\(.[0].involvedObject.name)\t\([.[].message] | map((. // "") | gsub("[\t\n\u0001]"; " ")) | join("\u0001"))" + ') + + if [ -n "$UNHEALTHY_GROUPS" ]; then + while IFS=$'\t' read -r ts pod_name messages_concat; do + [ -z "$pod_name" ] && continue + log_unhealthy_group "$ts" "$pod_name" "$messages_concat" \ + || log_unhealthy_raw "$ts" "$pod_name" "$messages_concat" + done <<< "$UNHEALTHY_GROUPS" + fi + + # Advance cursor if any new events were processed in this iteration. + if [ -n "$OTHER_EVENTS" ] || [ -n "$UNHEALTHY_GROUPS" ]; then LATEST_TIMESTAMP="$NEWEST_TIMESTAMP" log debug "Updated timestamp to: $LATEST_TIMESTAMP" fi From 7c5bb6483a80e1b1cb76a02b37fb7a647332a214 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Thu, 21 May 2026 13:02:50 -0300 Subject: [PATCH 31/56] docs: add design spec for CLIEN-781 memory & CPU limits --- ...5-21-clien-781-memory-cpu-limits-design.md | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md diff --git a/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md b/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md new file mode 100644 index 00000000..c5cbaa0c --- /dev/null +++ b/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md @@ -0,0 +1,163 @@ +# CLIEN-781 — Configurable CPU & RAM limits for k8s scope + +Status: design approved (2026-05-21) +Ticket: https://nullplatform.atlassian.net/browse/CLIEN-781 +Client: Spin +Assignee: Federico Maleh + +## Context + +Today the k8s scope exposes two capabilities — `ram_memory` and `cpu_millicores` — that are used as **both** the Kubernetes request 
and the Kubernetes limit. The Spin team needs to decouple them so that limits can be set higher than requests when desired, while keeping the default behavior unchanged for existing scopes. + +The risk that drives the UI shape: a memory `limit > request` increases the chance that the pod is OOM-killed or evicted when the node comes under memory pressure (the kubelet and the kernel OOM killer do this, not the scheduler). So the memory limit is a sharp tool that should be hidden behind an "advanced" surface, not the main form. + +## Goals + +1. Add `cpu_millicores_limit` and `ram_memory_limit` as optional capabilities. +2. Keep the main form intact — `ram_memory` (request) stays at the top, untouched. +3. Group the new fields with the existing `cpu_millicores` in a renamed `Resources` tab inside the collapsible "ADVANCED" categorization. +4. Validate `limit >= request` at the JSON schema layer. +5. Be backwards compatible: missing or null limit ⇒ fall back to the request value, matching today's render. + +## Non-goals + +- No change to `ram_memory` or `cpu_millicores` themselves (same field types, same defaults). +- No cross-scope validation. +- No docsite update in this ticket (separate PR if requested). +- No CLI/API change beyond what naturally happens by adding properties to the scope spec. + +## UI design + +### Form layout (after the change) + +``` +Main form +├─ RAM Memory (request, dropdown — unchanged) +└─ Visibility + +▼ ADVANCED +├─ Resources ← renamed from "Processor" +│ ├─ CPU Millicores (request — existing) +│ ├─ CPU Millicores Limit ← NEW (optional integer) +│ └─ RAM Memory Limit ← NEW (dropdown with "Same as request") +├─ Size & Scaling +├─ Exposed Ports +├─ Scheduled Stop +├─ Protocol +├─ Continuous deployment +└─ Health Check +``` + +Asymmetry between RAM and CPU is intentional: RAM request stays in the main form (everyone tunes it), RAM limit lives in `Resources` (sharp tool). CPU request and CPU limit both live in `Resources` (CPU was already advanced). 
+ +### Tab rename rationale + +`Resources` follows Kubernetes vocabulary (`resources: requests/limits`) and is generic enough to host both CPU and memory tuning. Alternatives considered (`Compute`, `Compute & Limits`) were rejected as less standard. + +## Schema changes — `k8s/specs/service-spec.json.tpl` + +### New properties (siblings of the existing ones) + +```json +"cpu_millicores_limit": { + "type": ["integer", "null"], + "title": "CPU Millicores Limit", + "default": null, + "maximum": 4000, + "minimum": { "$data": "1/cpu_millicores" }, + "description": "Maximum CPU the container can use. Leave empty to use the same value as the request." +}, +"ram_memory_limit": { + "type": ["integer", "null"], + "title": "RAM Memory Limit", + "default": null, + "oneOf": [ + { "const": null, "title": "Same as request" }, + { "const": 64, "title": "64 MB" }, + { "const": 128, "title": "128 MB" }, + { "const": 256, "title": "256 MB" }, + { "const": 512, "title": "512 MB" }, + { "const": 1024, "title": "1 GB" }, + { "const": 2048, "title": "2 GB" }, + { "const": 4096, "title": "4 GB" }, + { "const": 8192, "title": "8 GB" }, + { "const": 16384, "title": "16 GB" } + ], + "minimum": { "$data": "1/ram_memory" }, + "description": "Maximum memory the container can use. Setting this higher than the request increases OOMKill risk." +} +``` + +Neither property is added to the `required` array of `attributes.schema` — both are optional. + +### uiSchema changes + +Two edits in the existing `Categorization` block: + +1. Change `"label": "Processor"` → `"label": "Resources"`. +2. 
Add two `Control` entries inside that category's `elements`: + +```json +{ + "type": "Category", + "label": "Resources", + "elements": [ + { "type": "Control", "label": "CPU Millicores", "scope": "#/properties/cpu_millicores" }, + { "type": "Control", "label": "CPU Millicores Limit", "scope": "#/properties/cpu_millicores_limit" }, + { "type": "Control", "label": "RAM Memory Limit", "scope": "#/properties/ram_memory_limit" } + ] +} +``` + +No SHOW/HIDE rules are needed — the "Same as request" option (RAM) and empty value (CPU) act as the no-op state. + +## Validation + +`minimum` with a `$data` reference (a validator extension such as Ajv's `$data` option, not part of core JSON Schema) points at the sibling request field. JSON Schema only applies `minimum` to numeric instances, so `null` (or missing) values skip the check naturally — no `if/then` block required. + +The pattern matches the precedent already in this spec: +`health_check.period_seconds.exclusiveMinimum.$data = "1/timeout_seconds"`. + +## Render logic in the deployment template + +The k8s deployment manifest (currently rendering both request and limit from the same capability) must use the new fields with a jq `// fallback`: + +```bash +CPU_REQ=$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores') +CPU_LIM=$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit // .scope.capabilities.cpu_millicores') + +RAM_REQ=$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory') +RAM_LIM=$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit // .scope.capabilities.ram_memory') +``` + +`// .scope.capabilities.cpu_millicores` evaluates to the request value when the limit is `null` or missing, giving exactly the backward compatibility the ticket asks for. + +The implementation plan will locate the exact file(s) under `k8s/deployment/` that render `resources:` and apply this change. 
+ +## Backwards compatibility + +| Scenario | Behavior | +|---|---| +| Existing scope, no new properties in DB | jq fallback ⇒ limit = request ⇒ identical manifest to today | +| New scope, user does not touch limits | Defaults are `null` ⇒ same as above | +| New scope, user picks a higher limit | Manifest renders the explicit limit; schema validates `limit ≥ request` | +| User tries `limit < request` | JSON schema rejects via `$data` minimum before the workflow runs | + +No data migration needed. + +## Testing plan (high-level) + +- **BATS unit tests** for the deployment script: cover the four matrix cells (limit set / limit null, for both CPU and RAM), asserting the rendered `resources:` block. +- **JSON schema validation tests** (if a test harness exists for the spec): assert that `limit < request` is rejected and `limit >= request` is accepted, including the `null` case. +- **Manual smoke** in a dev environment after the implementation lands. + +The testing detail belongs to the implementation plan (writing-plans), not this design doc. + +## Open questions + +- Exact deployment template file location and templating engine (gomplate vs helm vs raw bash + jq) — to be confirmed at implementation time. The render logic above is engine-agnostic in spirit but the syntax will be adapted. + +## Out of scope / follow-ups + +- Docsite documentation (under `~/nullplatform/apps/docsite/`) — separate ticket if Spin needs it user-facing. +- Symmetric treatment for other resource dimensions (ephemeral storage, GPUs) — not requested. 
From feb746a39b9652c53c6d1495962f0c6b00c6dc53 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Thu, 21 May 2026 13:10:52 -0300 Subject: [PATCH 32/56] docs: add implementation plan for CLIEN-781 memory & CPU limits --- .../2026-05-21-clien-781-memory-cpu-limits.md | 525 ++++++++++++++++++ 1 file changed, 525 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-21-clien-781-memory-cpu-limits.md diff --git a/docs/superpowers/plans/2026-05-21-clien-781-memory-cpu-limits.md b/docs/superpowers/plans/2026-05-21-clien-781-memory-cpu-limits.md new file mode 100644 index 00000000..f954d73b --- /dev/null +++ b/docs/superpowers/plans/2026-05-21-clien-781-memory-cpu-limits.md @@ -0,0 +1,525 @@ +# CLIEN-781 — Memory & CPU Limits Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add optional `cpu_millicores_limit` and `ram_memory_limit` capabilities to the k8s scope so the Spin client can set Kubernetes `resources.limits` independently from `resources.requests`, with safe back-compat defaults. + +**Architecture:** Add two new optional properties to the k8s scope spec. Normalize them inside `build_context` (limit defaults to request when null/missing) so the deployment template stays trivial. Render the normalized values into the application container's `resources.limits` while keeping `resources.requests` bound to the original `cpu_millicores` / `ram_memory` fields. + +**Tech Stack:** JSON Schema (with JSONForms uiSchema), bash + jq for context normalization, gomplate for template rendering, BATS for tests. 
+ +**Spec:** [`docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md`](../specs/2026-05-21-clien-781-memory-cpu-limits-design.md) + +--- + +## File Structure + +**Modified files:** + +- `k8s/specs/service-spec.json.tpl` — add two `properties` and update the `Categorization`/`Category` to rename "Processor" → "Resources" and add two new `Control` entries. +- `k8s/deployment/build_context` — add a `normalize_capability_limits` function that mutates `$CONTEXT` to fill `.scope.capabilities.cpu_millicores_limit` and `.scope.capabilities.ram_memory_limit` with the request value when null/missing. Call it before the final context write. +- `k8s/deployment/templates/deployment.yaml.tpl` — application container only (lines 313–319): keep `requests.cpu/memory` bound to `cpu_millicores` / `ram_memory`, change `limits.cpu/memory` to read `cpu_millicores_limit` / `ram_memory_limit`. Sidecars (lines 148–153, 201–206, 255–260) are NOT touched — they use `container_cpu_in_millicores` / `container_memory_in_memory` from a ConfigMap. + +**New tests:** + +- `k8s/deployment/tests/build_context.bats` — add a section for `normalize_capability_limits` covering the four matrix cells (limit set / limit null, for CPU and RAM) plus the "field absent" case. +- `k8s/deployment/tests/deployment_template_shape.bats` (new file) — grep-based structural assertions that the application container `resources` block uses the right field for request vs limit. Mirrors `tests/ingress_template_shape.bats`. + +**Not modified:** sidecar resource blocks, CLI, docsite, API spec. + +--- + +## Task 1: Add `cpu_millicores_limit` and `ram_memory_limit` properties to the JSON schema + +**Files:** +- Modify: `k8s/specs/service-spec.json.tpl` (properties block, lines 485–492 area for CPU; lines 315–358 area for RAM) + +There is no JSON-schema test harness in this repo, so this task has no automated test. 
The schema is validated implicitly by the deployment workflow and by manual `jq` sanity checks in step 2. + +- [ ] **Step 1: Add the two new properties to `attributes.schema.properties`** + +After the existing `cpu_millicores` property block (end at line 492), add `cpu_millicores_limit`: + +```json +, +"cpu_millicores_limit":{ + "type":["integer","null"], + "title":"CPU Millicores Limit", + "default":null, + "maximum":4000, + "minimum":{ + "$data":"1/cpu_millicores" + }, + "description":"Maximum CPU the container can use (in millicores). Leave empty to use the same value as the request." +} +``` + +After the existing `ram_memory` property block (end at line 358), add `ram_memory_limit`: + +```json +, +"ram_memory_limit":{ + "type":["integer","null"], + "oneOf":[ + {"const":null, "title":"Same as request"}, + {"const":64, "title":"64 MB"}, + {"const":128, "title":"128 MB"}, + {"const":256, "title":"256 MB"}, + {"const":512, "title":"512 MB"}, + {"const":1024, "title":"1 GB"}, + {"const":2048, "title":"2 GB"}, + {"const":4096, "title":"4 GB"}, + {"const":8192, "title":"8 GB"}, + {"const":16384, "title":"16 GB"} + ], + "title":"RAM Memory Limit", + "default":null, + "minimum":{ + "$data":"1/ram_memory" + }, + "description":"Maximum memory the container can use (in MB). Setting this higher than the request increases the chance the scheduler kills the pod under pressure." +} +``` + +Do NOT add either field to the top-level `required` array — both stay optional. + +- [ ] **Step 2: Validate the JSON is still well-formed** + +Run: +```bash +jq empty k8s/specs/service-spec.json.tpl +``` +Expected: no output, exit code 0. + +If gomplate is available locally, also confirm the template renders to valid JSON: +```bash +NRN="nrn:test" gomplate -f k8s/specs/service-spec.json.tpl | jq empty +``` +Expected: no output, exit code 0. 
+ +- [ ] **Step 3: Commit** + +```bash +git add k8s/specs/service-spec.json.tpl +git commit -m "feat: add cpu_millicores_limit and ram_memory_limit properties to k8s scope spec" +``` + +--- + +## Task 2: Rename "Processor" → "Resources" and add the limit Controls to the uiSchema + +**Files:** +- Modify: `k8s/specs/service-spec.json.tpl` (uiSchema `Category` block, lines 46–55) + +No automated test — uiSchema is rendered by the frontend. We validate by grep-based assertion in step 2 and visual smoke later. + +- [ ] **Step 1: Rename the Category label and add two Controls** + +Locate the `Category` whose label is `"Processor"` (line 47). Replace the whole block (lines 46–55) with: + +```json +{ + "type":"Category", + "label":"Resources", + "elements":[ + { + "type":"Control", + "label":"CPU Millicores", + "scope":"#/properties/cpu_millicores" + }, + { + "type":"Control", + "label":"CPU Millicores Limit", + "scope":"#/properties/cpu_millicores_limit" + }, + { + "type":"Control", + "label":"RAM Memory Limit", + "scope":"#/properties/ram_memory_limit" + } + ] +} +``` + +- [ ] **Step 2: Sanity-check the uiSchema is well-formed and has the expected shape** + +Run: +```bash +jq -e ' + .attributes.schema.uiSchema + | .. | objects | select(.label? == "Resources") + | .elements | map(.scope) as $scopes + | ($scopes | length) == 3 + and ($scopes | index("#/properties/cpu_millicores") != null) + and ($scopes | index("#/properties/cpu_millicores_limit") != null) + and ($scopes | index("#/properties/ram_memory_limit") != null) +' k8s/specs/service-spec.json.tpl >/dev/null && echo OK +``` +Expected: `OK`. + +Also confirm "Processor" is gone: +```bash +! grep -q '"Processor"' k8s/specs/service-spec.json.tpl && echo OK +``` +Expected: `OK`. 
+ +- [ ] **Step 3: Commit** + +```bash +git add k8s/specs/service-spec.json.tpl +git commit -m "feat: rename Processor tab to Resources and surface CPU/RAM limit controls" +``` + +--- + +## Task 3: Add `normalize_capability_limits` to `build_context` (TDD) + +**Files:** +- Modify: `k8s/deployment/build_context` +- Modify: `k8s/deployment/tests/build_context.bats` + +This is the back-compat heart of the change. The function takes `$CONTEXT` (JSON) and fills `.scope.capabilities.cpu_millicores_limit` and `.scope.capabilities.ram_memory_limit` with the corresponding request value when the field is `null` or missing. Existing values pass through unchanged. + +- [ ] **Step 1: Write failing tests in `tests/build_context.bats`** + +Append at the end of `k8s/deployment/tests/build_context.bats`: + +```bash +# ============================================================================= +# normalize_capability_limits Function Tests (CLIEN-781) +# Fills in *_limit with the corresponding request value when null or missing, +# leaves explicit values untouched. 
+# ============================================================================= + +setup_normalize_limits_fn() { + eval "$(sed -n '/^normalize_capability_limits()/,/^}/p' "$PROJECT_ROOT/k8s/deployment/build_context")" +} + +@test "normalize_capability_limits: fills CPU limit from request when limit is absent" { + setup_normalize_limits_fn + local in='{"scope":{"capabilities":{"cpu_millicores":500,"ram_memory":1024,"ram_memory_limit":2048}}}' + local out + out=$(normalize_capability_limits "$in") + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" +} + +@test "normalize_capability_limits: fills RAM limit from request when limit is absent" { + setup_normalize_limits_fn + local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":700,"ram_memory":1024}}}' + local out + out=$(normalize_capability_limits "$in") + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "normalize_capability_limits: fills both limits when both are absent" { + setup_normalize_limits_fn + local in='{"scope":{"capabilities":{"cpu_millicores":500,"ram_memory":1024}}}' + local out + out=$(normalize_capability_limits "$in") + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "normalize_capability_limits: fills both limits when both are explicit null" { + setup_normalize_limits_fn + local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":null,"ram_memory":1024,"ram_memory_limit":null}}}' + local out + out=$(normalize_capability_limits "$in") + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "normalize_capability_limits: preserves explicit non-null limits" { + setup_normalize_limits_fn + local 
in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":2000,"ram_memory":1024,"ram_memory_limit":4096}}}' + local out + out=$(normalize_capability_limits "$in") + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "2000" + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "4096" +} +``` + +- [ ] **Step 2: Run the new tests, confirm they fail** + +Run: +```bash +bats k8s/deployment/tests/build_context.bats -f normalize_capability_limits +``` +Expected: 5 failures, message about `normalize_capability_limits: command not found` (or similar — function does not exist yet). + +- [ ] **Step 3: Implement `normalize_capability_limits` in `build_context`** + +Open `k8s/deployment/build_context`. Above the `validate_status()` function (search for `^validate_status\(\)`), insert: + +```bash +# Fill in *_limit capability fields with the corresponding request value when +# the limit is missing or explicitly null. Idempotent. CLIEN-781. +normalize_capability_limits() { + echo "$1" | jq ' + .scope.capabilities.cpu_millicores_limit = (.scope.capabilities.cpu_millicores_limit // .scope.capabilities.cpu_millicores) + | .scope.capabilities.ram_memory_limit = (.scope.capabilities.ram_memory_limit // .scope.capabilities.ram_memory) + ' +} +``` + +Then wire it into the final context assembly. Find the block ending at line 314 (the big `jq '. + { ... }')` invocation around lines 285–314 that produces the final `$CONTEXT`). Immediately after that block (i.e., right before the `DEPLOYMENT_ID=$(echo "$CONTEXT" | jq -r '.deployment.id')` line at 316), add: + +```bash +CONTEXT=$(normalize_capability_limits "$CONTEXT") +``` + +- [ ] **Step 4: Run the new tests, confirm they pass** + +Run: +```bash +bats k8s/deployment/tests/build_context.bats -f normalize_capability_limits +``` +Expected: 5 tests pass. 
+ +- [ ] **Step 5: Run the full build_context test suite to ensure no regressions** + +Run: +```bash +bats k8s/deployment/tests/build_context.bats +``` +Expected: all tests pass (baseline of this file is currently green per the existing CI; we are only adding tests). + +- [ ] **Step 6: Commit** + +```bash +git add k8s/deployment/build_context k8s/deployment/tests/build_context.bats +git commit -m "feat: normalize cpu/ram limit capabilities to request value when unset" +``` + +--- + +## Task 4: Render limits from normalized fields in the application container (TDD via template-shape test) + +**Files:** +- Create: `k8s/deployment/tests/deployment_template_shape.bats` +- Modify: `k8s/deployment/templates/deployment.yaml.tpl` (lines 313–319 only — the application container, NOT the sidecars) + +We assert the template shape with grep (same approach as `ingress_template_shape.bats`). End-to-end rendering through gomplate is exercised by the existing build pipeline; the shape test catches regressions like accidentally rebinding `limits.cpu` back to `cpu_millicores`. + +- [ ] **Step 1: Write the failing template-shape test** + +Create `k8s/deployment/tests/deployment_template_shape.bats`: + +```bash +#!/usr/bin/env bats +# ============================================================================= +# Structural tests for the deployment template. +# Verifies the application container's resources block uses the right +# capability for request vs limit. CLIEN-781. +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + export TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/deployment.yaml.tpl" +} + +# Slice the file from "name: application" to the next container header, +# isolating the application container's block from the sidecars (which keep +# using container_cpu_in_millicores / container_memory_in_memory). 
+app_container_block() { + awk ' + /^[[:space:]]+- name: application[[:space:]]*$/ { in_app=1 } + in_app { print } + /^[[:space:]]+terminationMessagePolicy:/ && in_app { exit } + ' "$TEMPLATE" +} + +@test "deployment template: application container limits.cpu uses cpu_millicores_limit" { + block=$(app_container_block) + echo "$block" | grep -E 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores_limit[[:space:]]*\}\}m' >/dev/null +} + +@test "deployment template: application container limits.memory uses ram_memory_limit" { + block=$(app_container_block) + echo "$block" | grep -E 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory_limit[[:space:]]*\}\}Mi' >/dev/null +} + +@test "deployment template: application container requests.cpu still uses cpu_millicores" { + block=$(app_container_block) + echo "$block" | grep -E 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores[[:space:]]*\}\}m' >/dev/null +} + +@test "deployment template: application container requests.memory still uses ram_memory" { + block=$(app_container_block) + echo "$block" | grep -E 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory[[:space:]]*\}\}Mi' >/dev/null +} + +@test "deployment template: sidecars still use container_cpu_in_millicores / container_memory_in_memory" { + # Sidecars are everything BEFORE the application container block. + before=$(awk '/^[[:space:]]+- name: application[[:space:]]*$/ {exit} {print}' "$TEMPLATE") + echo "$before" | grep -F '{{ .container_cpu_in_millicores }}m' >/dev/null + echo "$before" | grep -F '{{ .container_memory_in_memory }}Mi' >/dev/null + # And sidecars must NOT have been switched to the new fields. + ! echo "$before" | grep -F 'cpu_millicores_limit' >/dev/null + ! 
echo "$before" | grep -F 'ram_memory_limit' >/dev/null +} +``` + +- [ ] **Step 2: Run the new tests, confirm they fail** + +Run: +```bash +bats k8s/deployment/tests/deployment_template_shape.bats +``` +Expected: at least the first two tests fail (limits.cpu / limits.memory still pointing at `cpu_millicores` / `ram_memory` — request fields). + +- [ ] **Step 3: Edit the application container's resource block** + +Open `k8s/deployment/templates/deployment.yaml.tpl`. Locate lines 313–319 (the `- name: application` container's `resources` block). Replace those exact lines with: + +```yaml + resources: + limits: + cpu: {{ .scope.capabilities.cpu_millicores_limit }}m + memory: {{ .scope.capabilities.ram_memory_limit }}Mi + requests: + cpu: {{ .scope.capabilities.cpu_millicores }}m + memory: {{ .scope.capabilities.ram_memory }}Mi +``` + +Do NOT touch the sidecar `resources:` blocks at lines 148–153, 201–206, or 255–260. + +- [ ] **Step 4: Run the template-shape tests, confirm they pass** + +Run: +```bash +bats k8s/deployment/tests/deployment_template_shape.bats +``` +Expected: all 5 tests pass. + +- [ ] **Step 5: Commit** + +```bash +git add k8s/deployment/templates/deployment.yaml.tpl k8s/deployment/tests/deployment_template_shape.bats +git commit -m "feat: render application container limits from normalized capability fields" +``` + +--- + +## Task 5: End-to-end smoke (manual) + +This is a sanity check, not a test — the project has no automated gomplate-render harness for `deployment.yaml.tpl`. Skip if `gomplate` is not installed locally. 
+ +- [ ] **Step 1: Render the deployment template with a sample CONTEXT and inspect the output** + +```bash +cat > /tmp/clien781_ctx.json <<'JSON' +{ + "scope": { + "id": "scope-test", + "capabilities": { + "cpu_millicores": 500, + "cpu_millicores_limit": 1000, + "ram_memory": 1024, + "ram_memory_limit": 2048, + "health_check": {"enabled": true, "type": "HTTP", "path": "/health", "initial_delay_seconds": 10}, + "additional_ports": [] + } + }, + "deployment": {"id": "deploy-test"}, + "asset": {"url": "example.com/app:1.0"}, + "container_cpu_in_millicores": "93", + "container_memory_in_memory": "64", + "main_http_port": 8080, + "traffic_image": "example.com/traffic:1.0", + "blue_replicas": "0", + "green_replicas": "1", + "total_replicas": "1", + "blue_deployment_id": "", + "pull_secrets": [], + "pdb_enabled": "false", + "pdb_max_unavailable": "1", + "service_account_name": "default", + "traffic_manager_config_map": "tm-config", + "blue_additional_port_services": {} +} +JSON + +gomplate -c .=/tmp/clien781_ctx.json -f k8s/deployment/templates/deployment.yaml.tpl \ + | sed -n '/name: application/,$p' \ + | grep -A6 'resources:' \ + | sed -n '1,7p' +``` + +Expected output should include: +``` + resources: + limits: + cpu: 1000m + memory: 2048Mi + requests: + cpu: 500m + memory: 1024Mi +``` + +- [ ] **Step 2: Render again with the limit fields omitted (back-compat case)** + +Edit `/tmp/clien781_ctx.json` and remove `cpu_millicores_limit` and `ram_memory_limit`. Then re-run the same `gomplate` pipeline. + +**Expected:** gomplate errors on the missing keys. This step illustrates that the back-compat path MUST go through `build_context` (which normalizes), not raw template rendering. The build pipeline always runs `build_context` first, so in production this is fine. The manual smoke here just confirms that the normalized context produces the right output; the "missing keys" path is covered by the BATS tests in Task 3. 
+ +- [ ] **Step 3: Clean up** + +```bash +rm /tmp/clien781_ctx.json +``` + +--- + +## Task 6: Run the full k8s test suite and push the branch + +- [ ] **Step 1: Run all k8s BATS tests in batches** (per the project memory rule about avoiding BATS temp-dir collisions) + +Run: +```bash +bats k8s/deployment/tests/build_context.bats +bats k8s/deployment/tests/build_deployment.bats +bats k8s/deployment/tests/deployment_template_shape.bats +bats k8s/deployment/tests/ingress_template_shape.bats +bats k8s/deployment/tests/apply_templates.bats +``` +Expected: all green. + +- [ ] **Step 2: Confirm git status is clean and on the right branch** + +Run: +```bash +git status +git log --oneline beta..HEAD +``` +Expected: clean tree; four feature commits (Tasks 1–4) on top of beta. + +- [ ] **Step 3: Push the branch** + +Run: +```bash +git push -u origin feature/clien-781-memory-cpu-limits +``` + +- [ ] **Step 4: Run the quality-gate skill before opening a PR** + +Per the user's global `CLAUDE.md`, run `quality-gate` after non-trivial coding tasks and before claiming work is done. The skill orchestrates code-review, security audit, and simplification checks. + +--- + +## Out of scope (for follow-up tickets) + +- Docsite documentation for the new capabilities. +- CLI / OpenAPI changes — none required, the capability schema is consumed dynamically. +- Symmetric treatment for other resource dimensions (ephemeral storage, GPUs). +- Sidecar resource overrides — sidecars keep using `container_cpu_in_millicores` / `container_memory_in_memory` from the ConfigMap. + +--- + +## Self-review checklist (done by plan author) + +- [x] **Spec coverage:** every section of the spec (schema, uiSchema, render, back-compat, validation, testing) maps to a task. +- [x] **No placeholders:** every step has concrete code, paths, and expected output. 
+- [x] **Type consistency:** `normalize_capability_limits` is referenced consistently; field names match the schema (`cpu_millicores_limit`, `ram_memory_limit`). +- [x] **Scope:** single coherent change, one branch, one PR. From 3916ce83b77c4aadb316f6b9335812d90cad846d Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Thu, 21 May 2026 13:15:54 -0300 Subject: [PATCH 33/56] feat: add cpu_millicores_limit and ram_memory_limit properties to k8s scope spec --- k8s/specs/service-spec.json.tpl | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/k8s/specs/service-spec.json.tpl b/k8s/specs/service-spec.json.tpl index 656e641d..599de58d 100644 --- a/k8s/specs/service-spec.json.tpl +++ b/k8s/specs/service-spec.json.tpl @@ -356,6 +356,27 @@ "default":128, "description":"Amount of RAM memory to allocate to the container (in MB)" }, + "ram_memory_limit":{ + "type":["integer","null"], + "oneOf":[ + {"const":null, "title":"Same as request"}, + {"const":64, "title":"64 MB"}, + {"const":128, "title":"128 MB"}, + {"const":256, "title":"256 MB"}, + {"const":512, "title":"512 MB"}, + {"const":1024, "title":"1 GB"}, + {"const":2048, "title":"2 GB"}, + {"const":4096, "title":"4 GB"}, + {"const":8192, "title":"8 GB"}, + {"const":16384, "title":"16 GB"} + ], + "title":"RAM Memory Limit", + "default":null, + "minimum":{ + "$data":"1/ram_memory" + }, + "description":"Maximum memory the container can use (in MB). Setting this higher than the request makes the pod more likely to be evicted or OOM-killed when the node runs low on memory." + }, "visibility":{ "type":"string", "oneOf":[ @@ -490,6 +511,16 @@ "minimum":100, "description":"Amount of CPU to allocate (in millicores, 1000m = 1 CPU core)" }, + "cpu_millicores_limit":{ + "type":["integer","null"], + "title":"CPU Millicores Limit", + "default":null, + "maximum":4000, + "minimum":{ + "$data":"1/cpu_millicores" + }, + "description":"Maximum CPU the container can use (in millicores). 
Leave empty to use the same value as the request." + }, "scheduled_stop":{ "type":"object", "title":"Scheduled Stop", From 957debcbe4ef92ef256e43cef49947aa1ea52ce9 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Thu, 21 May 2026 13:16:25 -0300 Subject: [PATCH 34/56] feat: rename Processor tab to Resources and surface CPU/RAM limit controls --- k8s/specs/service-spec.json.tpl | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/k8s/specs/service-spec.json.tpl b/k8s/specs/service-spec.json.tpl index 599de58d..e75a7146 100644 --- a/k8s/specs/service-spec.json.tpl +++ b/k8s/specs/service-spec.json.tpl @@ -44,12 +44,22 @@ "elements":[ { "type":"Category", - "label":"Processor", + "label":"Resources", "elements":[ { "type":"Control", "label":"CPU Millicores", "scope":"#/properties/cpu_millicores" + }, + { + "type":"Control", + "label":"CPU Millicores Limit", + "scope":"#/properties/cpu_millicores_limit" + }, + { + "type":"Control", + "label":"RAM Memory Limit", + "scope":"#/properties/ram_memory_limit" } ] }, From 8bc5dac48372127751670d1f59367d00fdc11209 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Thu, 21 May 2026 13:18:57 -0300 Subject: [PATCH 35/56] feat: normalize cpu/ram limit capabilities to request value when unset --- k8s/deployment/build_context | 11 +++++ k8s/deployment/tests/build_context.bats | 53 +++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/k8s/deployment/build_context b/k8s/deployment/build_context index 1f357deb..638cd5e6 100755 --- a/k8s/deployment/build_context +++ b/k8s/deployment/build_context @@ -23,6 +23,15 @@ MIN_REPLICAS=$(echo "$MIN_REPLICAS" | awk '{printf "%d", ($1 == int($1) ? $1 : i DEPLOYMENT_STATUS=$(echo "$CONTEXT" | jq -r ".deployment.status") +# Fill in *_limit capability fields with the corresponding request value when +# the limit is missing or explicitly null. Idempotent. CLIEN-781. 
+normalize_capability_limits() { + echo "$1" | jq ' + .scope.capabilities.cpu_millicores_limit = (.scope.capabilities.cpu_millicores_limit // .scope.capabilities.cpu_millicores) + | .scope.capabilities.ram_memory_limit = (.scope.capabilities.ram_memory_limit // .scope.capabilities.ram_memory) + ' +} + validate_status() { local action="$1" local status="$2" @@ -313,6 +322,8 @@ CONTEXT=$(echo "$CONTEXT" | jq \ main_http_port: ($main_http_port | tonumber) }') +CONTEXT=$(normalize_capability_limits "$CONTEXT") + DEPLOYMENT_ID=$(echo "$CONTEXT" | jq -r '.deployment.id') OUTPUT_DIR="$SERVICE_PATH/output/$SCOPE_ID-$DEPLOYMENT_ID" diff --git a/k8s/deployment/tests/build_context.bats b/k8s/deployment/tests/build_context.bats index 690a8ab4..020c2e84 100644 --- a/k8s/deployment/tests/build_context.bats +++ b/k8s/deployment/tests/build_context.bats @@ -946,3 +946,56 @@ set_additional_ports() { assert_equal "$(echo "$CONTEXT" | jq -c '.scope.capabilities.additional_ports')" "[]" } + +# ============================================================================= +# normalize_capability_limits Function Tests (CLIEN-781) +# Fills in *_limit with the corresponding request value when null or missing, +# leaves explicit values untouched. 
+# ============================================================================= + +setup_normalize_limits_fn() { + eval "$(sed -n '/^normalize_capability_limits()/,/^}/p' "$PROJECT_ROOT/k8s/deployment/build_context")" +} + +@test "normalize_capability_limits: fills CPU limit from request when limit is absent" { + setup_normalize_limits_fn + local in='{"scope":{"capabilities":{"cpu_millicores":500,"ram_memory":1024,"ram_memory_limit":2048}}}' + local out + out=$(normalize_capability_limits "$in") + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" +} + +@test "normalize_capability_limits: fills RAM limit from request when limit is absent" { + setup_normalize_limits_fn + local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":700,"ram_memory":1024}}}' + local out + out=$(normalize_capability_limits "$in") + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "normalize_capability_limits: fills both limits when both are absent" { + setup_normalize_limits_fn + local in='{"scope":{"capabilities":{"cpu_millicores":500,"ram_memory":1024}}}' + local out + out=$(normalize_capability_limits "$in") + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "normalize_capability_limits: fills both limits when both are explicit null" { + setup_normalize_limits_fn + local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":null,"ram_memory":1024,"ram_memory_limit":null}}}' + local out + out=$(normalize_capability_limits "$in") + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "normalize_capability_limits: preserves explicit non-null limits" { + setup_normalize_limits_fn + local 
in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":2000,"ram_memory":1024,"ram_memory_limit":4096}}}' + local out + out=$(normalize_capability_limits "$in") + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "2000" + assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "4096" +} From f50b59ca7851c36336a2c6b65bf927c4fc377c95 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Thu, 21 May 2026 13:19:46 -0300 Subject: [PATCH 36/56] feat: render application container limits from normalized capability fields --- k8s/deployment/templates/deployment.yaml.tpl | 4 +- .../tests/deployment_template_shape.bats | 53 +++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 k8s/deployment/tests/deployment_template_shape.bats diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index 3552c483..44f58d77 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -312,8 +312,8 @@ spec: {{ end }} resources: limits: - cpu: {{ .scope.capabilities.cpu_millicores }}m - memory: {{ .scope.capabilities.ram_memory }}Mi + cpu: {{ .scope.capabilities.cpu_millicores_limit }}m + memory: {{ .scope.capabilities.ram_memory_limit }}Mi requests: cpu: {{ .scope.capabilities.cpu_millicores }}m memory: {{ .scope.capabilities.ram_memory }}Mi diff --git a/k8s/deployment/tests/deployment_template_shape.bats b/k8s/deployment/tests/deployment_template_shape.bats new file mode 100644 index 00000000..f80db60f --- /dev/null +++ b/k8s/deployment/tests/deployment_template_shape.bats @@ -0,0 +1,53 @@ +#!/usr/bin/env bats +# ============================================================================= +# Structural tests for the deployment template. +# Verifies the application container's resources block uses the right +# capability for request vs limit. CLIEN-781. 
+# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + export TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/deployment.yaml.tpl" +} + +# Slice the file from "name: application" to the next container header, +# isolating the application container's block from the sidecars (which keep +# using container_cpu_in_millicores / container_memory_in_memory). +app_container_block() { + awk ' + /^[[:space:]]+- name: application[[:space:]]*$/ { in_app=1 } + in_app { print } + /^[[:space:]]+terminationMessagePolicy:/ && in_app { exit } + ' "$TEMPLATE" +} + +@test "deployment template: application container limits.cpu uses cpu_millicores_limit" { + block=$(app_container_block) + echo "$block" | grep -E 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores_limit[[:space:]]*\}\}m' >/dev/null +} + +@test "deployment template: application container limits.memory uses ram_memory_limit" { + block=$(app_container_block) + echo "$block" | grep -E 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory_limit[[:space:]]*\}\}Mi' >/dev/null +} + +@test "deployment template: application container requests.cpu still uses cpu_millicores" { + block=$(app_container_block) + echo "$block" | grep -E 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores[[:space:]]*\}\}m' >/dev/null +} + +@test "deployment template: application container requests.memory still uses ram_memory" { + block=$(app_container_block) + echo "$block" | grep -E 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory[[:space:]]*\}\}Mi' >/dev/null +} + +@test "deployment template: sidecars still use container_cpu_in_millicores / container_memory_in_memory" { + # Sidecars are everything BEFORE the application container block. 
+ before=$(awk '/^[[:space:]]+- name: application[[:space:]]*$/ {exit} {print}' "$TEMPLATE") + echo "$before" | grep -F '{{ .container_cpu_in_millicores }}m' >/dev/null + echo "$before" | grep -F '{{ .container_memory_in_memory }}Mi' >/dev/null + # And sidecars must NOT have been switched to the new fields. + ! echo "$before" | grep -F 'cpu_millicores_limit' >/dev/null + ! echo "$before" | grep -F 'ram_memory_limit' >/dev/null +} From a40f54ab3c40d0769ed96625b9884df565333ddc Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Thu, 21 May 2026 13:26:30 -0300 Subject: [PATCH 37/56] refactor: tighten normalize_capability_limits jq + bats here-string idioms --- k8s/deployment/build_context | 8 ++--- .../tests/deployment_template_shape.bats | 35 ++++++++++--------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/k8s/deployment/build_context b/k8s/deployment/build_context index 638cd5e6..c89c7928 100755 --- a/k8s/deployment/build_context +++ b/k8s/deployment/build_context @@ -26,10 +26,10 @@ DEPLOYMENT_STATUS=$(echo "$CONTEXT" | jq -r ".deployment.status") # Fill in *_limit capability fields with the corresponding request value when # the limit is missing or explicitly null. Idempotent. CLIEN-781. 
normalize_capability_limits() { - echo "$1" | jq ' - .scope.capabilities.cpu_millicores_limit = (.scope.capabilities.cpu_millicores_limit // .scope.capabilities.cpu_millicores) - | .scope.capabilities.ram_memory_limit = (.scope.capabilities.ram_memory_limit // .scope.capabilities.ram_memory) - ' + echo "$1" | jq ' + .scope.capabilities.cpu_millicores_limit //= .scope.capabilities.cpu_millicores + | .scope.capabilities.ram_memory_limit //= .scope.capabilities.ram_memory + ' } validate_status() { diff --git a/k8s/deployment/tests/deployment_template_shape.bats b/k8s/deployment/tests/deployment_template_shape.bats index f80db60f..6f44ffba 100644 --- a/k8s/deployment/tests/deployment_template_shape.bats +++ b/k8s/deployment/tests/deployment_template_shape.bats @@ -11,9 +11,9 @@ setup() { export TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/deployment.yaml.tpl" } -# Slice the file from "name: application" to the next container header, -# isolating the application container's block from the sidecars (which keep -# using container_cpu_in_millicores / container_memory_in_memory). +# Slice the file from "name: application" up to the application container's +# terminationMessagePolicy, isolating it from the sidecars (which keep using +# container_cpu_in_millicores / container_memory_in_memory). app_container_block() { awk ' /^[[:space:]]+- name: application[[:space:]]*$/ { in_app=1 } @@ -22,32 +22,33 @@ app_container_block() { ' "$TEMPLATE" } +# Everything BEFORE the application container — the sidecar definitions. 
+sidecars_block() { + awk '/^[[:space:]]+- name: application[[:space:]]*$/ {exit} {print}' "$TEMPLATE" +} + @test "deployment template: application container limits.cpu uses cpu_millicores_limit" { - block=$(app_container_block) - echo "$block" | grep -E 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores_limit[[:space:]]*\}\}m' >/dev/null + grep -qE 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores_limit[[:space:]]*\}\}m' <<<"$(app_container_block)" } @test "deployment template: application container limits.memory uses ram_memory_limit" { - block=$(app_container_block) - echo "$block" | grep -E 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory_limit[[:space:]]*\}\}Mi' >/dev/null + grep -qE 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory_limit[[:space:]]*\}\}Mi' <<<"$(app_container_block)" } @test "deployment template: application container requests.cpu still uses cpu_millicores" { - block=$(app_container_block) - echo "$block" | grep -E 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores[[:space:]]*\}\}m' >/dev/null + grep -qE 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores[[:space:]]*\}\}m' <<<"$(app_container_block)" } @test "deployment template: application container requests.memory still uses ram_memory" { - block=$(app_container_block) - echo "$block" | grep -E 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory[[:space:]]*\}\}Mi' >/dev/null + grep -qE 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory[[:space:]]*\}\}Mi' <<<"$(app_container_block)" } @test "deployment template: sidecars still use container_cpu_in_millicores / container_memory_in_memory" { - # Sidecars are everything BEFORE the application container block. 
- before=$(awk '/^[[:space:]]+- name: application[[:space:]]*$/ {exit} {print}' "$TEMPLATE") - echo "$before" | grep -F '{{ .container_cpu_in_millicores }}m' >/dev/null - echo "$before" | grep -F '{{ .container_memory_in_memory }}Mi' >/dev/null + local sidecars + sidecars=$(sidecars_block) + grep -qF '{{ .container_cpu_in_millicores }}m' <<<"$sidecars" + grep -qF '{{ .container_memory_in_memory }}Mi' <<<"$sidecars" # And sidecars must NOT have been switched to the new fields. - ! echo "$before" | grep -F 'cpu_millicores_limit' >/dev/null - ! echo "$before" | grep -F 'ram_memory_limit' >/dev/null + ! grep -qF 'cpu_millicores_limit' <<<"$sidecars" || false # a bare '!' never trips errexit, so '|| false' makes a match fail the test + ! grep -qF 'ram_memory_limit' <<<"$sidecars" || false } From 3eff675c103a3132ee632810c2a4916d4a3c75b9 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Thu, 21 May 2026 13:28:51 -0300 Subject: [PATCH 38/56] fix: mark cpu_millicores_limit and ram_memory_limit as required for UI visibility --- .../specs/2026-05-21-clien-781-memory-cpu-limits-design.md | 2 +- k8s/specs/service-spec.json.tpl | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md b/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md index c5cbaa0c..6aef352f 100644 --- a/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md +++ b/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md @@ -88,7 +88,7 @@ Asymmetry between RAM and CPU is intentional: RAM request stays in the main form } ``` -Neither property is added to the `required` array of `attributes.schema` — both are optional. +Both properties are added to the `required` array of `attributes.schema`. This is the nullplatform UI's contract: the frontend only renders properties that appear in `required` (established during CLIEN-739). 
Defaults of `null` keep this non-breaking — existing scopes materialize the default, and `normalize_capability_limits` collapses `null` back to the request value before the deployment template renders. ### uiSchema changes diff --git a/k8s/specs/service-spec.json.tpl b/k8s/specs/service-spec.json.tpl index e75a7146..f9742bcf 100644 --- a/k8s/specs/service-spec.json.tpl +++ b/k8s/specs/service-spec.json.tpl @@ -5,11 +5,13 @@ "type":"object", "required":[ "ram_memory", + "ram_memory_limit", "visibility", "autoscaling", "health_check", "scaling_type", "cpu_millicores", + "cpu_millicores_limit", "fixed_instances", "scheduled_stop", "additional_ports", From 6856c9cbfc807453ea602656e6bdffe517629e1b Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Thu, 21 May 2026 14:35:18 -0300 Subject: [PATCH 39/56] refactor: make cpu_millicores_limit a dropdown with 'Same as request' option --- .../2026-05-21-clien-781-memory-cpu-limits-design.md | 12 ++++++++++-- k8s/deployment/tests/build_context.bats | 2 +- k8s/specs/service-spec.json.tpl | 12 ++++++++++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md b/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md index 6aef352f..24741172 100644 --- a/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md +++ b/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md @@ -63,9 +63,17 @@ Asymmetry between RAM and CPU is intentional: RAM request stays in the main form "type": ["integer", "null"], "title": "CPU Millicores Limit", "default": null, - "maximum": 4000, + "oneOf": [ + { "const": null, "title": "Same as request" }, + { "const": 100, "title": "100 m" }, + { "const": 250, "title": "250 m" }, + { "const": 500, "title": "500 m" }, + { "const": 1000, "title": "1000 m" }, + { "const": 2000, "title": "2000 m" }, + { "const": 4000, "title": "4000 m" } + ], "minimum": { "$data": "1/cpu_millicores" }, - 
"description": "Maximum CPU the container can use. Leave empty to use the same value as the request." + "description": "Maximum CPU the container can use (in millicores). Pick 'Same as request' to leave it equal to the request value." }, "ram_memory_limit": { "type": ["integer", "null"], diff --git a/k8s/deployment/tests/build_context.bats b/k8s/deployment/tests/build_context.bats index 020c2e84..0f035d6c 100644 --- a/k8s/deployment/tests/build_context.bats +++ b/k8s/deployment/tests/build_context.bats @@ -967,7 +967,7 @@ setup_normalize_limits_fn() { @test "normalize_capability_limits: fills RAM limit from request when limit is absent" { setup_normalize_limits_fn - local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":700,"ram_memory":1024}}}' + local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":1000,"ram_memory":1024}}}' local out out=$(normalize_capability_limits "$in") assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" diff --git a/k8s/specs/service-spec.json.tpl b/k8s/specs/service-spec.json.tpl index f9742bcf..3032936f 100644 --- a/k8s/specs/service-spec.json.tpl +++ b/k8s/specs/service-spec.json.tpl @@ -525,13 +525,21 @@ }, "cpu_millicores_limit":{ "type":["integer","null"], + "oneOf":[ + {"const":null, "title":"Same as request"}, + {"const":100, "title":"100 m"}, + {"const":250, "title":"250 m"}, + {"const":500, "title":"500 m"}, + {"const":1000, "title":"1000 m"}, + {"const":2000, "title":"2000 m"}, + {"const":4000, "title":"4000 m"} + ], "title":"CPU Millicores Limit", "default":null, - "maximum":4000, "minimum":{ "$data":"1/cpu_millicores" }, - "description":"Maximum CPU the container can use (in millicores). Leave empty to use the same value as the request." + "description":"Maximum CPU the container can use (in millicores). Pick 'Same as request' to leave it equal to the request value." 
}, "scheduled_stop":{ "type":"object", From fcf31d7ef0607fe84f375dfcf90756c8c190ffe6 Mon Sep 17 00:00:00 2001 From: Ignacio Boudgouste <73672747+ignacioboud@users.noreply.github.com> Date: Fri, 22 May 2026 11:21:30 -0300 Subject: [PATCH 40/56] Structured evidence + log embedding in k8s/diagnose (#181) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Capture deployments, replicasets, pod logs and describe in diagnose snapshot Extends build_context to capture the resources needed for a complete post-mortem: - deployments.json and replicasets.json scoped by deployment_id (so we see rollout state even when no pods got created) - For every pod identified as problematic (CrashLoop / OOM / ImagePullBackOff / Terminated / restartCount>0 / not-Ready / terminating), capture: - kubectl describe pod -> data/pod_describe/.txt - kubectl logs (current + --previous) for every container, including init containers, into data/pod_logs/.[.previous].log Tail size is configurable via POD_LOG_TAIL_LINES (default 500). All new files live under data/, so notify_results continues to exclude them from the backend payload. The data is consumed by downstream checks and (in a follow-up) embedded into evidence for AI consumption. Co-Authored-By: Claude Opus 4.7 (1M context) * Emit structured evidence in all diagnose checks Until now every check emitted update_check_result with an empty {} evidence payload, leaving only printf'd stdout for downstream consumers (UI / AI). With 20 checks all producing colored ANSI text, neither a frontend nor an LLM could reliably extract counts, names, exit codes, or remediation steps. 
This change defines a canonical evidence schema in diagnose_utils: { summary: "one-line human summary", severity: "critical" | "warning" | "info", affected: ["resource-names"], details: { check-specific structured data }, suggested_actions: ["actionable guidance"] } Helpers: - evidence_json(summary, severity, affected, details, actions): builds the schema with safe defaults - exit_code_meaning(code): maps 0/1/137/139/143 → human-readable, reused across crash, OOM, and termination checks - require_resources updated so the "skipped" path also emits schema evidence All 20 checks migrated. Each preserves its existing stdout output (so no regressions for users tailing logs) and additionally builds details with the data already extracted: pod names, container names, exit codes, restart counts, endpoint counts, ingress backends, certificate ARNs, etc. Severity is mapped from status (failed→critical, warning→warning, success/skipped→info), allowing the AI summarizer to prioritize what matters. Side effects: - Fixes a pre-existing bug in ingress_tls_configuration that read tls.crt from .metadata.annotations | keys[] (which never contains them, and where build_context strips .data anyway). Now relies on Secret type validation. - Adds tests/evidence_schema.bats: cross-cutting validation that every check in scope/, service/, and networking/ emits a schema-conformant payload on skipped, failed, and success paths. - Updates existing test files where they previously asserted on legacy flat evidence fields (.evidence.tested, .evidence.ready) to point at the new nested location (.evidence.details.*). Suite: 280 tests, 0 failures. Co-Authored-By: Claude Opus 4.7 (1M context) * Embed pod logs in failed-check evidence for AI post-mortem Without logs in the evidence payload, the AI summarizer would have to fetch them separately for every diagnose run. 
By the time the summary is requested, the cluster state has already moved on (rollback fired, pods churned), so live logs would be misleading. Instead, embed the relevant log slice from the build_context snapshot directly into the failing check's evidence — the AI gets self-contained post-mortem in a single payload. Helper: read_log_tail(pod, container, "current"|"previous", [lines]) reads from data/pod_logs/ and returns a JSON array of lines. Returns [] when the file is missing (most common case: no previous log because container never crashed). Truncation is configurable via EVIDENCE_LOG_TAIL_LINES (default 50, intentionally smaller than the 500-line build_context capture so the payload stays bounded). Five checks now embed logs on their failure paths: - container_crash_detection: previous_logs (CrashLoopBackOff, high-restart) and current_logs + previous_logs (terminated). Previous is where the crash output lives — current is empty during the restart loop. - memory_limits_check: previous_logs on OOMKilled. The kubelet restarts the container after the kill, so OOM-relevant output is in the previous instance. - health_probe_endpoints: container_logs (current) on every probe failure (4xx, 5xx, connection refused). Pairs the probe verdict with what the app was printing. - container_port_health: container_logs (current) on port_not_listening issues. Container is running but not bound — current logs typically show why (binding error, config mismatch). - pod_readiness: current_logs of the first container for stuck (not_ready) pods, but NOT for normally-starting pods (avoids noise during rollouts). Discriminations made deliberately: - Success paths never embed logs (keeps payload light for healthy scopes). - image_pull_status doesn't embed: if the image couldn't be pulled, there is no container and no logs. - networking/ and service/ checks don't embed: their failures are configuration issues, not application issues. 
Tests: +8 covering the helper, the embedding behavior, and a regression test asserting the success path stays log-free. Suite: 288 tests, 0 failures (10 environmental skips on macOS dev hosts where nc/timeout from coreutils aren't in PATH). Co-Authored-By: Claude Opus 4.7 (1M context) * Dedup mark_affected and replace jq-loop accumulators with bash arrays The first pass landed evidence enrichment quickly but at the cost of two duplications visible across all 17 non-existence checks: - mark_affected was redefined locally in every check (18 copies of the same 3-line jq-dedup function, differing only by the array variable name). - Each check accumulated facts via the same per-iteration jq round-trip pattern: FACTS=$(echo "$FACTS" | jq --argjson f "$x" '. + [$f]'). This is O(N²) (the JSON array is reparsed and reserialized on every push) and forks one jq process per iteration. ~60 such call sites; for a failing scope with 10 problematic pods, that's hundreds of jq forks per check. This commit moves both into diagnose_utils: - mark_affected — adds to a space-separated set stored in a bash variable, dedup on add (no jq). - set_to_json_array — converts the set to a JSON array in a single jq call. - add_fact — bash array append, no jq. - facts_to_json_array — converts the array to a JSON array in a single jq -s call at the end of accumulation. - lines_to_json_array — extracted shared filter for the tail|jq -R -s 'split("\n")...' pipeline that update_check_result and read_log_tail both used. All 17 affected checks were migrated. The 18 local mark_affected copies are gone; check-level accumulator code shrunk from "jq-merge per iteration" to "bash append per iteration, jq once at end". Bash 3.2 compatibility: helpers use eval-based pass-by-name rather than declare -n / declare -A (which require bash 4.3+ / 4.0+). Production runtime on Alpine has bash 5.x, but local dev tests on macOS run /bin/bash 3.2. Suite: 288 tests, 0 failures, 10 environmental skips. 
No behavior change — both the human stdout and the evidence JSON shape are byte-identical to the pre-simplify baseline. Co-Authored-By: Claude Opus 4.7 (1M context) * Add Application Logs diagnose category for AI post-mortem New diagnose category that publishes the application's own log output as structured evidence, contextualized with the pod and container state at fail-time. Unlike the scope/ checks that detect specific failure modes and embed logs as secondary evidence, this check is log-first: a single, self-contained category the AI summarizer can read to say "here is the issue, look at this" without cross-referencing other checks. - k8s/diagnose/logs/workflow.yml: declares the category "Application Logs" with a single step that runs application_log_evidence. - k8s/diagnose/logs/application_log_evidence: iterates problematic pods from the build_context snapshot (no live kubectl), reads current and previous logs per container (init + regular) via read_log_tail, and emits a fact per container with the schema: { pod, pod_phase, pod_reason, container, init_container, container_state, restart_count, current_state_reason, last_termination_reason, last_exit_code, last_exit_code_meaning, current_logs, previous_logs } Status is always success/skipped (info severity). The check never fails: absence of logs is itself meaningful information ("image never started"). Reuses exit_code_meaning from diagnose_utils for the meaning string. - k8s/scope/workflows/diagnose.yaml and k8s/deployment/workflows/diagnose.yaml: register the new folder in the executor so the category appears automatically alongside Scope/Service/ Networking. notify_results groups by category, no backend changes required. - k8s/diagnose/tests/logs/application_log_evidence.bats: 10 tests covering skipped path, empty problematic list, current logs only, previous logs, init container flag, no-logs-available, multi-pod aggregation, empty log files, CrashLoopBackOff context, and pod_reason from Ready condition. 
- k8s/diagnose/tests/evidence_schema.bats: +1 cross-cutting test asserting the check emits a schema-conformant evidence object on the skipped path. Full suite: 357/357 green. Co-Authored-By: Claude Opus 4.7 (1M context) * Focus application_log_evidence on the 'application' container only Narrow the Application Logs check to its essential job: publish the user-owned container's logs for AI post-mortem. The previous shape duplicated metadata already emitted by the scope/ checks (pod_phase, container_state, restart counts, exit codes, etc.) and iterated every container in every problematic pod — including sidecars like 'http' nginx whose logs already appear in Health Probe Endpoints and Container Port Health. - Filters by container name 'application' (the literal name set in k8s/deployment/templates/deployment.yaml.tpl). Sidecars and init containers are out of scope; this check is not a per-container audit. - Per-pod payload shrinks from 12 fields to 2: { pod, logs }. - current and previous logs are merged in chronological order (previous first, current second) and truncated to the last EVIDENCE_LOG_TAIL_LINES (default 50). One flat array — the AI does not need to know which container instance produced which line; the user wanted the tail of the application output, period. - Tests updated: 9 cases covering skipped/empty paths, application-only filtering (asserts sidecar logs do not leak), previous+current merge in order, the 50-line cap, multi-pod aggregation, and a schema-pinning test that asserts the pod entry exposes exactly {pod, logs}. jq gotcha worth noting: `.[-n:]` with `n` as a variable does not compile ("n/0 is not defined") because jq parses `-n` as expression-minus-function. The correct slice is `.[-$n:]` with the `$` prefix. Full diagnose suite: 356/356 green. 
Co-Authored-By: Claude Opus 4.7 (1M context) * Move application logs out of evidence, keep them only in check stdout Previously the application log text was duplicated: it lived inside evidence.details.pods[].logs (canonical for AI) and was also echoed to stdout (so the UI's check.logs[] tail could show it). For a single-pod scope that meant the same ~45 lines appearing twice in the result payload. Consolidate to a single source: the check.logs[] tail. evidence.details now carries only counters (pods_with_logs, problematic_pod_count) and the list of pods that produced logs is published via evidence.affected. No log text in evidence at all. The trade-off is the existing 20-line cap inside update_check_result — the UI sees the last 20 non-empty stdout lines of the check, which means roughly the last 17-18 log lines plus the check's own diagnostic prints. Sufficient for the typical single-pod scope; if that proves too tight, we can revisit the cap in diagnose_utils. Tests reshaped: 9 cases covering skipped/empty paths, sidecar exclusion in stdout, evidence.details exposing exactly {pods_with_logs, problematic_pod_count} (anchor against log text leaking back in), chronological merge of previous before current, the 50-line cap on the echoed tail, and multi-pod aggregation via affected[]. Full diagnose suite: 356/356 green. Co-Authored-By: Claude Opus 4.7 (1M context) * Allow checks to override the 20-line cap on captured stdout The Application Log Evidence check echoes the application log tail to stdout so the diagnose UI can show it in check.logs[]. But the existing 20-line cap inside update_check_result chops most of the payload off — for a typical single-pod scope with 50 log lines plus a few diagnostic prints, only ~17 log lines survive in the UI. Add an opt-in --log-tail-lines flag on update_check_result. Default stays at 20 (no impact on the other 19 checks). 
The logs check passes --log-tail-lines 200, which fits a few pods worth of output plus the check's own orchestrator/info lines. - diagnose_utils: parse --log-tail-lines, use it in the tail call; preserve the positional and --status/--evidence APIs unchanged. - logs/application_log_evidence: pass --log-tail-lines 200 to every update_check_result invocation on a path that emits log text. The skipped path keeps the default 20. - diagnose_utils.bats: rename existing test to "by default" and add two new cases: an 80-line override over 100 input lines, and a 5-line cap preserving the most recent lines. Full diagnose suite: 358/358 green. Co-Authored-By: Claude Opus 4.7 (1M context) * chore(changelog): note structured evidence and Application Logs in k8s/diagnose Co-Authored-By: Claude Opus 4.7 (1M context) * chore(changelog): drop "AI post-mortem" framing from diagnose entry Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + k8s/deployment/workflows/diagnose.yaml | 3 +- k8s/diagnose/build_context | 87 +++- k8s/diagnose/logs/application_log_evidence | 114 +++++ k8s/diagnose/logs/workflow.yml | 6 + k8s/diagnose/networking/alb_capacity_check | 194 +++++--- .../networking/ingress_backend_service | 144 ++++-- .../networking/ingress_class_validation | 58 ++- .../networking/ingress_controller_sync | 95 +++- k8s/diagnose/networking/ingress_existence | 25 +- k8s/diagnose/networking/ingress_host_rules | 161 ++++--- .../networking/ingress_tls_configuration | 130 +++-- k8s/diagnose/scope/container_crash_detection | 206 ++++++-- k8s/diagnose/scope/container_port_health | 111 ++++- k8s/diagnose/scope/health_probe_endpoints | 434 +++++++---------- k8s/diagnose/scope/image_pull_status | 47 +- k8s/diagnose/scope/memory_limits_check | 53 +- k8s/diagnose/scope/pod_existence | 22 +- k8s/diagnose/scope/pod_readiness | 116 +++-- k8s/diagnose/scope/resource_availability | 67 ++- k8s/diagnose/scope/storage_mounting | 51 +- 
k8s/diagnose/service/service_endpoints | 115 +++-- k8s/diagnose/service/service_existence | 22 +- .../service/service_port_configuration | 90 ++-- k8s/diagnose/service/service_selector_match | 93 ++-- k8s/diagnose/service/service_type_validation | 66 ++- k8s/diagnose/tests/build_context.bats | 274 ++++++++++- k8s/diagnose/tests/diagnose_utils.bats | 124 ++++- k8s/diagnose/tests/evidence_schema.bats | 453 ++++++++++++++++++ .../tests/logs/application_log_evidence.bats | 242 ++++++++++ .../scope/container_crash_detection.bats | 100 ++++ .../tests/scope/container_port_health.bats | 6 +- .../tests/scope/health_probe_endpoints.bats | 4 +- k8s/diagnose/tests/scope/pod_readiness.bats | 4 +- k8s/diagnose/utils/diagnose_utils | 194 +++++++- k8s/scope/workflows/diagnose.yaml | 3 +- 36 files changed, 3106 insertions(+), 809 deletions(-) create mode 100644 k8s/diagnose/logs/application_log_evidence create mode 100644 k8s/diagnose/logs/workflow.yml create mode 100644 k8s/diagnose/tests/evidence_schema.bats create mode 100644 k8s/diagnose/tests/logs/application_log_evidence.bats diff --git a/CHANGELOG.md b/CHANGELOG.md index 14165264..fb8f2bc1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Public and private scopes now register DNS records in their correct Route53 hosted zone when using `DNS_TYPE=external_dns`, preventing cross-zone record leakage - Add configurable main HTTP port for k8s scopes (default 8080) and HTTP support for additional ports - Improve **wait deployment active** failure logging: consolidate repeated `Unhealthy` probe events per pod into a single human-readable line, emit a progress heartbeat every 10% of timeout, and surface a targeted suggested fix based on the probe failure mode (port not open / HTTP non-2xx / probe timeout) +- Improve **k8s/diagnose** evidence: every check now emits structured evidence following a documented schema (`summary`, `severity`, 
`affected`, `details`, `suggested_actions`), failure findings embed the relevant pod log slice (current or previous depending on the failure mode), and a new **Application Logs** category surfaces the user-owned `application` container's log tail directly in the UI ## [1.11.0] - 2026-04-16 - Add unit testing support diff --git a/k8s/deployment/workflows/diagnose.yaml b/k8s/deployment/workflows/diagnose.yaml index 66223726..45d837c3 100644 --- a/k8s/deployment/workflows/diagnose.yaml +++ b/k8s/deployment/workflows/diagnose.yaml @@ -34,4 +34,5 @@ steps: folders: - "$SERVICE_PATH/diagnose/service" - "$SERVICE_PATH/diagnose/scope" - - "$SERVICE_PATH/diagnose/networking" \ No newline at end of file + - "$SERVICE_PATH/diagnose/networking" + - "$SERVICE_PATH/diagnose/logs" \ No newline at end of file diff --git a/k8s/diagnose/build_context b/k8s/diagnose/build_context index 8ec7e8dc..1459cca4 100755 --- a/k8s/diagnose/build_context +++ b/k8s/diagnose/build_context @@ -29,6 +29,16 @@ PODS_FILE="$DATA_DIR/pods.json" kubectl get pods -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o json 2>/dev/null > "$PODS_FILE" || echo '{"items":[]}' > "$PODS_FILE" export PODS_FILE +# Deployments +DEPLOYMENTS_FILE="$DATA_DIR/deployments.json" +kubectl get deployment -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o json 2>/dev/null > "$DEPLOYMENTS_FILE" || echo '{"items":[]}' > "$DEPLOYMENTS_FILE" +export DEPLOYMENTS_FILE + +# ReplicaSets +REPLICASETS_FILE="$DATA_DIR/replicasets.json" +kubectl get rs -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o json 2>/dev/null > "$REPLICASETS_FILE" || echo '{"items":[]}' > "$REPLICASETS_FILE" +export REPLICASETS_FILE + # Services SERVICES_FILE="$DATA_DIR/services.json" kubectl get services -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o json 2>/dev/null > "$SERVICES_FILE" || echo '{"items":[]}' > "$SERVICES_FILE" @@ -82,4 +92,79 @@ if [[ -n "$ALB_POD_NAMES" ]]; then for POD_NAME in $ALB_POD_NAMES; do kubectl logs "$POD_NAME" -n "$ALB_CONTROLLER_NAMESPACE" --tail=200 2>/dev/null > 
"$ALB_CONTROLLER_LOGS_DIR/${POD_NAME}.log" || echo "" > "$ALB_CONTROLLER_LOGS_DIR/${POD_NAME}.log" done -fi \ No newline at end of file +fi + +# Identify problematic pods and capture their logs + describe +# A pod is "problematic" if any of: +# - phase is not Running and not Succeeded +# - it is being deleted (deletionTimestamp present) +# - Ready condition is not True +# - any container (init or regular) has crashed, restarted, terminated, or is in a known error waiting state +PROBLEMATIC_PODS_FILE="$DATA_DIR/problematic_pods.txt" +POD_LOGS_DIR="$DATA_DIR/pod_logs" +POD_DESCRIBE_DIR="$DATA_DIR/pod_describe" +mkdir -p "$POD_LOGS_DIR" "$POD_DESCRIBE_DIR" + +POD_LOG_TAIL_LINES="${POD_LOG_TAIL_LINES:-500}" + +PROBLEMATIC_POD_NAMES=$(jq -r ' + def is_error_waiting(reason): + reason | IN("CrashLoopBackOff","ImagePullBackOff","ErrImagePull","CreateContainerError","RunContainerError","CreateContainerConfigError"); + + def container_unhealthy(c): + (c.restartCount // 0) > 0 + or (c.state.terminated // null) != null + or (c.lastState.terminated // null) != null + or (c.state.waiting // null) != null and is_error_waiting(c.state.waiting.reason // ""); + + .items[] + | select( + (.status.phase != "Running" and .status.phase != "Succeeded") + or (.metadata.deletionTimestamp // null) != null + or ((.status.conditions // []) | any(.type == "Ready" and .status != "True")) + or ((.status.containerStatuses // []) | any(container_unhealthy(.))) + or ((.status.initContainerStatuses // []) | any(container_unhealthy(.))) + ) + | .metadata.name +' "$PODS_FILE" 2>/dev/null) + +echo "$PROBLEMATIC_POD_NAMES" > "$PROBLEMATIC_PODS_FILE" +export PROBLEMATIC_PODS_FILE +export POD_LOGS_DIR +export POD_DESCRIBE_DIR +export POD_LOG_TAIL_LINES + +if [[ -n "$PROBLEMATIC_POD_NAMES" ]]; then + for POD_NAME in $PROBLEMATIC_POD_NAMES; do + # Describe (full output, includes spec + status + events correlated) + kubectl describe pod "$POD_NAME" -n "$NAMESPACE" > "$POD_DESCRIBE_DIR/${POD_NAME}.txt" 
2>/dev/null || echo "" > "$POD_DESCRIBE_DIR/${POD_NAME}.txt" + + # Capture logs for every container (init + regular), current and previous + ALL_CONTAINERS=$(jq -r --arg name "$POD_NAME" ' + .items[] + | select(.metadata.name == $name) + | ((.spec.initContainers // []) + (.spec.containers // [])) + | .[].name + ' "$PODS_FILE" 2>/dev/null) + + for CONTAINER_NAME in $ALL_CONTAINERS; do + CURRENT_LOG="$POD_LOGS_DIR/${POD_NAME}.${CONTAINER_NAME}.log" + PREVIOUS_LOG="$POD_LOGS_DIR/${POD_NAME}.${CONTAINER_NAME}.previous.log" + + # Current container logs (always kept, even if empty: "container produced no output yet" is meaningful) + kubectl logs "$POD_NAME" -n "$NAMESPACE" -c "$CONTAINER_NAME" --tail="$POD_LOG_TAIL_LINES" \ + > "$CURRENT_LOG" 2>/dev/null || echo "" > "$CURRENT_LOG" + + # Previous container logs (kubectl exits 1 when there is no previous instance — expected, ignore) + kubectl logs "$POD_NAME" -n "$NAMESPACE" -c "$CONTAINER_NAME" --tail="$POD_LOG_TAIL_LINES" --previous \ + > "$PREVIOUS_LOG" 2>/dev/null || true + if [[ ! -s "$PREVIOUS_LOG" ]]; then + rm -f "$PREVIOUS_LOG" + fi + done + done +fi + +# Always end with success: build_context is sourced, and a trailing non-zero status +# from any conditional above would propagate to the caller. +: \ No newline at end of file diff --git a/k8s/diagnose/logs/application_log_evidence b/k8s/diagnose/logs/application_log_evidence new file mode 100644 index 00000000..11744f83 --- /dev/null +++ b/k8s/diagnose/logs/application_log_evidence @@ -0,0 +1,114 @@ +#!/bin/bash +# Check: Application Log Evidence +# +# Purpose: surface the logs of the user-owned "application" container from +# each problematic pod so they are visible in the diagnose UI's check.logs[] +# view (the last 20 stdout lines of the check). +# +# Scope is intentionally narrow: +# - Only the container literally named "application" (the user code; see +# k8s/deployment/templates/deployment.yaml.tpl). Sidecars (e.g. 
"http" +# nginx) and init containers are out of scope here — the scope/ checks +# already report their findings without us duplicating logs across +# containers and probe rounds. +# - Logs are echoed to stdout only; the evidence payload carries only +# counters and the list of pods (in evidence.affected). Re-emitting the +# log text inside evidence.details would duplicate what already lives in +# the check's logs[] tail. +# +# Reads from the build_context snapshot (data/pod_logs/), never calls kubectl. +# Severity is always "info"; this check publishes context, it does not detect +# failure modes. + +APPLICATION_CONTAINER_NAME="application" + +if [[ ! -f "$PROBLEMATIC_PODS_FILE" ]]; then + print_warning "No problematic pods snapshot available, log evidence skipped" + SKIP_EVIDENCE=$(evidence_json \ + "Snapshot unavailable, log collection skipped" \ + "info" \ + "[]" \ + "$(jq -nc '{pods_with_logs: 0, problematic_pod_count: 0}')" \ + "[]") + update_check_result --status "skipped" --evidence "$SKIP_EVIDENCE" + return 0 +fi + +PROBLEMATIC_PODS=$(grep -v '^[[:space:]]*$' "$PROBLEMATIC_PODS_FILE" 2>/dev/null) + +if [[ -z "$PROBLEMATIC_PODS" ]]; then + print_success "No problematic pods detected — no application logs to collect" + SUMMARY="No problematic pods detected, no application logs to collect" + DETAILS=$(jq -nc '{pods_with_logs: 0, problematic_pod_count: 0}') + EVIDENCE=$(evidence_json "$SUMMARY" "info" "[]" "$DETAILS" "[]") + update_check_result --status "success" --evidence "$EVIDENCE" --log-tail-lines 200 + return 0 +fi + +AFFECTED_PODS="" +PODS_WITH_LOGS=0 +TOTAL_PROBLEMATIC=$(echo "$PROBLEMATIC_PODS" | wc -w | tr -d ' ') + +for POD_NAME in $PROBLEMATIC_PODS; do + HAS_APP_CONTAINER=$(jq -r --arg name "$POD_NAME" --arg cn "$APPLICATION_CONTAINER_NAME" ' + .items[] + | select(.metadata.name == $name) + | (.spec.containers // []) + | map(.name) + | index($cn) != null + ' "$PODS_FILE" 2>/dev/null) + + if [[ "$HAS_APP_CONTAINER" != "true" ]]; then + print_warning 
"Pod $POD_NAME has no '$APPLICATION_CONTAINER_NAME' container — skipped" + continue + fi + + CURRENT_LOGS=$(read_log_tail "$POD_NAME" "$APPLICATION_CONTAINER_NAME" "current") + PREVIOUS_LOGS=$(read_log_tail "$POD_NAME" "$APPLICATION_CONTAINER_NAME" "previous") + + # Merge previous + current in chronological order, then keep only the last + # N lines (defaults to 50). One flat array; the AI does not need to know + # which container instance produced which line. + TAIL_LINES="${EVIDENCE_LOG_TAIL_LINES:-50}" + MERGED_LOGS=$(jq -nc \ + --argjson prev "$PREVIOUS_LOGS" \ + --argjson curr "$CURRENT_LOGS" \ + --argjson n "$TAIL_LINES" \ + '($prev + $curr) | .[-$n:]') + + if [[ "$(echo "$MERGED_LOGS" | jq 'length')" -eq 0 ]]; then + print_warning "Pod $POD_NAME application container produced no logs" + continue + fi + + mark_affected AFFECTED_PODS "$POD_NAME" + PODS_WITH_LOGS=$((PODS_WITH_LOGS + 1)) + + # Echo the log tail to stdout so it surfaces in the UI's check.logs view. + # update_check_result is called below with --log-tail-lines 200 so the cap + # accommodates the application log payload (default cap is 20). This is + # the only place the log text lives — evidence.details stores only + # counters, so there is no duplication between evidence and logs[]. 
+ print_info "─── application log tail from $POD_NAME ───" + echo "$MERGED_LOGS" | jq -r '.[] | " | \(.)"' +done + +AFFECTED_PODS_JSON=$(set_to_json_array AFFECTED_PODS) + +if [[ $PODS_WITH_LOGS -eq 0 ]]; then + SUMMARY="No application logs available across $TOTAL_PROBLEMATIC problematic pod(s) — image may never have started" + DETAILS=$(jq -nc \ + --argjson problematic "$TOTAL_PROBLEMATIC" \ + '{pods_with_logs: 0, problematic_pod_count: $problematic}') + EVIDENCE=$(evidence_json "$SUMMARY" "info" "[]" "$DETAILS" "[]") + update_check_result --status "success" --evidence "$EVIDENCE" --log-tail-lines 200 +else + SUMMARY="Collected application logs from $PODS_WITH_LOGS of $TOTAL_PROBLEMATIC problematic pod(s)" + DETAILS=$(jq -nc \ + --argjson with_logs "$PODS_WITH_LOGS" \ + --argjson problematic "$TOTAL_PROBLEMATIC" \ + '{pods_with_logs: $with_logs, problematic_pod_count: $problematic}') + EVIDENCE=$(evidence_json "$SUMMARY" "info" "$AFFECTED_PODS_JSON" "$DETAILS" "[]") + update_check_result --status "success" --evidence "$EVIDENCE" --log-tail-lines 200 + print_success "$SUMMARY" +fi diff --git a/k8s/diagnose/logs/workflow.yml b/k8s/diagnose/logs/workflow.yml new file mode 100644 index 00000000..3ea26aee --- /dev/null +++ b/k8s/diagnose/logs/workflow.yml @@ -0,0 +1,6 @@ +steps: + - name: Application Log Evidence + description: Collects pod logs from the diagnose snapshot for AI post-mortem analysis + category: Application Logs + type: script + file: "$SERVICE_PATH/diagnose/logs/application_log_evidence" diff --git a/k8s/diagnose/networking/alb_capacity_check b/k8s/diagnose/networking/alb_capacity_check index 445971f4..2431b28f 100644 --- a/k8s/diagnose/networking/alb_capacity_check +++ b/k8s/diagnose/networking/alb_capacity_check @@ -2,35 +2,39 @@ # Check: ALB Capacity Check # Checks for common ALB issues (IP exhaustion, certificate problems) -# Validate ingresses exist require_ingresses || return 0 -# Read ingresses from pre-collected data INGRESSES=$(jq -r 
'.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +INGRESS_FACTS=() +AFFECTED_INGRESSES="" +HAS_IP_EXHAUSTION=0 +IP_EXHAUSTION_LOGS="[]" + -# Get ALB controller pods from pre-collected data ALB_CONTROLLER_PODS=$(jq -r '.items[].metadata.name' "$ALB_CONTROLLER_PODS_FILE" 2>/dev/null | tr '\n' ' ') if [[ -n "$ALB_CONTROLLER_PODS" ]]; then for POD in $ALB_CONTROLLER_PODS; do - # Look for IP exhaustion errors in pre-collected controller logs LOG_FILE="$ALB_CONTROLLER_LOGS_DIR/${POD}.log" if [[ -f "$LOG_FILE" ]] && [[ -r "$LOG_FILE" ]]; then - # Use tail and awk to handle massive log lines efficiently IP_ERRORS=$(tail -n 500 "$LOG_FILE" 2>/dev/null | \ awk 'length <= 10000' 2>/dev/null | \ grep -iE "no available ip|insufficient ip|ip address.*(exhausted|unavailable)" 2>/dev/null || true) if [[ -n "$IP_ERRORS" ]]; then - HAS_ISSUES=1 + HAS_IP_EXHAUSTION=1 print_error " ALB subnet IP exhaustion detected, Recent logs:" if ! echo "$IP_ERRORS" | tail -n 3 2>/dev/null | cut -c1-200 2>/dev/null | sed 's/^/ /' 2>/dev/null; then print_warning " [Log details could not be displayed]" fi print_action "Check subnet CIDR ranges and consider expanding or using different subnets" print_info " Annotation: alb.ingress.kubernetes.io/subnets=" + + TRUNC=$(echo "$IP_ERRORS" | tail -n 3 | cut -c1-200 | jq -R . | jq -s .) + IP_EXHAUSTION_LOGS=$(echo "$IP_EXHAUSTION_LOGS" | jq --arg pod "$POD" --argjson lines "$TRUNC" \ + '. + [{controller_pod: $pod, lines: $lines}]') break fi elif [[ -e "$LOG_FILE" ]] && [[ ! 
-r "$LOG_FILE" ]]; then @@ -38,22 +42,32 @@ if [[ -n "$ALB_CONTROLLER_PODS" ]]; then fi done - if [[ -z "$IP_ERRORS" ]]; then + if [[ $HAS_IP_EXHAUSTION -eq 0 ]]; then print_success " No IP exhaustion issues detected" fi fi -# Consolidated loop: check all ingress-related issues in one pass +[[ $HAS_IP_EXHAUSTION -eq 1 ]] && { + ISSUE=$(jq -nc --argjson logs "$IP_EXHAUSTION_LOGS" \ + '{issue: "subnet_ip_exhaustion", evidence_logs: $logs}') + add_fact ISSUE_FACTS "$ISSUE" + # mark all ingresses as affected (cluster-wide issue) + for INGRESS_NAME in $INGRESSES; do mark_affected AFFECTED_INGRESSES "$INGRESS_NAME"; done +} + for INGRESS_NAME in $INGRESSES; do - # Get ingress info from pre-collected data (single read per ingress) INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null) - print_info "Checking ingress: $INGRESS_NAME" - # ===== TLS/Certificate Configuration Checks ===== + INGRESS_HAS_ISSUE=0 CERT_ARN=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/certificate-arn"] // empty') TLS_HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.tls[]?.hosts[]?' 
2>/dev/null) INGRESS_HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.rules[]?.host' 2>/dev/null) + SCHEME=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/scheme"] // empty') + SUBNETS=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/subnets"] // empty') + + CERT_ERROR_LINES="[]" + HOST_TLS_MISMATCHES="[]" if [[ -n "$TLS_HOSTS" || -n "$CERT_ARN" ]]; then print_info " SSL/TLS configured" @@ -61,42 +75,51 @@ for INGRESS_NAME in $INGRESSES; do if [[ -n "$CERT_ARN" ]]; then print_info " Certificate ARN: $CERT_ARN" - # Check controller logs for certificate errors if [[ -n "$ALB_CONTROLLER_PODS" ]]; then for POD in $ALB_CONTROLLER_PODS; do LOG_FILE="$ALB_CONTROLLER_LOGS_DIR/${POD}.log" if [[ -f "$LOG_FILE" ]] && [[ -r "$LOG_FILE" ]]; then - # Use tail and awk to handle massive log lines efficiently CERT_ERRORS=$(tail -n 500 "$LOG_FILE" 2>/dev/null | \ awk 'length <= 10000' 2>/dev/null | \ grep -iF "$INGRESS_NAME" 2>/dev/null | \ grep -iE "certificate.*(not found|invalid|failed|error)" 2>/dev/null || true) if [[ -n "$CERT_ERRORS" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Certificate validation errors found:" if ! echo "$CERT_ERRORS" | tail -n 2 2>/dev/null | cut -c1-200 2>/dev/null | sed 's/^/ /' 2>/dev/null; then print_warning " [Certificate error details could not be displayed]" fi print_action "Verify certificate ARN exists in ACM and covers the required domains" + + TRUNC=$(echo "$CERT_ERRORS" | tail -n 2 | cut -c1-200 | jq -R . | jq -s .) + CERT_ERROR_LINES=$(echo "$CERT_ERROR_LINES" | jq --arg pod "$POD" --argjson lines "$TRUNC" \ + '. 
+ [{controller_pod: $pod, lines: $lines}]') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --argjson lines "$TRUNC" \ + '{ingress: $ing, issue: "certificate_validation_errors", log_lines: $lines}') + add_fact ISSUE_FACTS "$ISSUE" fi fi done fi fi - # Verify hosts match between rules and TLS if [[ -n "$TLS_HOSTS" && -n "$INGRESS_HOSTS" ]]; then for HOST in $INGRESS_HOSTS; do if ! echo "$TLS_HOSTS" | grep -qw "$HOST"; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Host '$HOST' in rules but not in TLS configuration" print_action "Add host to spec.tls or ensure certificate covers this domain" + HOST_TLS_MISMATCHES=$(echo "$HOST_TLS_MISMATCHES" | jq --arg h "$HOST" '. + [$h]') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg host "$HOST" \ + '{ingress: $ing, host: $host, issue: "host_in_rules_not_in_tls"}') + add_fact ISSUE_FACTS "$ISSUE" fi done fi - # Check for missing certificate when TLS is configured if [[ -n "$TLS_HOSTS" && -z "$CERT_ARN" ]]; then print_warning " TLS hosts configured but no ACM certificate ARN annotation" print_info " Add annotation: alb.ingress.kubernetes.io/certificate-arn=" @@ -105,8 +128,6 @@ for INGRESS_NAME in $INGRESSES; do print_info " No SSL/TLS configured (HTTP only)" fi - # ===== Events Checks (subnet, security group, target group) ===== - # Get events sorted by timestamp, most recent first EVENTS_JSON=$(jq --arg name "$INGRESS_NAME" --arg kind "Ingress" ' .items | map(select(.involvedObject.name == $name and .involvedObject.kind == $kind)) @@ -114,65 +135,114 @@ for INGRESS_NAME in $INGRESSES; do | reverse ' "$EVENTS_FILE" 2>/dev/null) + SUBNET_ERROR_LINES="[]" + SG_ERROR_LINES="[]" + TG_ERROR_LINES="[]" + EVENT_COUNT=$(echo "$EVENTS_JSON" | jq 'length' 2>/dev/null) if [[ "$EVENT_COUNT" -gt 0 ]]; then - # Get all error/warning events - ERROR_EVENTS=$(echo "$EVENTS_JSON" | jq -r ' - .[] - | select(.type == "Warning" or .type == "Error") - ' 2>/dev/null) - - if [[ -n "$ERROR_EVENTS" 
]]; then - # Check for subnet errors - SUBNET_ERRORS=$(echo "$ERROR_EVENTS" | jq -r 'select(.message | test("subnet|availability zone"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) - if [[ -n "$SUBNET_ERRORS" ]]; then - HAS_ISSUES=1 - print_error " Subnet configuration issues" - if ! echo "$SUBNET_ERRORS" | head -n 2 2>/dev/null | sed 's/^/ /' 2>/dev/null; then - print_warning " [Event details could not be displayed]" - fi - fi + ERROR_EVENTS=$(echo "$EVENTS_JSON" | jq -c '[.[] | select(.type == "Warning" or .type == "Error")]' 2>/dev/null) + + SUBNET_ERRORS_HUMAN=$(echo "$ERROR_EVENTS" | jq -r '.[] | select(.message | test("subnet|availability zone"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) + if [[ -n "$SUBNET_ERRORS_HUMAN" ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + print_error " Subnet configuration issues" + echo "$SUBNET_ERRORS_HUMAN" | head -n 2 | sed 's/^/ /' + SUBNET_ERROR_LINES=$(echo "$ERROR_EVENTS" | jq -c '[.[] | select(.message | test("subnet|availability zone"; "i")) | {timestamp: .lastTimestamp, type: .type, reason: .reason, message: .message}] | .[:2]') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --argjson events "$SUBNET_ERROR_LINES" \ + '{ingress: $ing, issue: "subnet_misconfiguration", events: $events}') + add_fact ISSUE_FACTS "$ISSUE" + fi - # Check for security group errors - SG_ERRORS=$(echo "$ERROR_EVENTS" | jq -r 'select(.message | test("security.?group"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) - if [[ -n "$SG_ERRORS" ]]; then - HAS_ISSUES=1 - print_error " Security group issues" - if ! 
echo "$SG_ERRORS" | head -n 2 2>/dev/null | sed 's/^/ /' 2>/dev/null; then - print_warning " [Event details could not be displayed]" - fi - fi + SG_ERRORS_HUMAN=$(echo "$ERROR_EVENTS" | jq -r '.[] | select(.message | test("security.?group"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) + if [[ -n "$SG_ERRORS_HUMAN" ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + print_error " Security group issues" + echo "$SG_ERRORS_HUMAN" | head -n 2 | sed 's/^/ /' + SG_ERROR_LINES=$(echo "$ERROR_EVENTS" | jq -c '[.[] | select(.message | test("security.?group"; "i")) | {timestamp: .lastTimestamp, type: .type, reason: .reason, message: .message}] | .[:2]') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --argjson events "$SG_ERROR_LINES" \ + '{ingress: $ing, issue: "security_group", events: $events}') + add_fact ISSUE_FACTS "$ISSUE" + fi - # Check for target group errors - TG_ERRORS=$(echo "$ERROR_EVENTS" | jq -r 'select(.message | test("target.?group"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) - if [[ -n "$TG_ERRORS" ]]; then - HAS_ISSUES=1 - print_error " Target group registration issues" - if ! 
echo "$TG_ERRORS" | head -n 2 2>/dev/null | sed 's/^/ /' 2>/dev/null; then - print_warning " [Event details could not be displayed]" - fi - fi + TG_ERRORS_HUMAN=$(echo "$ERROR_EVENTS" | jq -r '.[] | select(.message | test("target.?group"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) + if [[ -n "$TG_ERRORS_HUMAN" ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + print_error " Target group registration issues" + echo "$TG_ERRORS_HUMAN" | head -n 2 | sed 's/^/ /' + TG_ERROR_LINES=$(echo "$ERROR_EVENTS" | jq -c '[.[] | select(.message | test("target.?group"; "i")) | {timestamp: .lastTimestamp, type: .type, reason: .reason, message: .message}] | .[:2]') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --argjson events "$TG_ERROR_LINES" \ + '{ingress: $ing, issue: "target_group", events: $events}') + add_fact ISSUE_FACTS "$ISSUE" fi fi - # ===== Annotation Checks (scheme, subnets) ===== - SCHEME=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/scheme"] // empty') if [[ -z "$SCHEME" ]]; then print_warning " No scheme annotation (defaulting to internal)" print_info " Add annotation: alb.ingress.kubernetes.io/scheme=internet-facing (or internal)" fi - - SUBNETS=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/subnets"] // empty') if [[ -z "$SUBNETS" ]]; then print_info " Using auto-discovered subnets" print_info " Consider explicit subnets: alb.ingress.kubernetes.io/subnets=" fi + + INGRESS_FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg cert "$CERT_ARN" --arg scheme "$SCHEME" --arg subnets "$SUBNETS" \ + --argjson tls_configured "$([[ -n "$TLS_HOSTS" || -n "$CERT_ARN" ]] && echo true || echo false)" \ + --argjson host_mismatches "$HOST_TLS_MISMATCHES" \ + --argjson cert_errors "$CERT_ERROR_LINES" \ + --argjson subnet_errors "$SUBNET_ERROR_LINES" \ + --argjson sg_errors "$SG_ERROR_LINES" \ + --argjson tg_errors "$TG_ERROR_LINES" \ + '{ + ingress: 
$ing, + tls_configured: $tls_configured, + certificate_arn: (if $cert == "" then null else $cert end), + scheme: (if $scheme == "" then null else $scheme end), + subnets_annotation: (if $subnets == "" then null else $subnets end), + host_tls_mismatches: $host_mismatches, + certificate_errors: $cert_errors, + subnet_errors: $subnet_errors, + security_group_errors: $sg_errors, + target_group_errors: $tg_errors + }') + add_fact INGRESS_FACTS "$INGRESS_FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "No critical ALB capacity or configuration issues detected" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "No critical ALB capacity or configuration issues detected" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array INGRESS_FACTS)" --argjson count "$INGRESS_COUNT" '{ingress_count: $count, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) affected by ALB issues" + [[ $HAS_IP_EXHAUSTION -eq 1 ]] && SUMMARY="$SUMMARY — subnet IP exhaustion detected" + + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array INGRESS_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$INGRESS_COUNT" \ + --argjson ip_logs "$IP_EXHAUSTION_LOGS" \ + --argjson ip_exhaustion "$HAS_IP_EXHAUSTION" \ + '{ + ingress_count: $count, + issue_count: ($issues | length), + ip_exhaustion_detected: ($ip_exhaustion == 1), + ip_exhaustion_logs: $ip_logs, + ingresses: $facts, + issues: $issues + }') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Check subnet capacity and certificate/security group 
configuration"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/networking/ingress_backend_service b/k8s/diagnose/networking/ingress_backend_service index e20a570a..10ad9957 100644 --- a/k8s/diagnose/networking/ingress_backend_service +++ b/k8s/diagnose/networking/ingress_backend_service @@ -2,99 +2,108 @@ # Check: Ingress Backend Service # Checks if ingress backend services exist and are reachable -# Validate ingresses exist require_ingresses || return 0 -# Get ingresses INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +INGRESS_FACTS=() +AFFECTED_INGRESSES="" + for INGRESS_NAME in $INGRESSES; do INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null) - print_info "Checking backends for ingress: $INGRESS_NAME" - # Get default backend if exists + INGRESS_BACKEND_FACTS=() + INGRESS_HAS_ISSUE=0 + DEFAULT_BACKEND=$(echo "$INGRESS_INFO" | jq -r '.spec.defaultBackend.service.name // empty') if [[ -n "$DEFAULT_BACKEND" ]]; then DEFAULT_PORT=$(echo "$INGRESS_INFO" | jq -r '.spec.defaultBackend.service.port.number // .spec.defaultBackend.service.port.name // empty') - - # Check if service exists in pre-collected data SERVICE_INFO=$(jq --arg name "$DEFAULT_BACKEND" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null) if [[ -n "$SERVICE_INFO" && "$SERVICE_INFO" != "null" ]]; then - # Check if service has endpoints from pre-collected data ENDPOINT_INFO=$(jq --arg name "$DEFAULT_BACKEND" '.items[] | select(.metadata.name == $name)' "$ENDPOINTS_FILE" 2>/dev/null) ENDPOINTS=$(echo "$ENDPOINT_INFO" | jq -r '.subsets[].addresses[].ip' 2>/dev/null | tr '\n' ' ') if [[ -n "$ENDPOINTS" ]]; then print_success " Default backend: $DEFAULT_BACKEND:$DEFAULT_PORT (has endpoints)" + BFACT=$(jq -nc --arg svc "$DEFAULT_BACKEND" --arg port "$DEFAULT_PORT" \ + '{kind: "default", service: $svc, 
port: $port, status: "ok"}') else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Default backend: $DEFAULT_BACKEND:$DEFAULT_PORT (no endpoints)" + BFACT=$(jq -nc --arg svc "$DEFAULT_BACKEND" --arg port "$DEFAULT_PORT" \ + '{kind: "default", service: $svc, port: $port, status: "no_endpoints"}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg svc "$DEFAULT_BACKEND" \ + '{ingress: $ing, backend: $svc, issue: "default_backend_no_endpoints"}') + add_fact ISSUE_FACTS "$ISSUE" fi else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Default backend: Service '$DEFAULT_BACKEND' not found" + BFACT=$(jq -nc --arg svc "$DEFAULT_BACKEND" \ + '{kind: "default", service: $svc, status: "service_not_found"}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg svc "$DEFAULT_BACKEND" \ + '{ingress: $ing, backend: $svc, issue: "default_backend_not_found"}') + add_fact ISSUE_FACTS "$ISSUE" fi + add_fact INGRESS_BACKEND_FACTS "$BFACT" fi - # Get all rule backends BACKENDS=$(echo "$INGRESS_INFO" | jq -r '.spec.rules[].http.paths[] | "\(.backend.service.name):\(.backend.service.port.number // .backend.service.port.name)"' 2>/dev/null) - if [[ -z "$BACKENDS" ]]; then + if [[ -z "$BACKENDS" && -z "$DEFAULT_BACKEND" ]]; then print_warning " No path rules defined" + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --argjson backends "$INGRESS_BACKEND_FACTS" \ + '{ingress: $ing, backends: $backends}') + add_fact INGRESS_FACTS "$FACT" continue fi - # Check each unique backend - # Use process substitution to avoid subshell and preserve HAS_ISSUES updates while IFS=':' read -r SERVICE_NAME SERVICE_PORT; do - # Check if service exists in pre-collected data + [[ -z "$SERVICE_NAME" ]] && continue + SERVICE_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null) if [[ -n "$SERVICE_INFO" && "$SERVICE_INFO" != "null" ]]; then - # Check if service has 
endpoints from pre-collected data ENDPOINT_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$ENDPOINTS_FILE" 2>/dev/null) - READY_ENDPOINTS=$(echo "$ENDPOINT_INFO" | jq -r '.subsets[]?.addresses[]? | "\(.targetRef.name // "unknown"):\(.ip)"' 2>/dev/null) - NOT_READY_ENDPOINTS=$(echo "$ENDPOINT_INFO" | jq -r '.subsets[]?.notReadyAddresses[]? | "\(.targetRef.name // "unknown"):\(.ip)"' 2>/dev/null) + READY_COUNT=$(echo "$ENDPOINT_INFO" | jq -r '[.subsets[]?.addresses[]?] | length' 2>/dev/null) + NOT_READY_COUNT=$(echo "$ENDPOINT_INFO" | jq -r '[.subsets[]?.notReadyAddresses[]?] | length' 2>/dev/null) + READY_COUNT=${READY_COUNT:-0} + NOT_READY_COUNT=${NOT_READY_COUNT:-0} - # Get port info PORT_NUMBER=$(echo "$ENDPOINT_INFO" | jq -r '.subsets[0]?.ports[0]?.port // empty' 2>/dev/null) - READY_COUNT=$(echo "$READY_ENDPOINTS" | grep -c '^' 2>/dev/null || echo 0) - NOT_READY_COUNT=$(echo "$NOT_READY_ENDPOINTS" | grep -c '^' 2>/dev/null || echo 0) - if [[ $READY_COUNT -gt 0 ]]; then print_success " Backend: $SERVICE_NAME:$SERVICE_PORT ($READY_COUNT ready endpoint(s))" - echo "$READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - [[ -n "$IP" ]] && print_success " - $POD_NAME -> $IP:$PORT_NUMBER" + echo "$ENDPOINT_INFO" | jq -r --arg p "$PORT_NUMBER" '.subsets[]?.addresses[]? | " - \(.targetRef.name // "unknown") -> \(.ip)" + (if $p == "" then "" else ":" + $p end)' | while IFS= read -r line; do + print_success "$line" done - if [[ $NOT_READY_COUNT -gt 0 ]]; then print_warning " Also has $NOT_READY_COUNT not ready endpoint(s)" - echo "$NOT_READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - [[ -n "$IP" ]] && print_warning " - $POD_NAME -> $IP:$PORT_NUMBER" + echo "$ENDPOINT_INFO" | jq -r --arg p "$PORT_NUMBER" '.subsets[]?.notReadyAddresses[]? 
| " - \(.targetRef.name // "unknown") -> \(.ip)" + (if $p == "" then "" else ":" + $p end)' | while IFS= read -r line; do + print_warning "$line" done fi + BFACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg port "$SERVICE_PORT" --argjson ready "$READY_COUNT" --argjson nr "$NOT_READY_COUNT" \ + '{kind: "rule", service: $svc, port: $port, ready_count: $ready, not_ready_count: $nr, status: "ok"}') else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Backend: $SERVICE_NAME:$SERVICE_PORT (no ready endpoints)" - # Get service selector to help debug SERVICE_SELECTOR=$(echo "$SERVICE_INFO" | jq -c '.spec.selector // {}' 2>/dev/null) - print_info " Service selector: $SERVICE_SELECTOR" if [[ $NOT_READY_COUNT -gt 0 ]]; then - print_warning " Found $NOT_READY_COUNT not ready endpoint(s):" - echo "$NOT_READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - [[ -n "$IP" ]] && print_warning " - $POD_NAME -> $IP:$PORT_NUMBER (not ready)" - done + print_warning " Found $NOT_READY_COUNT not ready endpoint(s)" print_action "Check pod readiness - pods exist but are not ready to serve traffic" + SUB_ISSUE="endpoints_not_ready" else - # Check if there are any pods matching the selector if [[ "$SERVICE_SELECTOR" != "{}" && "$SERVICE_SELECTOR" != "null" ]]; then MATCHING_PODS=$(jq -r --argjson selectors "$SERVICE_SELECTOR" ' .items[] | @@ -108,38 +117,77 @@ for INGRESS_NAME in $INGRESSES; do ' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') if [[ -n "$MATCHING_PODS" ]]; then - print_warning " Found pods matching selector but no endpoints: $MATCHING_PODS" - print_action "Pods exist but endpoints not created - check pod readiness probes and status" + print_warning " Pods match selector but no endpoints — check readiness probes" + SUB_ISSUE="pods_match_no_endpoints" else print_warning " No pods found matching service selector" - print_action "Create pods with labels matching the service selector: $SERVICE_SELECTOR" + SUB_ISSUE="no_matching_pods" 
fi else print_warning " Service has no selector defined" - print_action "Add selector to service or check if this is a headless/ExternalName service" + SUB_ISSUE="no_selector" fi fi + + BFACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg port "$SERVICE_PORT" --argjson nr "$NOT_READY_COUNT" \ + --argjson selector "$SERVICE_SELECTOR" --arg issue "$SUB_ISSUE" \ + '{kind: "rule", service: $svc, port: $port, ready_count: 0, not_ready_count: $nr, selector: $selector, status: $issue}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg svc "$SERVICE_NAME" --arg issue "$SUB_ISSUE" \ + '{ingress: $ing, backend: $svc, issue: $issue}') + add_fact ISSUE_FACTS "$ISSUE" fi - # Verify port exists in service from pre-collected data SERVICE_PORTS=$(echo "$SERVICE_INFO" | jq -r '.spec.ports[].port' 2>/dev/null | tr '\n' ' ') - if ! echo "$SERVICE_PORTS" | grep -qw "$SERVICE_PORT"; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Backend: Port $SERVICE_PORT not found in service $SERVICE_NAME" print_warning " Available ports: $SERVICE_PORTS" + BFACT=$(echo "$BFACT" | jq --arg sp "$SERVICE_PORT" --arg ports "$SERVICE_PORTS" \ + '. 
+ {port_status: "port_not_in_service", available_ports: ($ports | split(" ") | map(select(length > 0)))}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg svc "$SERVICE_NAME" --arg port "$SERVICE_PORT" \ + '{ingress: $ing, backend: $svc, port: $port, issue: "port_not_in_service"}') + add_fact ISSUE_FACTS "$ISSUE" fi else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Backend: Service '$SERVICE_NAME' not found in namespace" + BFACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg port "$SERVICE_PORT" \ + '{kind: "rule", service: $svc, port: $port, status: "service_not_found"}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg svc "$SERVICE_NAME" \ + '{ingress: $ing, backend: $svc, issue: "service_not_found"}') + add_fact ISSUE_FACTS "$ISSUE" fi + + add_fact INGRESS_BACKEND_FACTS "$BFACT" done < <(echo "$BACKENDS" | sort -u) + + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --argjson backends "$INGRESS_BACKEND_FACTS" \ + '{ingress: $ing, backends: $backends}') + add_fact INGRESS_FACTS "$FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "All backend services healthy for $INGRESS_COUNT ingress(es)" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All backend services healthy for $INGRESS_COUNT ingress(es)" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array INGRESS_FACTS)" --argjson count "$INGRESS_COUNT" '{ingress_count: $count, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) have backend issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array 
INGRESS_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$INGRESS_COUNT" \ + '{ingress_count: $count, issue_count: ($issues | length), ingresses: $facts, issues: $issues}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Verify backend services exist and have ready endpoints, and that ports match"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/networking/ingress_class_validation b/k8s/diagnose/networking/ingress_class_validation index d796fd91..36ab7274 100644 --- a/k8s/diagnose/networking/ingress_class_validation +++ b/k8s/diagnose/networking/ingress_class_validation @@ -2,63 +2,87 @@ # Check: Ingress Class Validation # Validates ingress class is correctly configured -# Validate ingresses exist require_ingresses || return 0 -# Read ingresses from pre-collected data INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +CLASS_FACTS=() +AFFECTED_INGRESSES="" + -# Get available ingress classes from pre-collected data AVAILABLE_CLASSES=$(jq -r '.items[].metadata.name' "$INGRESSCLASSES_FILE" 2>/dev/null | tr '\n' ' ') +AVAILABLE_CLASSES_JSON=$(jq -c '[.items[].metadata.name]' "$INGRESSCLASSES_FILE" 2>/dev/null) DEFAULT_CLASS=$(jq -r '.items[] | select(.metadata.annotations."ingressclass.kubernetes.io/is-default-class" == "true") | .metadata.name' "$INGRESSCLASSES_FILE" 2>/dev/null) for INGRESS_NAME in $INGRESSES; do - # Get ingress info from pre-collected data INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null) - # Check spec.ingressClassName (new way) INGRESS_CLASS=$(echo "$INGRESS_INFO" | jq -r '.spec.ingressClassName // empty') + USED_DEPRECATED=false - # Check annotation (old way) if [[ -z "$INGRESS_CLASS" ]]; then INGRESS_CLASS=$(echo "$INGRESS_INFO" | jq -r 
'.metadata.annotations["kubernetes.io/ingress.class"] // empty') - if [[ -n "$INGRESS_CLASS" ]]; then print_info "Ingress $INGRESS_NAME: Using deprecated annotation (kubernetes.io/ingress.class)" + USED_DEPRECATED=true fi fi if [[ -z "$INGRESS_CLASS" ]]; then if [[ -n "$DEFAULT_CLASS" ]]; then print_success "Ingress $INGRESS_NAME: Using default IngressClass ($DEFAULT_CLASS)" + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg cls "$DEFAULT_CLASS" \ + '{ingress: $ing, ingress_class: $cls, source: "default"}') else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" print_error "Ingress $INGRESS_NAME: No IngressClass specified and no default found" print_action "Specify ingressClassName or set a default IngressClass" + FACT=$(jq -nc --arg ing "$INGRESS_NAME" \ + '{ingress: $ing, issue: "no_ingress_class_no_default"}') fi else - # Verify the class exists if echo "$AVAILABLE_CLASSES" | grep -qw "$INGRESS_CLASS"; then print_success "Ingress $INGRESS_NAME: IngressClass '$INGRESS_CLASS' is valid" + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg cls "$INGRESS_CLASS" --argjson dep "$USED_DEPRECATED" \ + '{ingress: $ing, ingress_class: $cls, used_deprecated_annotation: $dep, source: "explicit"}') else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" print_error "Ingress $INGRESS_NAME: IngressClass '$INGRESS_CLASS' not found" - if [[ -n "$AVAILABLE_CLASSES" ]]; then print_warning " Available classes: $AVAILABLE_CLASSES" else print_warning " No IngressClasses found in cluster" fi + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg cls "$INGRESS_CLASS" \ + --argjson available "$AVAILABLE_CLASSES_JSON" --argjson dep "$USED_DEPRECATED" \ + '{ingress: $ing, ingress_class: $cls, issue: "ingress_class_not_found", available_classes: $available, used_deprecated_annotation: $dep}') fi fi + add_fact CLASS_FACTS "$FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') 
+ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "All $INGRESS_COUNT ingress(es) have valid IngressClass configuration" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All $INGRESS_COUNT ingress(es) have valid IngressClass configuration" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array CLASS_FACTS)" --argjson count "$INGRESS_COUNT" --arg default "$DEFAULT_CLASS" \ + '{ingress_count: $count, default_class: $default, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) have IngressClass issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array CLASS_FACTS)" \ + --argjson available "$AVAILABLE_CLASSES_JSON" \ + --arg default "$DEFAULT_CLASS" \ + --argjson count "$INGRESS_COUNT" \ + '{ingress_count: $count, available_classes: $available, default_class: $default, ingresses: $facts}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Specify ingressClassName or set a default IngressClass"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/networking/ingress_controller_sync b/k8s/diagnose/networking/ingress_controller_sync index 1f0a40d6..ab29065d 100644 --- a/k8s/diagnose/networking/ingress_controller_sync +++ b/k8s/diagnose/networking/ingress_controller_sync @@ -2,15 +2,15 @@ # Check: Ingress Controller Sync # Verifies ALB ingress controller has synchronized successfully -# Validate ingresses exist require_ingresses || return 0 -# Read ingresses from pre-collected data INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +INGRESS_FACTS=() +AFFECTED_INGRESSES="" + -# Get ALB 
controller pods from pre-collected data ALB_CONTROLLER_PODS=$(jq -r '.items[].metadata.name' "$ALB_CONTROLLER_PODS_FILE" 2>/dev/null | tr '\n' ' ') if [[ -z "$ALB_CONTROLLER_PODS" ]]; then @@ -23,7 +23,6 @@ fi for INGRESS_NAME in $INGRESSES; do print_info "Checking sync status for ingress: $INGRESS_NAME" - # Get ingress events from pre-collected data - sorted by timestamp, most recent first INGRESS_EVENTS_JSON=$(jq --arg name "$INGRESS_NAME" --arg kind "Ingress" ' .items | map(select(.involvedObject.name == $name and .involvedObject.kind == $kind)) @@ -32,47 +31,62 @@ for INGRESS_NAME in $INGRESSES; do ' "$EVENTS_FILE" 2>/dev/null) EVENT_COUNT=$(echo "$INGRESS_EVENTS_JSON" | jq 'length' 2>/dev/null) + EVENT_SUMMARY="null" + INGRESS_HAS_ISSUE=0 + DETECTED_PROBLEMS="[]" + LATEST_ERROR_EVENTS="[]" if [[ "$EVENT_COUNT" -gt 0 ]]; then - # Get the most recent event NEWEST_EVENT=$(echo "$INGRESS_EVENTS_JSON" | jq -r 'first') EVENT_TYPE=$(echo "$NEWEST_EVENT" | jq -r '.type') EVENT_REASON=$(echo "$NEWEST_EVENT" | jq -r '.reason') EVENT_MESSAGE=$(echo "$NEWEST_EVENT" | jq -r '.message') EVENT_TIMESTAMP=$(echo "$NEWEST_EVENT" | jq -r '.lastTimestamp') - # Check for successful reconciliation first + EVENT_SUMMARY=$(jq -nc --arg t "$EVENT_TYPE" --arg r "$EVENT_REASON" --arg m "$EVENT_MESSAGE" --arg ts "$EVENT_TIMESTAMP" \ + '{type: $t, reason: $r, message: $m, last_timestamp: $ts}') + if [[ "$EVENT_REASON" == "SuccessfullyReconciled" ]]; then print_success " ✓ Successfully reconciled at $EVENT_TIMESTAMP" elif [[ "$EVENT_TYPE" == "Normal" ]] && echo "$EVENT_REASON" | grep -qiE "ensured|synced"; then print_success " ✓ Last event: $EVENT_REASON at $EVENT_TIMESTAMP" else - # Look for error/warning events in recent history ERROR_EVENTS=$(echo "$INGRESS_EVENTS_JSON" | jq -r ' .[] | select(.type == "Warning" or .type == "Error") | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)" ' | head -n 5) + LATEST_ERROR_EVENTS=$(echo "$INGRESS_EVENTS_JSON" | jq -c ' + [.[] + | select(.type 
== "Warning" or .type == "Error") + | {timestamp: .lastTimestamp, type: .type, reason: .reason, message: .message}] + | .[:5] + ') if [[ -n "$ERROR_EVENTS" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Found error/warning events:" echo "$ERROR_EVENTS" | sed 's/^/ /' - # Check for specific ALB errors in all error events ALL_ERROR_MESSAGES=$(echo "$INGRESS_EVENTS_JSON" | jq -r '.[] | select(.type == "Warning" or .type == "Error") | .message' 2>/dev/null) if echo "$ALL_ERROR_MESSAGES" | grep -qi "failed to reconcile"; then print_error " Issue: Failed to reconcile ingress" + DETECTED_PROBLEMS=$(echo "$DETECTED_PROBLEMS" | jq '. + ["failed_to_reconcile"]') fi - if echo "$ALL_ERROR_MESSAGES" | grep -qi "no available ip\|insufficient.*address"; then print_error " Issue: No available IPs in subnet (see alb_capacity_check)" + DETECTED_PROBLEMS=$(echo "$DETECTED_PROBLEMS" | jq '. + ["subnet_ip_exhaustion"]') fi - if echo "$ALL_ERROR_MESSAGES" | grep -qi "certificate\|tls.*secret"; then print_error " Issue: Certificate problem detected (see ingress_tls_configuration)" + DETECTED_PROBLEMS=$(echo "$DETECTED_PROBLEMS" | jq '. 
+ ["certificate_issue"]') fi + + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --argjson problems "$DETECTED_PROBLEMS" --argjson events "$LATEST_ERROR_EVENTS" \ + '{ingress: $ing, issue: "sync_errors", detected_problems: $problems, recent_events: $events}') + add_fact ISSUE_FACTS "$ISSUE" else print_info " Last event: $EVENT_REASON at $EVENT_TIMESTAMP" fi @@ -81,32 +95,37 @@ for INGRESS_NAME in $INGRESSES; do print_warning " No events found for this ingress" fi - # Check ALB controller logs if pods are found + CONTROLLER_LOG_ERRORS="[]" if [[ -n "$ALB_CONTROLLER_PODS" ]]; then for POD in $ALB_CONTROLLER_PODS; do - # Get recent logs related to this ingress from pre-collected logs LOG_FILE="$ALB_CONTROLLER_LOGS_DIR/${POD}.log" if [[ -f "$LOG_FILE" ]] && [[ -r "$LOG_FILE" ]]; then - # Use tail to limit log size and grep with line-buffered to avoid memory issues - # Skip lines longer than 10000 chars to avoid processing massive JSON lines CONTROLLER_LOGS=$(tail -n 500 "$LOG_FILE" 2>/dev/null | \ awk 'length <= 10000' 2>/dev/null | \ grep -iF "$INGRESS_NAME" 2>/dev/null || true) if [[ -n "$CONTROLLER_LOGS" ]]; then - # Look for errors in controller logs (excluding "successfully built model" info logs) ERROR_LOGS=$(echo "$CONTROLLER_LOGS" | \ grep -ivE "successfully built model|successfully reconciled" 2>/dev/null | \ grep -iE "level.*error|level.*warn|failed|warning" 2>/dev/null | \ head -n 5 || true) if [[ -n "$ERROR_LOGS" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Found errors in ALB controller logs:" - # Safely print error logs with proper error handling and truncation if ! echo "$ERROR_LOGS" | head -n 3 2>/dev/null | cut -c1-200 2>/dev/null | sed 's/^/ /' 2>/dev/null; then print_warning " [Error logs could not be displayed due to formatting issues]" fi + + # Add up to 3 truncated error log lines to facts + TRUNC_LOGS=$(echo "$ERROR_LOGS" | head -n 3 | cut -c1-200 | jq -R . | jq -s .) 
+ CONTROLLER_LOG_ERRORS=$(echo "$CONTROLLER_LOG_ERRORS" | jq --arg pod "$POD" --argjson lines "$TRUNC_LOGS" \ + '. + [{pod: $pod, lines: $lines}]') + + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg pod "$POD" --argjson lines "$TRUNC_LOGS" \ + '{ingress: $ing, issue: "controller_log_errors", controller_pod: $pod, log_lines: $lines}') + add_fact ISSUE_FACTS "$ISSUE" else print_success " No errors in ALB controller logs for this ingress" fi @@ -119,22 +138,46 @@ for INGRESS_NAME in $INGRESSES; do done fi - # Check ingress status/address from pre-collected data INGRESS_ADDRESS=$(jq -r --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name) | .status.loadBalancer.ingress[0].hostname // empty' "$INGRESSES_FILE" 2>/dev/null) if [[ -z "$INGRESS_ADDRESS" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " ALB address not assigned yet (sync may be in progress or failing)" print_action "Check ingress controller logs and verify backend services are healthy" + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" '{ingress: $ing, issue: "alb_address_not_assigned"}') + add_fact ISSUE_FACTS "$ISSUE" else print_success " ALB address assigned: $INGRESS_ADDRESS" fi + + INGRESS_FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg addr "$INGRESS_ADDRESS" \ + --argjson event "$EVENT_SUMMARY" --argjson detected "$DETECTED_PROBLEMS" \ + --argjson controller_errors "$CONTROLLER_LOG_ERRORS" --argjson recent "$LATEST_ERROR_EVENTS" \ + '{ingress: $ing, address: (if $addr == "" then null else $addr end), latest_event: $event, detected_problems: $detected, controller_log_errors: $controller_errors, recent_error_events: $recent}') + add_fact INGRESS_FACTS "$INGRESS_FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "All $INGRESS_COUNT 
ingress(es) synchronized successfully with controller" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All $INGRESS_COUNT ingress(es) synchronized successfully" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array INGRESS_FACTS)" --argjson count "$INGRESS_COUNT" '{ingress_count: $count, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) have controller sync issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array INGRESS_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$INGRESS_COUNT" \ + '{ingress_count: $count, issue_count: ($issues | length), ingresses: $facts, issues: $issues}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Check ingress controller logs and verify backend services are healthy"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/networking/ingress_existence b/k8s/diagnose/networking/ingress_existence index 48654bcc..a14871ec 100644 --- a/k8s/diagnose/networking/ingress_existence +++ b/k8s/diagnose/networking/ingress_existence @@ -2,24 +2,39 @@ # Check: Ingress Existence # Verifies that ingress resources exist in the namespace -# Read ingresses from pre-collected data INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') if [[ -z "$INGRESSES" ]]; then print_error "No ingresses found with labels $SCOPE_LABEL_SELECTOR in namespace $NAMESPACE" print_action "Create ingress resource to expose services externally" - update_check_result --status "failed" --evidence "{}" + + EVIDENCE=$(evidence_json \ + "No ingresses found in namespace $NAMESPACE" \ + "critical" \ + "[]" \ + "$(jq -nc --arg ls "$SCOPE_LABEL_SELECTOR" 
--arg ns "$NAMESPACE" '{label_selector: $ls, namespace: $ns}')" \ + '["Create ingress resource to expose services externally"]') + update_check_result --status "failed" --evidence "$EVIDENCE" return 1 fi -INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') print_success "Found $INGRESS_COUNT ingress(es): $INGRESSES" +# Build hosts info per ingress +INGRESS_DETAILS=$(jq -c '[.items[] | {name: .metadata.name, hosts: [.spec.rules[]?.host // empty]}]' "$INGRESSES_FILE" 2>/dev/null) + # Show basic ingress info for INGRESS_NAME in $INGRESSES; do - # Get hosts from pre-collected data HOSTS=$(jq -r --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name) | .spec.rules[].host' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') print_info " $INGRESS_NAME hosts: $HOSTS" done -update_check_result --status "success" --evidence "{}" \ No newline at end of file +EVIDENCE=$(evidence_json \ + "Found $INGRESS_COUNT ingress(es) in namespace $NAMESPACE" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$INGRESS_COUNT" --argjson ingresses "$INGRESS_DETAILS" --arg ns "$NAMESPACE" \ + '{ingress_count: $count, ingresses: $ingresses, namespace: $ns}')" \ + "[]") +update_check_result --status "success" --evidence "$EVIDENCE" diff --git a/k8s/diagnose/networking/ingress_host_rules b/k8s/diagnose/networking/ingress_host_rules index 25fd223f..09e6b89b 100644 --- a/k8s/diagnose/networking/ingress_host_rules +++ b/k8s/diagnose/networking/ingress_host_rules @@ -2,20 +2,20 @@ # Check: Ingress Host Rules # Verifies host and path rules are properly configured -# Validate ingresses exist require_ingresses || return 0 -# Get ingresses INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +INGRESS_FACTS=() +AFFECTED_INGRESSES="" + for INGRESS_NAME in $INGRESSES; do INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null) 
print_info "Checking host rules for ingress: $INGRESS_NAME" - # Get ingress address/status INGRESS_ADDRESS=$(echo "$INGRESS_INFO" | jq -r '.status.loadBalancer.ingress[0].ip // .status.loadBalancer.ingress[0].hostname // empty') if [[ -z "$INGRESS_ADDRESS" ]]; then @@ -24,84 +24,121 @@ for INGRESS_NAME in $INGRESSES; do print_info " Ingress address: $INGRESS_ADDRESS" fi - # Check if there are any rules RULE_COUNT=$(echo "$INGRESS_INFO" | jq '.spec.rules | length' 2>/dev/null) + DEFAULT_BACKEND=$(echo "$INGRESS_INFO" | jq -r '.spec.defaultBackend.service.name // empty') - if [[ "$RULE_COUNT" -eq 0 ]]; then - # Check for default backend - DEFAULT_BACKEND=$(echo "$INGRESS_INFO" | jq -r '.spec.defaultBackend.service.name // empty') + INGRESS_RULE_FACTS=() + INGRESS_HAS_ISSUE=0 + if [[ "$RULE_COUNT" -eq 0 ]]; then if [[ -n "$DEFAULT_BACKEND" ]]; then print_success " Catch-all rule using default backend: $DEFAULT_BACKEND" else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " No rules and no default backend configured" print_action "Add at least one rule or configure default backend" - fi - continue - fi - - # Check each rule - RULES=$(echo "$INGRESS_INFO" | jq -c '.spec.rules[]' 2>/dev/null) - # Use process substitution to avoid subshell and preserve HAS_ISSUES updates - while read -r RULE; do - HOST=$(echo "$RULE" | jq -r '.host // "*"') - - # Check if host is defined - if [[ "$HOST" == "*" ]]; then - print_warning " Host: * (catch-all, consider specifying a hostname)" - else - print_success " Host: $HOST" + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" '{ingress: $ing, issue: "no_rules_no_default_backend"}') + add_fact ISSUE_FACTS "$ISSUE" fi + else + RULES=$(echo "$INGRESS_INFO" | jq -c '.spec.rules[]' 2>/dev/null) - # Check paths - PATHS=$(echo "$RULE" | jq -c '.http.paths[]' 2>/dev/null) + while read -r RULE; do + HOST=$(echo "$RULE" | jq -r '.host // "*"') - if [[ -z "$PATHS" ]]; then - HAS_ISSUES=1 - print_error " No 
paths defined for host $HOST" - print_action "Define at least one path for this host" - continue - fi + if [[ "$HOST" == "*" ]]; then + print_warning " Host: * (catch-all, consider specifying a hostname)" + else + print_success " Host: $HOST" + fi - # Use process substitution to avoid subshell and preserve HAS_ISSUES updates - while read -r PATH_RULE; do - PATH_VALUE=$(echo "$PATH_RULE" | jq -r '.path // "/"') - PATH_TYPE=$(echo "$PATH_RULE" | jq -r '.pathType // "Prefix"') - BACKEND_SERVICE=$(echo "$PATH_RULE" | jq -r '.backend.service.name') - BACKEND_PORT=$(echo "$PATH_RULE" | jq -r '.backend.service.port.number // .backend.service.port.name') + PATHS=$(echo "$RULE" | jq -c '.http.paths[]' 2>/dev/null) + HOST_PATH_FACTS=() - print_info " Path: $PATH_VALUE ($PATH_TYPE) -> $BACKEND_SERVICE:$BACKEND_PORT" + if [[ -z "$PATHS" ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + print_error " No paths defined for host $HOST" + print_action "Define at least one path for this host" - # Validate pathType - if [[ "$PATH_TYPE" != "Exact" && "$PATH_TYPE" != "Prefix" && "$PATH_TYPE" != "ImplementationSpecific" ]]; then - HAS_ISSUES=1 - print_error " Invalid pathType: $PATH_TYPE (must be Exact, Prefix, or ImplementationSpecific)" - print_action "Use valid pathType value" - fi + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg host "$HOST" \ + '{ingress: $ing, host: $host, issue: "no_paths_for_host"}') + add_fact ISSUE_FACTS "$ISSUE" - # Warn about path conventions - if [[ "$PATH_TYPE" == "Prefix" && "$PATH_VALUE" != "/" && ! 
"$PATH" =~ ^/.*[^/]$ ]]; then - print_warning " Path ends with '/' - this may cause routing issues with Prefix type" + RULE_FACT=$(jq -nc --arg host "$HOST" '{host: $host, paths: []}') + add_fact INGRESS_RULE_FACTS "$RULE_FACT" + continue fi - done < <(echo "$PATHS") - done < <(echo "$RULES") - # Check for conflicting rules - HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.rules[].host' 2>/dev/null | sort) - DUPLICATE_HOSTS=$(echo "$HOSTS" | uniq -d) - - if [[ -n "$DUPLICATE_HOSTS" ]]; then - print_warning " Duplicate host rules found: $DUPLICATE_HOSTS" - print_info " Multiple path rules for the same host are OK, but verify they don't conflict" + while read -r PATH_RULE; do + PATH_VALUE=$(echo "$PATH_RULE" | jq -r '.path // "/"') + PATH_TYPE=$(echo "$PATH_RULE" | jq -r '.pathType // "Prefix"') + BACKEND_SERVICE=$(echo "$PATH_RULE" | jq -r '.backend.service.name') + BACKEND_PORT=$(echo "$PATH_RULE" | jq -r '.backend.service.port.number // .backend.service.port.name') + + print_info " Path: $PATH_VALUE ($PATH_TYPE) -> $BACKEND_SERVICE:$BACKEND_PORT" + + PATH_VALID=true + if [[ "$PATH_TYPE" != "Exact" && "$PATH_TYPE" != "Prefix" && "$PATH_TYPE" != "ImplementationSpecific" ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + PATH_VALID=false + print_error " Invalid pathType: $PATH_TYPE (must be Exact, Prefix, or ImplementationSpecific)" + print_action "Use valid pathType value" + + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg host "$HOST" --arg path "$PATH_VALUE" --arg pt "$PATH_TYPE" \ + '{ingress: $ing, host: $host, path: $path, path_type: $pt, issue: "invalid_path_type"}') + add_fact ISSUE_FACTS "$ISSUE" + fi + + PATH_FACT=$(jq -nc --arg path "$PATH_VALUE" --arg pt "$PATH_TYPE" \ + --arg svc "$BACKEND_SERVICE" --arg port "$BACKEND_PORT" --argjson valid "$PATH_VALID" \ + '{path: $path, path_type: $pt, backend_service: $svc, backend_port: $port, valid: $valid}') + add_fact HOST_PATH_FACTS "$PATH_FACT" + done < <(echo "$PATHS") + + 
RULE_FACT=$(jq -nc --arg host "$HOST" --argjson paths "$(facts_to_json_array HOST_PATH_FACTS)" \ + '{host: $host, paths: $paths}') + add_fact INGRESS_RULE_FACTS "$RULE_FACT" # paths above is the whole HOST_PATH_FACTS array serialized to JSON, not just element 0 + done < <(echo "$RULES") + + HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.rules[].host' 2>/dev/null | sort) + DUPLICATE_HOSTS=$(echo "$HOSTS" | uniq -d) + + if [[ -n "$DUPLICATE_HOSTS" ]]; then + print_warning " Duplicate host rules found: $DUPLICATE_HOSTS" + print_info " Multiple path rules for the same host are OK, but verify they don't conflict" + fi fi + + INGRESS_FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg addr "$INGRESS_ADDRESS" --arg backend "$DEFAULT_BACKEND" \ + --argjson rules "$(facts_to_json_array INGRESS_RULE_FACTS)" \ + '{ingress: $ing, address: $addr, default_backend: (if $backend == "" then null else $backend end), rules: $rules}') + add_fact INGRESS_FACTS "$INGRESS_FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "Host and path rules valid for all $INGRESS_COUNT ingress(es)" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "Host and path rules valid for all $INGRESS_COUNT ingress(es)" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array INGRESS_FACTS)" --argjson count "$INGRESS_COUNT" '{ingress_count: $count, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) have host/path rule issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array INGRESS_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$INGRESS_COUNT" \ + '{ingress_count: $count, issue_count: ($issues | length), ingresses: $facts, issues: $issues}') +
EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Add at least one rule or configure default backend; use valid pathType values"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/networking/ingress_tls_configuration b/k8s/diagnose/networking/ingress_tls_configuration index 1062a5d0..1ffdc893 100644 --- a/k8s/diagnose/networking/ingress_tls_configuration +++ b/k8s/diagnose/networking/ingress_tls_configuration @@ -2,92 +2,126 @@ # Check: Ingress TLS Configuration # Validates TLS/SSL certificate configuration -# Validate ingresses exist require_ingresses || return 0 -# Get ingresses INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +TLS_FACTS=() +AFFECTED_INGRESSES="" + for INGRESS_NAME in $INGRESSES; do INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null) - # Check if TLS is configured TLS_HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.tls[]?.hosts[]?' 
2>/dev/null) if [[ -z "$TLS_HOSTS" ]]; then print_info "Ingress $INGRESS_NAME: No TLS configuration (HTTP only)" + FACT=$(jq -nc --arg ing "$INGRESS_NAME" '{ingress: $ing, tls_configured: false}') + add_fact TLS_FACTS "$FACT" continue fi print_info "Checking TLS configuration for ingress: $INGRESS_NAME" - - # Get TLS secrets TLS_SECRETS=$(echo "$INGRESS_INFO" | jq -r '.spec.tls[] | "\(.secretName):\(.hosts | join(","))"' 2>/dev/null) - # Use process substitution to avoid subshell and preserve HAS_ISSUES updates + SECRETS_INFO="[]" + INGRESS_HAS_ISSUE=0 + while IFS=':' read -r SECRET_NAME HOSTS; do - # Check if secret exists in pre-collected data SECRET_INFO=$(jq --arg name "$SECRET_NAME" '.items[] | select(.metadata.name == $name)' "$SECRETS_FILE" 2>/dev/null) if [[ -n "$SECRET_INFO" && "$SECRET_INFO" != "null" ]]; then SECRET_TYPE=$(echo "$SECRET_INFO" | jq -r '.type') if [[ "$SECRET_TYPE" == "kubernetes.io/tls" ]]; then - # Check if secret has required keys (metadata only, no actual data) - SECRET_KEYS=$(echo "$SECRET_INFO" | jq -r '.metadata.annotations | keys[]' 2>/dev/null) - - HAS_CERT=$(echo "$SECRET_KEYS" | grep -q "tls.crt" && echo "yes" || echo "no") - HAS_KEY=$(echo "$SECRET_KEYS" | grep -q "tls.key" && echo "yes" || echo "no") - - if [[ "$HAS_CERT" == "yes" && "$HAS_KEY" == "yes" ]]; then - print_success " TLS Secret: $SECRET_NAME (valid for hosts: $HOSTS)" - - # Optional: Check certificate expiration (requires openssl) - if command -v openssl &>/dev/null; then - CERT_DATA=$(kubectl get secret "$SECRET_NAME" -n "$NAMESPACE" -o jsonpath='{.data.tls\.crt}' 2>/dev/null | base64 -d 2>/dev/null) - - if [[ -n "$CERT_DATA" ]]; then - EXPIRY=$(echo "$CERT_DATA" | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2) - - if [[ -n "$EXPIRY" ]]; then - print_info " Certificate expires: $EXPIRY" - - # Check if certificate is expired or expiring soon (30 days) - EXPIRY_EPOCH=$(date -d "$EXPIRY" +%s 2>/dev/null || date -j -f "%b %d %T %Y %Z" "$EXPIRY" +%s 
2>/dev/null) - CURRENT_EPOCH=$(date +%s) - DAYS_UNTIL_EXPIRY=$(( ($EXPIRY_EPOCH - $CURRENT_EPOCH) / 86400 )) - - if [[ $DAYS_UNTIL_EXPIRY -lt 0 ]]; then - HAS_ISSUES=1 - print_error " Certificate has EXPIRED" - elif [[ $DAYS_UNTIL_EXPIRY -lt 30 ]]; then - print_warning " Certificate expires in $DAYS_UNTIL_EXPIRY days" - fi + # Note: build_context strips .data for security, so we cannot inspect cert/key keys here. + # The Secret type kubernetes.io/tls inherently requires tls.crt and tls.key — k8s validates + # this at creation time. So we trust the type and validate the rest live if needed. + print_success " TLS Secret: $SECRET_NAME (valid for hosts: $HOSTS)" + + EXPIRY_INFO="null" + DAYS_UNTIL_EXPIRY="" + + if command -v openssl &>/dev/null; then + CERT_DATA=$(kubectl get secret "$SECRET_NAME" -n "$NAMESPACE" -o jsonpath='{.data.tls\.crt}' 2>/dev/null | base64 -d 2>/dev/null) + if [[ -n "$CERT_DATA" ]]; then + EXPIRY=$(echo "$CERT_DATA" | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2) + if [[ -n "$EXPIRY" ]]; then + print_info " Certificate expires: $EXPIRY" + EXPIRY_EPOCH=$(date -d "$EXPIRY" +%s 2>/dev/null || date -j -f "%b %d %T %Y %Z" "$EXPIRY" +%s 2>/dev/null) + CURRENT_EPOCH=$(date +%s) + DAYS_UNTIL_EXPIRY=$(( (EXPIRY_EPOCH - CURRENT_EPOCH) / 86400 )) + EXPIRY_INFO=$(jq -nc --arg expiry "$EXPIRY" --argjson days "$DAYS_UNTIL_EXPIRY" \ + '{expires: $expiry, days_remaining: $days}') + + if [[ $DAYS_UNTIL_EXPIRY -lt 0 ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + print_error " Certificate has EXPIRED" + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg secret "$SECRET_NAME" \ + '{ingress: $ing, secret: $secret, issue: "certificate_expired"}') + add_fact ISSUE_FACTS "$ISSUE" + elif [[ $DAYS_UNTIL_EXPIRY -lt 30 ]]; then + print_warning " Certificate expires in $DAYS_UNTIL_EXPIRY days" fi fi fi - else - HAS_ISSUES=1 - print_error " TLS Secret: $SECRET_NAME missing required keys (needs tls.crt and tls.key)" fi + + 
SECRET_FACT=$(jq -nc --arg secret "$SECRET_NAME" --arg hosts "$HOSTS" --arg type "$SECRET_TYPE" \ + --argjson expiry "$EXPIRY_INFO" \ + '{secret: $secret, hosts: ($hosts | split(",")), type: $type, valid: true, certificate: $expiry}') else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " TLS Secret: $SECRET_NAME has wrong type '$SECRET_TYPE' (expected kubernetes.io/tls)" + SECRET_FACT=$(jq -nc --arg secret "$SECRET_NAME" --arg type "$SECRET_TYPE" \ + '{secret: $secret, type: $type, valid: false, issue: "wrong_secret_type"}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg secret "$SECRET_NAME" --arg type "$SECRET_TYPE" \ + '{ingress: $ing, secret: $secret, secret_type: $type, issue: "wrong_secret_type"}') + add_fact ISSUE_FACTS "$ISSUE" fi else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " TLS Secret: '$SECRET_NAME' not found in namespace" print_action "Create TLS secret or update ingress configuration" + SECRET_FACT=$(jq -nc --arg secret "$SECRET_NAME" \ + '{secret: $secret, valid: false, issue: "secret_not_found"}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg secret "$SECRET_NAME" \ + '{ingress: $ing, secret: $secret, issue: "secret_not_found"}') + add_fact ISSUE_FACTS "$ISSUE" fi + SECRETS_INFO=$(jq -c --argjson s "$SECRET_FACT" '.+[$s]' <<<"$SECRETS_INFO") # SECRETS_INFO is a JSON-array string (initialised to "[]"), so append with jq rather than the bash-array helper done < <(echo "$TLS_SECRETS") + + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --argjson secrets "$SECRETS_INFO" \ + '{ingress: $ing, tls_configured: true, secrets: $secrets}') + add_fact TLS_FACTS "$FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "TLS configuration valid for all $INGRESS_COUNT ingress(es)" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "TLS configuration valid for all
$INGRESS_COUNT ingress(es)" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array TLS_FACTS)" --argjson count "$INGRESS_COUNT" '{ingress_count: $count, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) have TLS issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array TLS_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$INGRESS_COUNT" \ + '{ingress_count: $count, issue_count: ($issues | length), ingresses: $facts, issues: $issues}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Create or fix TLS secrets and ensure they are of type kubernetes.io/tls"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/scope/container_crash_detection b/k8s/diagnose/scope/container_crash_detection index 8a8121c3..04618b7a 100644 --- a/k8s/diagnose/scope/container_crash_detection +++ b/k8s/diagnose/scope/container_crash_detection @@ -8,70 +8,140 @@ require_pods || return 0 # Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_CRASHES=0 +CRASH_LOOP_FACTS=() +TERMINATED_FACTS=() +HIGH_RESTART_FACTS=() +AFFECTED_PODS="" + +HAS_ACTION_CHECK_LOGS=0 +HAS_ACTION_CHECK_TERMINATION=0 +HAS_ACTION_CHECK_INTERMITTENT=0 + +NUM_OOM=0 +NUM_APP_ERROR=0 for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) - # Check for containers in crash states + # ----- CrashLoopBackOff ----- CRASH_LOOP=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? 
| select(.state.waiting.reason == "CrashLoopBackOff") | .name') if [[ -n "$CRASH_LOOP" ]]; then - HAS_CRASHES=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: CrashLoopBackOff in container(s): $CRASH_LOOP" for CONTAINER in $CRASH_LOOP; do RESTART_COUNT=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .restartCount") EXIT_CODE=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .lastState.terminated.exitCode // \"N/A\"") TERMINATION_REASON=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .lastState.terminated.reason // \"Unknown\"") + EXIT_MEANING=$(exit_code_meaning "$EXIT_CODE") print_warning " Container: $CONTAINER | Restarts: $RESTART_COUNT | Exit Code: $EXIT_CODE | Reason: $TERMINATION_REASON" - - case "$EXIT_CODE" in - 137) print_warning " Exit 137 = OOMKilled (out of memory)" ;; - 143) print_warning " Exit 143 = SIGTERM (graceful termination)" ;; - 1) print_warning " Exit 1 = Application error" ;; - 139) print_warning " Exit 139 = SIGSEGV (segmentation fault)" ;; - esac + [[ "$EXIT_MEANING" != "Unknown" ]] && print_warning " Exit $EXIT_CODE = $EXIT_MEANING" + + [[ "$EXIT_CODE" == "137" ]] && NUM_OOM=$((NUM_OOM + 1)) + [[ "$EXIT_CODE" == "1" ]] && NUM_APP_ERROR=$((NUM_APP_ERROR + 1)) + + # CrashLoopBackOff: the useful logs are from the previous container + # instance. The current instance is in waiting state with no logs yet. 
+ PREVIOUS_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER" "previous") + CURRENT_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER" "current") + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg container "$CONTAINER" \ + --argjson restarts "${RESTART_COUNT:-0}" \ + --arg exit_code "$EXIT_CODE" \ + --arg exit_meaning "$EXIT_MEANING" \ + --arg reason "$TERMINATION_REASON" \ + --argjson previous_logs "$PREVIOUS_LOGS" \ + --argjson current_logs "$CURRENT_LOGS" \ + '{ + pod: $pod, + container: $container, + restart_count: $restarts, + exit_code: $exit_code, + exit_code_meaning: $exit_meaning, + termination_reason: $reason, + previous_logs: $previous_logs, + current_logs: $current_logs + }') + add_fact CRASH_LOOP_FACTS "$FACT" done print_info "Last logs from $POD_NAME:" - kubectl logs "$POD_NAME" -n "$NAMESPACE" --tail=10 2>&1 | sed 's/^/ /' + # Logs are now pre-collected by build_context for problematic pods. + # Fall back to a live kubectl call only if the snapshot is missing. + PRINTED_LOGS=0 + for CONTAINER in $CRASH_LOOP; do + LOG_FILE="$POD_LOGS_DIR/${POD_NAME}.${CONTAINER}.log" + if [[ -s "$LOG_FILE" ]]; then + tail -n 10 "$LOG_FILE" | sed "s|^| [$CONTAINER] |" + PRINTED_LOGS=1 + fi + done + if [[ $PRINTED_LOGS -eq 0 ]]; then + kubectl logs "$POD_NAME" -n "$NAMESPACE" --tail=10 2>&1 | sed 's/^/ /' + fi print_action "Check container logs and fix application startup issues" + HAS_ACTION_CHECK_LOGS=1 fi - # Check for containers that terminated but haven't restarted yet + # ----- Terminated (state.terminated) ----- TERMINATED_CONTAINERS=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? 
| select(.state.terminated) | .name') if [[ -n "$TERMINATED_CONTAINERS" ]]; then - HAS_CRASHES=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: Terminated container(s): $TERMINATED_CONTAINERS" for CONTAINER in $TERMINATED_CONTAINERS; do EXIT_CODE=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .state.terminated.exitCode // \"N/A\"") TERMINATION_REASON=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .state.terminated.reason // \"Unknown\"") RESTART_COUNT=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .restartCount") + EXIT_MEANING=$(exit_code_meaning "$EXIT_CODE") print_warning " Container: $CONTAINER | Exit Code: $EXIT_CODE | Reason: $TERMINATION_REASON | Restarts: $RESTART_COUNT" - - case "$EXIT_CODE" in - 137) print_warning " Exit 137 = OOMKilled (out of memory)" ;; - 143) print_warning " Exit 143 = SIGTERM (graceful termination)" ;; - 1) print_warning " Exit 1 = Application error" ;; - 139) print_warning " Exit 139 = SIGSEGV (segmentation fault)" ;; - 0) print_info " Exit 0 = Clean exit (container finished successfully)" ;; - esac + [[ "$EXIT_MEANING" != "Unknown" ]] && print_warning " Exit $EXIT_CODE = $EXIT_MEANING" + + [[ "$EXIT_CODE" == "137" ]] && NUM_OOM=$((NUM_OOM + 1)) + [[ "$EXIT_CODE" == "1" ]] && NUM_APP_ERROR=$((NUM_APP_ERROR + 1)) + + # state.terminated (not waiting): current logs are from the still- + # terminated instance; previous matters if the container restarted before. 
+ CURRENT_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER" "current") + PREVIOUS_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER" "previous") + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg container "$CONTAINER" \ + --argjson restarts "${RESTART_COUNT:-0}" \ + --arg exit_code "$EXIT_CODE" \ + --arg exit_meaning "$EXIT_MEANING" \ + --arg reason "$TERMINATION_REASON" \ + --argjson current_logs "$CURRENT_LOGS" \ + --argjson previous_logs "$PREVIOUS_LOGS" \ + '{ + pod: $pod, + container: $container, + restart_count: $restarts, + exit_code: $exit_code, + exit_code_meaning: $exit_meaning, + termination_reason: $reason, + current_logs: $current_logs, + previous_logs: $previous_logs + }') + add_fact TERMINATED_FACTS "$FACT" done print_action "Check why container terminated and review logs" + HAS_ACTION_CHECK_TERMINATION=1 fi - # Check for containers with high restart counts (even if currently running) + # ----- High restart count (currently running but unstable) ----- HIGH_RESTART_CONTAINERS=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? | select(.restartCount >= 3) | "\(.name):\(.restartCount)"') if [[ -n "$HIGH_RESTART_CONTAINERS" ]]; then - HAS_CRASHES=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_warning "Pod $POD_NAME: Container(s) with high restart count:" while IFS= read -r CONTAINER_INFO; do @@ -80,18 +150,98 @@ for POD_NAME in $PODS; do LAST_EXIT_CODE=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER_NAME\") | .lastState.terminated.exitCode // \"N/A\"") LAST_REASON=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER_NAME\") | .lastState.terminated.reason // \"Unknown\"") + LAST_EXIT_MEANING=$(exit_code_meaning "$LAST_EXIT_CODE") print_warning " Container: $CONTAINER_NAME | Restarts: $RESTART_COUNT | Last Exit: $LAST_EXIT_CODE | Reason: $LAST_REASON" + + # High restart count: container running OK now but crashed multiple + # times before. Previous logs are from the most recent crash. 
+ PREVIOUS_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER_NAME" "previous") + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg container "$CONTAINER_NAME" \ + --argjson restarts "${RESTART_COUNT:-0}" \ + --arg last_exit "$LAST_EXIT_CODE" \ + --arg last_meaning "$LAST_EXIT_MEANING" \ + --arg last_reason "$LAST_REASON" \ + --argjson previous_logs "$PREVIOUS_LOGS" \ + '{ + pod: $pod, + container: $container, + restart_count: $restarts, + last_exit_code: $last_exit, + last_exit_code_meaning: $last_meaning, + last_termination_reason: $last_reason, + previous_logs: $previous_logs + }') + add_fact HIGH_RESTART_FACTS "$FACT" done <<< "$HIGH_RESTART_CONTAINERS" print_action "Container has restarted multiple times - check for intermittent issues" + HAS_ACTION_CHECK_INTERMITTENT=1 fi done -if [[ $HAS_CRASHES -eq 0 ]]; then - POD_COUNT=$(echo "$PODS" | wc -w) +POD_COUNT=$(jq '.items | length' "$PODS_FILE") +NUM_CRASH_LOOP=${#CRASH_LOOP_FACTS[@]} +NUM_TERMINATED=${#TERMINATED_FACTS[@]} +NUM_HIGH_RESTART=${#HIGH_RESTART_FACTS[@]} +TOTAL_FINDINGS=$((NUM_CRASH_LOOP + NUM_TERMINATED + NUM_HIGH_RESTART)) +AFFECTED_COUNT=$(echo $AFFECTED_PODS | wc -w | tr -d ' ') + +CRASH_LOOP_FACTS_JSON=$(facts_to_json_array CRASH_LOOP_FACTS) +TERMINATED_FACTS_JSON=$(facts_to_json_array TERMINATED_FACTS) +HIGH_RESTART_FACTS_JSON=$(facts_to_json_array HIGH_RESTART_FACTS) +AFFECTED_PODS_JSON=$(set_to_json_array AFFECTED_PODS) + +if [[ $TOTAL_FINDINGS -eq 0 ]]; then print_success "All $POD_COUNT pod(s) running without crashes or errors" - update_check_result --status "success" --evidence "{}" + + EVIDENCE=$(evidence_json \ + "All $POD_COUNT pod(s) running without crashes or errors" \ + "info" \ + "[]" \ + "$(jq -nc --argjson total "$POD_COUNT" '{pods_checked: $total}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" + SUMMARY_PARTS=() + [[ $NUM_OOM -gt 0 ]] && SUMMARY_PARTS+=("$NUM_OOM OOMKilled") + [[ 
$NUM_APP_ERROR -gt 0 ]] && SUMMARY_PARTS+=("$NUM_APP_ERROR app error") + [[ $NUM_CRASH_LOOP -gt 0 ]] && SUMMARY_PARTS+=("$NUM_CRASH_LOOP CrashLoopBackOff") + [[ $NUM_TERMINATED -gt 0 ]] && SUMMARY_PARTS+=("$NUM_TERMINATED terminated") + [[ $NUM_HIGH_RESTART -gt 0 ]] && SUMMARY_PARTS+=("$NUM_HIGH_RESTART high-restart") + SUMMARY_DETAIL=$(printf '%s, ' "${SUMMARY_PARTS[@]}"); SUMMARY_DETAIL=${SUMMARY_DETAIL%, } # "${arr[*]}" joins with only the first IFS character, so build the ", " separator explicitly and trim the trailing one + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) crashing — $SUMMARY_DETAIL" + + ACTIONS_ARR=() + [[ $HAS_ACTION_CHECK_LOGS -eq 1 ]] && ACTIONS_ARR+=("Check container logs and fix application startup issues") + [[ $HAS_ACTION_CHECK_TERMINATION -eq 1 ]] && ACTIONS_ARR+=("Check why container terminated and review logs") + [[ $HAS_ACTION_CHECK_INTERMITTENT -eq 1 ]] && ACTIONS_ARR+=("Container has restarted multiple times - check for intermittent issues") + ACTIONS_JSON=$(printf '%s\n' "${ACTIONS_ARR[@]}" | lines_to_json_array) + + DETAILS=$(jq -nc \ + --argjson crash_loop "$CRASH_LOOP_FACTS_JSON" \ + --argjson terminated "$TERMINATED_FACTS_JSON" \ + --argjson high_restart "$HIGH_RESTART_FACTS_JSON" \ + --argjson pod_count "$POD_COUNT" \ + --argjson oom_count "$NUM_OOM" \ + --argjson app_error_count "$NUM_APP_ERROR" \ + '{ + pod_count: $pod_count, + counts: { + crash_loop_back_off: ($crash_loop | length), + terminated: ($terminated | length), + high_restart: ($high_restart | length), + oom_killed: $oom_count, + application_error: $app_error_count + }, + crash_loop_back_off: $crash_loop, + terminated: $terminated, + high_restart: $high_restart + }') + + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$AFFECTED_PODS_JSON" "$DETAILS" "$ACTIONS_JSON") + update_check_result --status "failed" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/scope/container_port_health b/k8s/diagnose/scope/container_port_health index 78152ee9..37cdc344 100755 --- a/k8s/diagnose/scope/container_port_health +++ b/k8s/diagnose/scope/container_port_health @@ -2,52 +2,54 @@ # Check: Container Port Health # Validates that
containers are actually listening on their declared ports -# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_PORT_ISSUES=0 CONTAINERS_TESTED=0 CONTAINERS_SKIPPED=0 +ISSUE_FACTS=() +POD_FACTS=() +AFFECTED_PODS="" + for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) - # Check if pod is running POD_PHASE=$(echo "$POD_INFO" | jq -r '.status.phase') if [[ "$POD_PHASE" != "Running" ]]; then print_warning "Pod $POD_NAME: Not running (phase: $POD_PHASE), skipping port checks" + FACT=$(jq -nc --arg p "$POD_NAME" --arg phase "$POD_PHASE" '{pod: $p, status: "skipped", reason: "not_running", phase: $phase}') + add_fact POD_FACTS "$FACT" continue fi - # Get pod IP POD_IP=$(echo "$POD_INFO" | jq -r '.status.podIP') if [[ -z "$POD_IP" || "$POD_IP" == "null" ]]; then print_warning "Pod $POD_NAME: No IP assigned, skipping port checks" + FACT=$(jq -nc --arg p "$POD_NAME" '{pod: $p, status: "skipped", reason: "no_ip"}') + add_fact POD_FACTS "$FACT" continue fi print_info "Checking pod $POD_NAME:" - # Get all containers with their ports CONTAINERS=$(echo "$POD_INFO" | jq -r '.spec.containers[] | @base64') + POD_CONTAINER_FACTS=() for CONTAINER_B64 in $CONTAINERS; do CONTAINER_DATA=$(echo "$CONTAINER_B64" | base64 -d) CONTAINER_NAME=$(echo "$CONTAINER_DATA" | jq -r '.name') - # Check container status before testing ports CONTAINER_STATUS=$(echo "$POD_INFO" | jq -r --arg name "$CONTAINER_NAME" '.status.containerStatuses[]? 
| select(.name == $name)') if [[ -z "$CONTAINER_STATUS" ]]; then print_warning " Container '$CONTAINER_NAME': Status not found, skipping" + CFACT=$(jq -nc --arg c "$CONTAINER_NAME" '{container: $c, status: "skipped", reason: "no_status"}') + add_fact POD_CONTAINER_FACTS "$CFACT" continue fi - # Check if container is ready CONTAINER_READY=$(echo "$CONTAINER_STATUS" | jq -r '.ready') CONTAINER_STATE=$(echo "$CONTAINER_STATUS" | jq -r ' if .state.running then "running" @@ -57,28 +59,26 @@ for POD_NAME in $PODS; do end ') - # Get declared ports for this container CONTAINER_PORTS=$(echo "$CONTAINER_DATA" | jq -r '.ports[]? | .containerPort' | tr '\n' ' ') if [[ -z "$CONTAINER_PORTS" ]]; then print_info " Container '$CONTAINER_NAME': No ports declared" + CFACT=$(jq -nc --arg c "$CONTAINER_NAME" '{container: $c, status: "no_ports_declared"}') + add_fact POD_CONTAINER_FACTS "$CFACT" continue fi print_info " Container '$CONTAINER_NAME':" - # If container is not running, explain why we can't test ports if [[ "$CONTAINER_STATE" != "running" ]]; then if [[ "$CONTAINER_STATE" == "waiting" ]]; then WAITING_REASON=$(echo "$CONTAINER_STATUS" | jq -r '.state.waiting.reason // "Unknown"') WAITING_MESSAGE=$(echo "$CONTAINER_STATUS" | jq -r '.state.waiting.message // ""') - # Check if it's a normal startup state or a problem case "$WAITING_REASON" in ContainerCreating|PodInitializing|Pulling) CONTAINERS_SKIPPED=$((CONTAINERS_SKIPPED + 1)) print_info " Container is starting ($WAITING_REASON) - skipping port checks" - continue ;; CrashLoopBackOff|ImagePullBackOff|ErrImagePull) CONTAINERS_SKIPPED=$((CONTAINERS_SKIPPED + 1)) @@ -87,37 +87,46 @@ for POD_NAME in $PODS; do print_warning " Message: $WAITING_MESSAGE" fi print_action "Fix container startup issues (check container_crash_detection results)" - continue ;; *) CONTAINERS_SKIPPED=$((CONTAINERS_SKIPPED + 1)) print_warning " Container waiting: $WAITING_REASON - skipping port checks" - continue ;; esac + + CFACT=$(jq -nc --arg c 
"$CONTAINER_NAME" --arg reason "$WAITING_REASON" \ + '{container: $c, status: "skipped", state: "waiting", reason: $reason}') + add_fact POD_CONTAINER_FACTS "$CFACT" + continue elif [[ "$CONTAINER_STATE" == "terminated" ]]; then EXIT_CODE=$(echo "$CONTAINER_STATUS" | jq -r '.state.terminated.exitCode // "N/A"') TERMINATION_REASON=$(echo "$CONTAINER_STATUS" | jq -r '.state.terminated.reason // "Unknown"') CONTAINERS_SKIPPED=$((CONTAINERS_SKIPPED + 1)) print_warning " Cannot test ports - container terminated (Exit: $EXIT_CODE, Reason: $TERMINATION_REASON)" print_action "Fix container termination (check container_crash_detection results)" + + CFACT=$(jq -nc --arg c "$CONTAINER_NAME" --arg ec "$EXIT_CODE" --arg reason "$TERMINATION_REASON" \ + '{container: $c, status: "skipped", state: "terminated", exit_code: $ec, termination_reason: $reason}') + add_fact POD_CONTAINER_FACTS "$CFACT" continue else print_warning " Container in unknown state - skipping port checks" + CFACT=$(jq -nc --arg c "$CONTAINER_NAME" '{container: $c, status: "skipped", state: "unknown"}') + add_fact POD_CONTAINER_FACTS "$CFACT" continue fi fi - # Container is running - check if it's ready if [[ "$CONTAINER_READY" != "true" ]]; then print_warning " Container is running but not ready - port connectivity may fail" fi - # Test connectivity to each declared port from agent CONTAINERS_TESTED=$((CONTAINERS_TESTED + 1)) + PORT_RESULTS=() + CONTAINER_HAS_PORT_ISSUE=0 + for PORT in $CONTAINER_PORTS; do - # Try nc first, then timeout + /dev/tcp, then curl if command -v nc >/dev/null 2>&1; then timeout 2 nc -z -w 1 "$POD_IP" "$PORT" >/dev/null 2>&1 CONNECTIVITY_EXIT_CODE=$? @@ -129,33 +138,83 @@ for POD_NAME in $PODS; do CONNECTIVITY_EXIT_CODE=$? else print_warning " Port $PORT: Cannot test (nc/timeout/curl not available in agent)" + PORT_RESULTS=$(echo "$PORT_RESULTS" | jq --argjson port "$PORT" '. 
+ [{port: $port, status: "untestable"}]') continue fi if [[ $CONNECTIVITY_EXIT_CODE -eq 0 ]]; then print_success " Port $PORT: ✓ Listening" + PORT_RESULTS=$(echo "$PORT_RESULTS" | jq --argjson port "$PORT" '. + [{port: $port, status: "listening"}]') else - HAS_PORT_ISSUES=1 + CONTAINER_HAS_PORT_ISSUE=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error " Port $PORT: ✗ Declared but not listening or unreachable" print_action "Check application configuration and ensure it listens on port $PORT" + PORT_RESULTS=$(echo "$PORT_RESULTS" | jq --argjson port "$PORT" '. + [{port: $port, status: "not_listening"}]') + + # The container is running but not listening on its declared + # port — current logs likely show why (binding error, config + # mismatch, app stuck during startup). + CURRENT_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER_NAME" "current") + + ISSUE=$(jq -nc --arg pod "$POD_NAME" --arg c "$CONTAINER_NAME" --argjson port "$PORT" \ + --argjson logs "$CURRENT_LOGS" \ + '{pod: $pod, container: $c, port: $port, issue: "port_not_listening", container_logs: $logs}') + add_fact ISSUE_FACTS "$ISSUE" fi done + + CFACT=$(jq -nc --arg c "$CONTAINER_NAME" --argjson ports "$(facts_to_json_array PORT_RESULTS)" --arg ready "$CONTAINER_READY" \ + --argjson tested true \ + '{container: $c, status: "tested", container_ready: ($ready == "true"), ports: $ports}') + add_fact POD_CONTAINER_FACTS "$CFACT" done + + POD_FACT=$(jq -nc --arg p "$POD_NAME" --arg ip "$POD_IP" --argjson containers "$(facts_to_json_array POD_CONTAINER_FACTS)" \ + '{pod: $p, pod_ip: $ip, containers: $containers}') + add_fact POD_FACTS "$POD_FACT" done echo "" + +DETAILS=$(jq -nc \ + --argjson tested "$CONTAINERS_TESTED" \ + --argjson skipped "$CONTAINERS_SKIPPED" \ + --argjson facts "$(facts_to_json_array POD_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + '{ + containers_tested: $tested, + containers_skipped: $skipped, + issue_count: ($issues | length), + pods: $facts, + issues: $issues + 
}') + if [[ $CONTAINERS_TESTED -eq 0 ]]; then - # No containers were tested - all were skipped print_info "All containers skipped - no port checks could be performed" - update_check_result --status "skipped" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" -elif [[ $HAS_PORT_ISSUES -eq 0 ]]; then - # Some/all containers were tested and all passed + EVIDENCE=$(evidence_json \ + "All containers skipped — no port checks performed" \ + "info" \ + "[]" \ + "$DETAILS" \ + "[]") + update_check_result --status "skipped" --evidence "$EVIDENCE" +elif [[ $(echo "$ISSUE_FACTS" | jq 'length') -eq 0 ]]; then print_success "Port connectivity verified on $CONTAINERS_TESTED container(s)" - update_check_result --status "success" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + EVIDENCE=$(evidence_json \ + "Port connectivity verified on $CONTAINERS_TESTED container(s)" \ + "info" \ + "[]" \ + "$DETAILS" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - # Some containers were tested and had issues if [[ $CONTAINERS_SKIPPED -gt 0 ]]; then print_warning "Port issues found ($CONTAINERS_TESTED tested, $CONTAINERS_SKIPPED skipped)" fi - update_check_result --status "failed" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="$AFFECTED_COUNT pod(s) with port connectivity issues" + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Check application configuration and ensure it listens on declared ports"]') + update_check_result --status "failed" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/scope/health_probe_endpoints b/k8s/diagnose/scope/health_probe_endpoints index a7bfd2c6..b8a32671 100755 --- a/k8s/diagnose/scope/health_probe_endpoints +++ b/k8s/diagnose/scope/health_probe_endpoints @@ -2,10 +2,8 @@ # Check: Health Probe Endpoints # Validates that 
liveness and readiness probe endpoints are configured and responding correctly -# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') HAS_PROBE_ISSUES=0 @@ -13,27 +11,122 @@ HAS_PROBE_WARNINGS=0 CONTAINERS_TESTED=0 CONTAINERS_SKIPPED=0 +PROBE_FACTS=() +ISSUE_FACTS=() +POD_FACTS=() +AFFECTED_PODS="" + + +# Probe a single HTTP endpoint and emit a JSON fact + classification +# Args: pod, container, probe_type (Readiness/Liveness/Startup), path, port, scheme +test_http_probe() { + local pod="$1" container="$2" probe_type="$3" path="$4" port="$5" scheme="$6" + local url="${scheme,,}://$POD_IP:$port$path" + local response="" exit_code=0 + + if command -v curl >/dev/null 2>&1; then + if [[ "${scheme^^}" == "HTTPS" ]]; then + response=$(curl -k -s -o /dev/null -w '%{http_code}' --max-time 2 "$url" 2>&1) + else + response=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 "$url" 2>&1) + fi + exit_code=$? + elif command -v wget >/dev/null 2>&1; then + if [[ "${scheme^^}" == "HTTPS" ]]; then + response=$(wget --no-check-certificate -O /dev/null --timeout=2 "$url" 2>&1) + else + response=$(wget -O /dev/null --timeout=2 "$url" 2>&1) + fi + exit_code=$? + if [[ $exit_code -eq 0 ]]; then + response="200" + else + local err + err=$(echo "$response" | grep -iE "failed:|connection refused|timed? 
?out|cannot connect|unable to|network|unreachable" | head -1) + if [[ -n "$err" ]]; then + response=$(echo "$err" | cut -c1-80) + else + response="wget failed with exit code $exit_code" + fi + fi + else + print_warning " $probe_type Probe on $scheme://$port$path: Cannot test (curl/wget not available in agent)" + PROBE_FACT=$(jq -nc --arg pod "$pod" --arg c "$container" --arg pt "$probe_type" \ + --arg path "$path" --arg port "$port" --arg scheme "$scheme" \ + '{pod: $pod, container: $c, probe_type: $pt, kind: "httpGet", path: $path, port: $port, scheme: $scheme, status: "untestable"}') + add_fact PROBE_FACTS "$PROBE_FACT" + return 2 + fi + + local result_status="" classification="" + if [[ $exit_code -eq 0 && "$response" =~ ^[2-3][0-9][0-9]$ ]]; then + print_success " $probe_type Probe on $scheme://$port$path: ✓ HTTP $response" + result_status="ok" + classification="success" + elif [[ "$response" =~ ^4[0-9][0-9]$ ]]; then + HAS_PROBE_ISSUES=1 + print_error " $probe_type Probe on $scheme://$port$path: ✗ HTTP $response - Health check endpoint not found" + result_status="endpoint_not_found" + classification="critical_4xx" + elif [[ "$response" =~ ^5[0-9][0-9]$ ]]; then + HAS_PROBE_WARNINGS=1 + print_warning " $probe_type Probe on $scheme://$port$path: ⚠ HTTP $response - Application error" + result_status="application_error" + classification="warning_5xx" + else + HAS_PROBE_WARNINGS=1 + print_warning " $probe_type Probe on $scheme://$port$path: ⚠ Connection failed (response: $response, exit code: $exit_code)" + result_status="connection_failed" + classification="warning_connection" + fi + + # On failure, attach the container's current logs to the issue so the AI + # summarizer can correlate "probe says X" with "app logs say Y" without + # any extra calls. 
+ local container_logs="[]" + if [[ "$classification" != "success" ]]; then + container_logs=$(read_log_tail "$pod" "$container" "current") + fi + + PROBE_FACT=$(jq -nc --arg pod "$pod" --arg c "$container" --arg pt "$probe_type" \ + --arg path "$path" --arg port "$port" --arg scheme "$scheme" \ + --arg http_response "$response" --arg status "$result_status" \ + '{pod: $pod, container: $c, probe_type: $pt, kind: "httpGet", path: $path, port: $port, scheme: $scheme, http_response: $http_response, status: $status}') + add_fact PROBE_FACTS "$PROBE_FACT" + + if [[ "$classification" != "success" ]]; then + ISSUE=$(jq -nc --arg pod "$pod" --arg c "$container" --arg pt "$probe_type" \ + --arg path "$path" --arg port "$port" --arg http_response "$response" \ + --arg classification "$classification" \ + --argjson logs "$container_logs" \ + '{pod: $pod, container: $c, probe_type: $pt, path: $path, port: $port, http_response: $http_response, issue: $classification, container_logs: $logs}') + add_fact ISSUE_FACTS "$ISSUE" + return 1 + fi + return 0 +} + for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) - # Check if pod is running POD_PHASE=$(echo "$POD_INFO" | jq -r '.status.phase') if [[ "$POD_PHASE" != "Running" ]]; then print_warning "Pod $POD_NAME: Not running (phase: $POD_PHASE), skipping probe checks" + FACT=$(jq -nc --arg p "$POD_NAME" --arg phase "$POD_PHASE" '{pod: $p, status: "skipped", reason: "not_running", phase: $phase}') + add_fact POD_FACTS "$FACT" continue fi - # Get pod IP POD_IP=$(echo "$POD_INFO" | jq -r '.status.podIP') if [[ -z "$POD_IP" || "$POD_IP" == "null" ]]; then print_warning "Pod $POD_NAME: No IP assigned, skipping probe checks" + FACT=$(jq -nc --arg p "$POD_NAME" '{pod: $p, status: "skipped", reason: "no_ip"}') + add_fact POD_FACTS "$FACT" continue fi print_info "Checking pod $POD_NAME:" - # Get all containers 
CONTAINERS=$(echo "$POD_INFO" | jq -r '.spec.containers[] | @base64') for CONTAINER_B64 in $CONTAINERS; do @@ -42,7 +135,6 @@ for POD_NAME in $PODS; do print_info " Container '$CONTAINER_NAME':" - # Check container status before testing probes CONTAINER_STATUS=$(echo "$POD_INFO" | jq -r --arg name "$CONTAINER_NAME" '.status.containerStatuses[]? | select(.name == $name)') if [[ -z "$CONTAINER_STATUS" ]]; then @@ -50,7 +142,6 @@ for POD_NAME in $PODS; do continue fi - # Check if container is ready CONTAINER_READY=$(echo "$CONTAINER_STATUS" | jq -r '.ready') CONTAINER_STATE=$(echo "$CONTAINER_STATUS" | jq -r ' if .state.running then "running" @@ -60,13 +151,11 @@ for POD_NAME in $PODS; do end ') - # If container is not running, explain why we can't test probes if [[ "$CONTAINER_STATE" != "running" ]]; then if [[ "$CONTAINER_STATE" == "waiting" ]]; then WAITING_REASON=$(echo "$CONTAINER_STATUS" | jq -r '.state.waiting.reason // "Unknown"') WAITING_MESSAGE=$(echo "$CONTAINER_STATUS" | jq -r '.state.waiting.message // ""') - # Check if it's a normal startup state or a problem case "$WAITING_REASON" in ContainerCreating|PodInitializing|Pulling) CONTAINERS_SKIPPED=$((CONTAINERS_SKIPPED + 1)) @@ -101,12 +190,10 @@ for POD_NAME in $PODS; do fi fi - # Container is running - check if it's ready if [[ "$CONTAINER_READY" != "true" ]]; then print_info " Container is running but not ready - probe checks may show why" fi - # Check if container has any probes configured HAS_READINESS=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe // empty') HAS_LIVENESS=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe // empty') HAS_STARTUP=$(echo "$CONTAINER_DATA" | jq -r '.startupProbe // empty') @@ -116,286 +203,97 @@ for POD_NAME in $PODS; do continue fi - # Container has probes and is testable CONTAINERS_TESTED=$((CONTAINERS_TESTED + 1)) - # Track issues for this container to avoid repetitive action messages - CONTAINER_HAS_CONNECTION_ISSUES=0 - CONTAINER_HAS_4XX_ISSUES=0 - 
CONTAINER_HAS_5XX_ISSUES=0 - FAILED_PROBES_LIST="" - - # Check Readiness Probe - if [[ -n "$HAS_READINESS" ]]; then - PROBE_TYPE=$(echo "$CONTAINER_DATA" | jq -r 'if .readinessProbe.httpGet then "httpGet" elif .readinessProbe.tcpSocket then "tcpSocket" elif .readinessProbe.exec then "exec" else "unknown" end') + # Test each probe type that's configured + for PROBE_KIND in readinessProbe livenessProbe startupProbe; do + local_has_probe="" + case "$PROBE_KIND" in + readinessProbe) local_has_probe="$HAS_READINESS"; PROBE_LABEL="Readiness" ;; + livenessProbe) local_has_probe="$HAS_LIVENESS"; PROBE_LABEL="Liveness" ;; + startupProbe) local_has_probe="$HAS_STARTUP"; PROBE_LABEL="Startup" ;; + esac - if [[ "$PROBE_TYPE" == "httpGet" ]]; then - PROBE_PATH=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe.httpGet.path') - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe.httpGet.port') - PROBE_SCHEME=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe.httpGet.scheme // "HTTP"') - PROBE_URL="${PROBE_SCHEME,,}://$POD_IP:$PROBE_PORT$PROBE_PATH" - - # Try curl first from agent, then wget - if command -v curl >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(curl -k -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - elif command -v wget >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(wget --no-check-certificate -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(wget -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - # Parse wget output to extract HTTP status or error - if [[ $PROBE_EXIT_CODE -eq 0 ]]; then - PROBE_RESPONSE="200" - else - # Extract error from wget output - try multiple patterns - ERROR_MSG=$(echo "$PROBE_RESPONSE" | grep -iE "failed:|connection refused|timed? 
?out|cannot connect|unable to|network|unreachable" | head -1) - if [[ -n "$ERROR_MSG" ]]; then - # Shorten the message if too long - PROBE_RESPONSE=$(echo "$ERROR_MSG" | cut -c1-80) - else - # If no specific error found, show exit code - PROBE_RESPONSE="wget failed with exit code $PROBE_EXIT_CODE" - fi - fi - else - print_warning " Readiness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: Cannot test (curl/wget not available in agent)" - continue - fi - - if [[ $PROBE_EXIT_CODE -eq 0 && "$PROBE_RESPONSE" =~ ^[2-3][0-9][0-9]$ ]]; then - print_success " Readiness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✓ HTTP $PROBE_RESPONSE" - else - # Probe failed - check if it's config issue or app issue - if [[ "$PROBE_RESPONSE" =~ ^4[0-9][0-9]$ ]]; then - # 4xx error: endpoint not found or bad config - HAS_PROBE_ISSUES=1 - CONTAINER_HAS_4XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Readiness" - print_error " Readiness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✗ HTTP $PROBE_RESPONSE - Health check endpoint not found" - elif [[ "$PROBE_RESPONSE" =~ ^5[0-9][0-9]$ ]]; then - # 5xx error: app has internal issues - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_5XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Readiness" - print_warning " Readiness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ HTTP $PROBE_RESPONSE - Application error" - else - # Connection failed or other error (port not listening, network issue, etc) - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_CONNECTION_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Readiness" - print_warning " Readiness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ Connection failed (response: $PROBE_RESPONSE, exit code: $PROBE_EXIT_CODE)" - fi - fi - elif [[ "$PROBE_TYPE" == "tcpSocket" ]]; then - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe.tcpSocket.port') - print_info " Readiness Probe: TCP Socket on port $PROBE_PORT (tested in port health check)" - elif [[ "$PROBE_TYPE" == "exec" ]]; then - 
PROBE_COMMAND=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe.exec.command | join(" ")') - print_info " Readiness Probe: Exec [$PROBE_COMMAND] (cannot test directly)" - fi - fi + [[ -z "$local_has_probe" ]] && continue - # Check Liveness Probe - if [[ -n "$HAS_LIVENESS" ]]; then - PROBE_TYPE=$(echo "$CONTAINER_DATA" | jq -r 'if .livenessProbe.httpGet then "httpGet" elif .livenessProbe.tcpSocket then "tcpSocket" elif .livenessProbe.exec then "exec" else "unknown" end') + PROBE_TYPE=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" 'if .[$p].httpGet then "httpGet" elif .[$p].tcpSocket then "tcpSocket" elif .[$p].exec then "exec" else "unknown" end') if [[ "$PROBE_TYPE" == "httpGet" ]]; then - PROBE_PATH=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe.httpGet.path') - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe.httpGet.port') - PROBE_SCHEME=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe.httpGet.scheme // "HTTP"') - PROBE_URL="${PROBE_SCHEME,,}://$POD_IP:$PROBE_PORT$PROBE_PATH" - - # Try curl first from agent, then wget - if command -v curl >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(curl -k -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - elif command -v wget >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(wget --no-check-certificate -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(wget -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - # Parse wget output to extract HTTP status or error - if [[ $PROBE_EXIT_CODE -eq 0 ]]; then - PROBE_RESPONSE="200" - else - # Extract error from wget output - try multiple patterns - ERROR_MSG=$(echo "$PROBE_RESPONSE" | grep -iE "failed:|connection refused|timed? 
?out|cannot connect|unable to|network|unreachable" | head -1) - if [[ -n "$ERROR_MSG" ]]; then - # Shorten the message if too long - PROBE_RESPONSE=$(echo "$ERROR_MSG" | cut -c1-80) - else - # If no specific error found, show exit code - PROBE_RESPONSE="wget failed with exit code $PROBE_EXIT_CODE" - fi - fi - else - print_warning " Liveness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: Cannot test (curl/wget not available in agent)" - continue - fi - - if [[ $PROBE_EXIT_CODE -eq 0 && "$PROBE_RESPONSE" =~ ^[2-3][0-9][0-9]$ ]]; then - print_success " Liveness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✓ HTTP $PROBE_RESPONSE" - else - # Probe failed - check if it's config issue or app issue - if [[ "$PROBE_RESPONSE" =~ ^4[0-9][0-9]$ ]]; then - # 4xx error: endpoint not found or bad config - HAS_PROBE_ISSUES=1 - CONTAINER_HAS_4XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Liveness" - print_error " Liveness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✗ HTTP $PROBE_RESPONSE - Health check endpoint not found" - elif [[ "$PROBE_RESPONSE" =~ ^5[0-9][0-9]$ ]]; then - # 5xx error: app has internal issues - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_5XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Liveness" - print_warning " Liveness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ HTTP $PROBE_RESPONSE - Application error" - else - # Connection failed or other error (port not listening, network issue, etc) - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_CONNECTION_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Liveness" - print_warning " Liveness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ Connection failed (response: $PROBE_RESPONSE, exit code: $PROBE_EXIT_CODE)" - fi + PROBE_PATH=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" '.[$p].httpGet.path') + PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" '.[$p].httpGet.port') + PROBE_SCHEME=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" '.[$p].httpGet.scheme // "HTTP"') + + 
test_http_probe "$POD_NAME" "$CONTAINER_NAME" "$PROBE_LABEL" "$PROBE_PATH" "$PROBE_PORT" "$PROBE_SCHEME" + rc=$? + if [[ $rc -eq 1 ]]; then + mark_affected AFFECTED_PODS "$POD_NAME" fi elif [[ "$PROBE_TYPE" == "tcpSocket" ]]; then - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe.tcpSocket.port') - print_info " Liveness Probe: TCP Socket on port $PROBE_PORT (tested in port health check)" + PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" '.[$p].tcpSocket.port') + print_info " $PROBE_LABEL Probe: TCP Socket on port $PROBE_PORT (tested in port health check)" + PROBE_FACT=$(jq -nc --arg pod "$POD_NAME" --arg c "$CONTAINER_NAME" --arg pt "$PROBE_LABEL" --arg port "$PROBE_PORT" \ + '{pod: $pod, container: $c, probe_type: $pt, kind: "tcpSocket", port: $port, status: "not_tested_here"}') + add_fact PROBE_FACTS "$PROBE_FACT" elif [[ "$PROBE_TYPE" == "exec" ]]; then - PROBE_COMMAND=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe.exec.command | join(" ")') - print_info " Liveness Probe: Exec [$PROBE_COMMAND] (cannot test directly)" + PROBE_COMMAND=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" '.[$p].exec.command | join(" ")') + print_info " $PROBE_LABEL Probe: Exec [$PROBE_COMMAND] (cannot test directly)" + PROBE_FACT=$(jq -nc --arg pod "$POD_NAME" --arg c "$CONTAINER_NAME" --arg pt "$PROBE_LABEL" --arg cmd "$PROBE_COMMAND" \ + '{pod: $pod, container: $c, probe_type: $pt, kind: "exec", command: $cmd, status: "untestable"}') + add_fact PROBE_FACTS "$PROBE_FACT" fi - fi - - # Check Startup Probe - if [[ -n "$HAS_STARTUP" ]]; then - PROBE_TYPE=$(echo "$CONTAINER_DATA" | jq -r 'if .startupProbe.httpGet then "httpGet" elif .startupProbe.tcpSocket then "tcpSocket" elif .startupProbe.exec then "exec" else "unknown" end') - - if [[ "$PROBE_TYPE" == "httpGet" ]]; then - PROBE_PATH=$(echo "$CONTAINER_DATA" | jq -r '.startupProbe.httpGet.path') - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.startupProbe.httpGet.port') - PROBE_SCHEME=$(echo 
"$CONTAINER_DATA" | jq -r '.startupProbe.httpGet.scheme // "HTTP"') - PROBE_URL="${PROBE_SCHEME,,}://$POD_IP:$PROBE_PORT$PROBE_PATH" - - # Try curl first from agent, then wget - if command -v curl >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(curl -k -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - elif command -v wget >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(wget --no-check-certificate -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(wget -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - # Parse wget output to extract HTTP status or error - if [[ $PROBE_EXIT_CODE -eq 0 ]]; then - PROBE_RESPONSE="200" - else - # Extract error from wget output - try multiple patterns - ERROR_MSG=$(echo "$PROBE_RESPONSE" | grep -iE "failed:|connection refused|timed? 
?out|cannot connect|unable to|network|unreachable" | head -1) - if [[ -n "$ERROR_MSG" ]]; then - # Shorten the message if too long - PROBE_RESPONSE=$(echo "$ERROR_MSG" | cut -c1-80) - else - # If no specific error found, show exit code - PROBE_RESPONSE="wget failed with exit code $PROBE_EXIT_CODE" - fi - fi - else - print_warning " Startup Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: Cannot test (curl/wget not available in agent)" - continue - fi - - if [[ $PROBE_EXIT_CODE -eq 0 && "$PROBE_RESPONSE" =~ ^[2-3][0-9][0-9]$ ]]; then - print_success " Startup Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✓ HTTP $PROBE_RESPONSE" - else - # Probe failed - check if it's config issue or app issue - if [[ "$PROBE_RESPONSE" =~ ^4[0-9][0-9]$ ]]; then - # 4xx error: endpoint not found or bad config - HAS_PROBE_ISSUES=1 - CONTAINER_HAS_4XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Startup" - print_error " Startup Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✗ HTTP $PROBE_RESPONSE - Health check endpoint not found" - elif [[ "$PROBE_RESPONSE" =~ ^5[0-9][0-9]$ ]]; then - # 5xx error: app has internal issues - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_5XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Startup" - print_warning " Startup Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ HTTP $PROBE_RESPONSE - Application error" - else - # Connection failed or other error (port not listening, network issue, etc) - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_CONNECTION_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Startup" - print_warning " Startup Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ Connection failed (response: $PROBE_RESPONSE, exit code: $PROBE_EXIT_CODE)" - fi - fi - elif [[ "$PROBE_TYPE" == "tcpSocket" ]]; then - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.startupProbe.tcpSocket.port') - print_info " Startup Probe: TCP Socket on port $PROBE_PORT" - elif [[ "$PROBE_TYPE" == "exec" ]]; then - PROBE_COMMAND=$(echo "$CONTAINER_DATA" | jq -r 
'.startupProbe.exec.command | join(" ")') - print_info " Startup Probe: Exec [$PROBE_COMMAND] (cannot test directly)" - fi - fi - - # Print consolidated action message for this container (avoid repetition) - if [[ -n "$FAILED_PROBES_LIST" ]]; then - echo "" - # Trim leading space from the list - FAILED_PROBES_LIST=$(echo "$FAILED_PROBES_LIST" | xargs) - - if [[ $CONTAINER_HAS_CONNECTION_ISSUES -eq 1 ]]; then - print_action "For $FAILED_PROBES_LIST probe(s): Verify port is listening and accessible from within cluster" - fi - - if [[ $CONTAINER_HAS_4XX_ISSUES -eq 1 ]]; then - print_action "For $FAILED_PROBES_LIST probe(s): Update probe path or implement the endpoint in application" - fi - - if [[ $CONTAINER_HAS_5XX_ISSUES -eq 1 ]]; then - print_action "For $FAILED_PROBES_LIST probe(s): Check application logs and fix internal errors or dependencies" - fi - fi + done done done echo "" + +DETAILS=$(jq -nc \ + --argjson tested "$CONTAINERS_TESTED" \ + --argjson skipped "$CONTAINERS_SKIPPED" \ + --argjson probes "$(facts_to_json_array PROBE_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + '{ + containers_tested: $tested, + containers_skipped: $skipped, + probe_results: $probes, + issue_count: ($issues | length), + issues: $issues + }') + if [[ $CONTAINERS_TESTED -eq 0 ]]; then - # No containers were tested - all were skipped print_info "All containers skipped - no probe checks could be performed" - update_check_result --status "skipped" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + EVIDENCE=$(evidence_json \ + "All containers skipped — no probe checks performed" \ + "info" \ + "[]" \ + "$DETAILS" \ + "[]") + update_check_result --status "skipped" --evidence "$EVIDENCE" elif [[ $HAS_PROBE_ISSUES -gt 0 ]]; then - # Some containers were tested and had issues if [[ $CONTAINERS_SKIPPED -gt 0 ]]; then print_warning "Probe issues found ($CONTAINERS_TESTED tested, $CONTAINERS_SKIPPED skipped)" fi - update_check_result --status 
"failed" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="Probe failures detected in $AFFECTED_COUNT pod(s)" + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Update probe path or implement health endpoint in application", "Verify port is listening and accessible"]') + update_check_result --status "failed" --evidence "$EVIDENCE" elif [[ $HAS_PROBE_WARNINGS -gt 0 ]]; then - # Some containers were tested and had warnings if [[ $CONTAINERS_SKIPPED -gt 0 ]]; then print_info "Probe warnings found ($CONTAINERS_TESTED tested, $CONTAINERS_SKIPPED skipped)" fi - update_check_result --status "warning" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="Probe warnings in $AFFECTED_COUNT pod(s) — application or connectivity issues" + EVIDENCE=$(evidence_json "$SUMMARY" "warning" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Check application logs for internal errors", "Verify port is listening and accessible"]') + update_check_result --status "warning" --evidence "$EVIDENCE" else - # All tested containers passed print_success "Health probes verified on $CONTAINERS_TESTED container(s)" - update_check_result --status "success" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + EVIDENCE=$(evidence_json \ + "Health probes verified on $CONTAINERS_TESTED container(s)" \ + "info" \ + "[]" \ + "$DETAILS" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/scope/image_pull_status b/k8s/diagnose/scope/image_pull_status index 31e9d340..1bdfe38e 100644 --- a/k8s/diagnose/scope/image_pull_status +++ b/k8s/diagnose/scope/image_pull_status @@ -2,38 +2,63 @@ # Check: Image Pull Status # Verifies container images can be pulled from registry -# Validate pods 
exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ERRORS=0 +PULL_FAILURES=() +AFFECTED_PODS="" + for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) IMAGE_PULL_ERRORS=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? | select(.state.waiting.reason == "ImagePullBackOff" or .state.waiting.reason == "ErrImagePull") | .name') if [[ -n "$IMAGE_PULL_ERRORS" ]]; then - HAS_ERRORS=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: ImagePullBackOff/ErrImagePull in container(s): $IMAGE_PULL_ERRORS" for CONTAINER in $IMAGE_PULL_ERRORS; do IMAGE=$(echo "$POD_INFO" | jq -r ".spec.containers[] | select(.name==\"$CONTAINER\") | .image") - MESSAGE=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .state.waiting.message") + MESSAGE=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .state.waiting.message // \"\"") + REASON=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .state.waiting.reason // \"\"") print_warning " Image: $IMAGE" print_warning " Reason: $MESSAGE" + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg container "$CONTAINER" \ + --arg image "$IMAGE" \ + --arg reason "$REASON" \ + --arg message "$MESSAGE" \ + '{pod: $pod, container: $container, image: $image, reason: $reason, message: $message}') + add_fact PULL_FAILURES "$FACT" done print_action "Verify image exists and imagePullSecrets are configured for private registries" fi done -if [[ $HAS_ERRORS -eq 0 ]]; then - POD_COUNT=$(echo "$PODS" | wc -w) +POD_COUNT=$(echo "$PODS" | wc -w | tr -d ' ') +FAIL_COUNT=$(echo "$PULL_FAILURES" | jq 'length') + +if [[ $FAIL_COUNT -eq 0 ]]; then print_success "All $POD_COUNT pod(s) have images pulled 
successfully" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All $POD_COUNT pod(s) have images pulled successfully" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$POD_COUNT" '{pods_checked: $count}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) failing to pull images" + DETAILS=$(jq -nc \ + --argjson failures "$(facts_to_json_array PULL_FAILURES)" \ + --argjson pod_count "$POD_COUNT" \ + '{pod_count: $pod_count, failure_count: ($failures | length), failures: $failures}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Verify image exists and imagePullSecrets are configured for private registries"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/scope/memory_limits_check b/k8s/diagnose/scope/memory_limits_check index 90e280b8..af0cc0b4 100644 --- a/k8s/diagnose/scope/memory_limits_check +++ b/k8s/diagnose/scope/memory_limits_check @@ -2,22 +2,21 @@ # Check: Memory Limits # Checks for out-of-memory container terminations -# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_OOM=0 +OOM_FACTS=() +AFFECTED_PODS="" + for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) OOM_KILLED=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? 
| select(.lastState.terminated.reason == "OOMKilled") | .name') if [[ -n "$OOM_KILLED" ]]; then - HAS_OOM=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: OOMKilled in container(s): $OOM_KILLED" for CONTAINER in $OOM_KILLED; do @@ -28,14 +27,50 @@ for POD_NAME in $PODS; do print_warning " Memory Limit: $MEMORY_LIMIT" print_warning " Memory Request: $MEMORY_REQUEST" print_action "Increase memory limits or optimize application memory usage" + + # When OOM kills a container, the previous-instance logs are where + # the last application output lives — the current instance was + # restarted by the kubelet after the kill. + PREVIOUS_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER" "previous") + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg container "$CONTAINER" \ + --arg memory_limit "$MEMORY_LIMIT" \ + --arg memory_request "$MEMORY_REQUEST" \ + --argjson previous_logs "$PREVIOUS_LOGS" \ + '{ + pod: $pod, + container: $container, + memory_limit: $memory_limit, + memory_request: $memory_request, + previous_logs: $previous_logs + }') + add_fact OOM_FACTS "$FACT" done fi done -if [[ $HAS_OOM -eq 0 ]]; then - POD_COUNT=$(echo "$PODS" | wc -w) +POD_COUNT=$(echo "$PODS" | wc -w | tr -d ' ') +OOM_COUNT=$(echo "$OOM_FACTS" | jq 'length') + +if [[ $OOM_COUNT -eq 0 ]]; then print_success "No OOMKilled containers detected in $POD_COUNT pod(s)" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "No OOMKilled containers detected in $POD_COUNT pod(s)" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$POD_COUNT" '{pods_checked: $count}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) had OOMKilled containers" + DETAILS=$(jq -nc \ + --argjson oom "$(facts_to_json_array OOM_FACTS)" \ + --argjson pod_count "$POD_COUNT" \ + 
'{pod_count: $pod_count, oom_killed_count: ($oom | length), oom_killed: $oom}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Increase memory limits or optimize application memory usage"]') + update_check_result --status "failed" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/scope/pod_existence b/k8s/diagnose/scope/pod_existence index b9919a9e..ffc3df34 100644 --- a/k8s/diagnose/scope/pod_existence +++ b/k8s/diagnose/scope/pod_existence @@ -7,10 +7,26 @@ PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') if [[ -z "$PODS" ]]; then print_error "No pods found with labels $LABEL_SELECTOR in namespace $NAMESPACE" print_action "Check deployment status and verify label selectors match" - update_check_result --status "failed" --evidence "{}" + + EVIDENCE=$(evidence_json \ + "No pods found in namespace $NAMESPACE" \ + "critical" \ + "[]" \ + "$(jq -nc --arg ls "$LABEL_SELECTOR" --arg ns "$NAMESPACE" '{label_selector: $ls, namespace: $ns}')" \ + '["Check deployment status and verify label selectors match"]') + update_check_result --status "failed" --evidence "$EVIDENCE" return 1 fi -PODS_COUNT=$(echo "$PODS" | wc -w) +POD_NAMES_JSON=$(jq -c '[.items[].metadata.name]' "$PODS_FILE" 2>/dev/null) +PODS_COUNT=$(echo "$PODS" | wc -w | tr -d ' ') print_success "Found $PODS_COUNT pod(s): $PODS" -update_check_result --status "success" --evidence "{}" + +EVIDENCE=$(evidence_json \ + "Found $PODS_COUNT pod(s) in namespace $NAMESPACE" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$PODS_COUNT" --argjson names "$POD_NAMES_JSON" --arg ns "$NAMESPACE" \ + '{pod_count: $count, pod_names: $names, namespace: $ns}')" \ + "[]") +update_check_result --status "success" --evidence "$EVIDENCE" diff --git a/k8s/diagnose/scope/pod_readiness b/k8s/diagnose/scope/pod_readiness index eecdcb32..798a973c 100644 --- a/k8s/diagnose/scope/pod_readiness +++ b/k8s/diagnose/scope/pod_readiness @@ -2,10 +2,8 @@ # 
Check: Pod Readiness # Confirms pod is running and ready to serve traffic -# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') # Counters for summary @@ -16,46 +14,49 @@ NOT_READY_PODS=0 TERMINATING_PODS=0 STARTING_PODS=0 -# Deployment state detection HAS_TERMINATING_PODS=0 HAS_STARTING_PODS=0 +POD_FACTS=() +AFFECTED_PODS="" + + for POD_NAME in $PODS; do TOTAL_PODS=$((TOTAL_PODS + 1)) - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) POD_PHASE=$(echo "$POD_INFO" | jq -r '.status.phase') - POD_READY=$(echo "$POD_INFO" | jq -r '.status.conditions[] | select(.type=="Ready") | .status') + POD_READY=$(echo "$POD_INFO" | jq -r '.status.conditions[]? | select(.type=="Ready") | .status') - # Check if pod is terminating DELETION_TIMESTAMP=$(echo "$POD_INFO" | jq -r '.metadata.deletionTimestamp // empty') if [[ -n "$DELETION_TIMESTAMP" ]]; then TERMINATING_PODS=$((TERMINATING_PODS + 1)) HAS_TERMINATING_PODS=1 print_info "Pod $POD_NAME: Terminating (rollout in progress)" + FACT=$(jq -nc --arg p "$POD_NAME" '{pod: $p, state: "terminating"}') + add_fact POD_FACTS "$FACT" continue fi if [[ "$POD_PHASE" == "Running" && "$POD_READY" == "True" ]]; then READY_PODS=$((READY_PODS + 1)) print_success "Pod $POD_NAME: Running and Ready" + FACT=$(jq -nc --arg p "$POD_NAME" '{pod: $p, state: "ready", phase: "Running"}') + add_fact POD_FACTS "$FACT" elif [[ "$POD_PHASE" == "Succeeded" ]]; then SUCCEEDED_PODS=$((SUCCEEDED_PODS + 1)) print_success "Pod $POD_NAME: Completed successfully" + FACT=$(jq -nc --arg p "$POD_NAME" '{pod: $p, state: "succeeded", phase: "Succeeded"}') + add_fact POD_FACTS "$FACT" else NOT_READY_PODS=$((NOT_READY_PODS + 1)) - - # Detect if pod is in normal startup state and collect reasons IS_STARTING=0 STARTUP_INFO="" - # Check if pod is in Pending phase (normal 
during startup) if [[ "$POD_PHASE" == "Pending" ]]; then IS_STARTING=1 fi - # Check init containers first INIT_CONTAINER_INFO=$(echo "$POD_INFO" | jq -r ' .status.initContainerStatuses[]? | select(.state.waiting or .state.running) | @@ -71,7 +72,6 @@ for POD_NAME in $PODS; do STARTUP_INFO="Init: $(echo "$INIT_CONTAINER_INFO" | paste -sd ',' - | sed 's/,/, /g')" fi - # Check for normal container startup reasons with details CONTAINER_STARTUP_INFO=$(echo "$POD_INFO" | jq -r ' .status.containerStatuses[]? | select(.state.waiting) | @@ -79,7 +79,6 @@ for POD_NAME in $PODS; do ' 2>/dev/null) if [[ -n "$CONTAINER_STARTUP_INFO" ]]; then - # Check if any are normal startup reasons while IFS= read -r CONTAINER_LINE; do REASON=$(echo "$CONTAINER_LINE" | cut -d':' -f2 | xargs) case "$REASON" in @@ -97,32 +96,32 @@ for POD_NAME in $PODS; do fi fi + POD_STATE="not_ready" if [[ $IS_STARTING -eq 1 ]]; then STARTING_PODS=$((STARTING_PODS + 1)) HAS_STARTING_PODS=1 + POD_STATE="starting" if [[ -n "$STARTUP_INFO" ]]; then print_info "Pod $POD_NAME: Starting up - $STARTUP_INFO" else print_info "Pod $POD_NAME: Phase=$POD_PHASE (starting up)" fi else + mark_affected AFFECTED_PODS "$POD_NAME" print_warning "Pod $POD_NAME: Phase=$POD_PHASE, Ready=$POD_READY" fi - # Get detailed condition information - READY_CONDITION=$(echo "$POD_INFO" | jq -r '.status.conditions[] | select(.type=="Ready")') + READY_CONDITION=$(echo "$POD_INFO" | jq -c '.status.conditions[]? | select(.type=="Ready")' | head -1) READY_REASON=$(echo "$READY_CONDITION" | jq -r '.reason // "Unknown"') READY_MESSAGE=$(echo "$READY_CONDITION" | jq -r '.message // "No message available"') if [[ -n "$READY_REASON" && "$READY_REASON" != "Unknown" ]]; then print_warning " Reason: $READY_REASON" fi - if [[ -n "$READY_MESSAGE" && "$READY_MESSAGE" != "No message available" ]]; then print_warning " Message: $READY_MESSAGE" fi - # Check container statuses CONTAINER_STATUSES=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? 
| "\(.name): Ready=\(.ready), Restarts=\(.restartCount)"' 2>/dev/null) if [[ -n "$CONTAINER_STATUSES" ]]; then @@ -132,7 +131,6 @@ for POD_NAME in $PODS; do done <<< "$CONTAINER_STATUSES" fi - # Check for waiting containers with reasons WAITING_CONTAINERS=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? | select(.state.waiting) | " \(.name): \(.state.waiting.reason) - \(.state.waiting.message // "No details")"' 2>/dev/null) if [[ -n "$WAITING_CONTAINERS" ]]; then @@ -142,31 +140,93 @@ for POD_NAME in $PODS; do done fi - # Only show action if not in normal startup state if [[ $IS_STARTING -eq 0 ]]; then print_action "Check application health endpoint and ensure dependencies are available" fi + + # For pods stuck in "not_ready" (not just starting), embed current + # logs of the first container so the AI can correlate readiness + # failure with what the app printed. We skip starting pods because + # their logs are still in flight and not informative yet. + POD_LOGS_PER_CONTAINER="[]" + if [[ "$POD_STATE" == "not_ready" ]]; then + # Iterate first regular container only — keep payload bounded + FIRST_CONTAINER=$(echo "$POD_INFO" | jq -r '.spec.containers[0].name // empty') + if [[ -n "$FIRST_CONTAINER" ]]; then + CURRENT_LOGS=$(read_log_tail "$POD_NAME" "$FIRST_CONTAINER" "current") + POD_LOGS_PER_CONTAINER=$(jq -nc --arg c "$FIRST_CONTAINER" --argjson logs "$CURRENT_LOGS" \ + '[{container: $c, current_logs: $logs}]') + fi + fi + + FACT=$(jq -nc --arg p "$POD_NAME" --arg state "$POD_STATE" --arg phase "$POD_PHASE" --arg ready "$POD_READY" \ + --arg reason "$READY_REASON" --arg message "$READY_MESSAGE" --arg startup "$STARTUP_INFO" \ + --argjson container_logs "$POD_LOGS_PER_CONTAINER" \ + '{pod: $p, state: $state, phase: $phase, ready: $ready, reason: $reason, message: $message, startup_info: $startup, container_logs: $container_logs}') + add_fact POD_FACTS "$FACT" fi done -# Print summary echo "" + +DETAILS_BASE=$(jq -nc \ + --argjson total "$TOTAL_PODS" \ + 
--argjson ready "$READY_PODS" \ + --argjson succeeded "$SUCCEEDED_PODS" \ + --argjson not_ready "$NOT_READY_PODS" \ + --argjson terminating "$TERMINATING_PODS" \ + --argjson starting "$STARTING_PODS" \ + --argjson facts "$(facts_to_json_array POD_FACTS)" \ + '{ + total: $total, + ready: $ready, + succeeded: $succeeded, + not_ready: $not_ready, + terminating: $terminating, + starting: $starting, + pods: $facts + }') + if [[ $TOTAL_PODS -eq 0 ]]; then print_warning "No pods found" - update_check_result --status "failed" --evidence "{\"ready\":0,\"total\":0}" + EVIDENCE=$(evidence_json "No pods found" "critical" "[]" "$DETAILS_BASE" '["Check deployment and label selectors"]') + update_check_result --status "failed" --evidence "$EVIDENCE" elif [[ $READY_PODS -eq $TOTAL_PODS ]] || [[ $((READY_PODS + SUCCEEDED_PODS)) -eq $TOTAL_PODS ]]; then print_success "All pods ready: $READY_PODS/$TOTAL_PODS running and ready" - update_check_result --status "success" --evidence "{\"ready\":$READY_PODS,\"total\":$TOTAL_PODS}" + EVIDENCE=$(evidence_json \ + "All $TOTAL_PODS pod(s) ready ($READY_PODS running, $SUCCEEDED_PODS succeeded)" \ + "info" \ + "[]" \ + "$DETAILS_BASE" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" elif [[ $HAS_TERMINATING_PODS -eq 1 ]]; then - # Pods are terminating - deployment/rollout in progress print_info "Deployment in progress: $READY_PODS/$TOTAL_PODS pods ready (rollout in progress with terminating pods)" - update_check_result --status "warning" --evidence "{\"ready\":$READY_PODS,\"total\":$TOTAL_PODS,\"terminating\":$TERMINATING_PODS,\"deployment_in_progress\":true}" + DETAILS=$(echo "$DETAILS_BASE" | jq '. 
+ {deployment_in_progress: true}') + EVIDENCE=$(evidence_json \ + "Deployment in progress: $READY_PODS/$TOTAL_PODS pods ready, $TERMINATING_PODS terminating" \ + "warning" \ + "[]" \ + "$DETAILS" \ + '["Wait for rollout to complete"]') + update_check_result --status "warning" --evidence "$EVIDENCE" elif [[ $HAS_STARTING_PODS -eq 1 ]]; then - # Pods are starting up normally - new deployment in progress print_info "Deployment in progress: $READY_PODS/$TOTAL_PODS pods ready, $STARTING_PODS starting up" - update_check_result --status "warning" --evidence "{\"ready\":$READY_PODS,\"total\":$TOTAL_PODS,\"starting\":$STARTING_PODS,\"not_ready\":$NOT_READY_PODS,\"deployment_in_progress\":true}" + DETAILS=$(echo "$DETAILS_BASE" | jq '. + {deployment_in_progress: true}') + EVIDENCE=$(evidence_json \ + "Deployment in progress: $READY_PODS/$TOTAL_PODS pods ready, $STARTING_PODS starting" \ + "warning" \ + "[]" \ + "$DETAILS" \ + '["Wait for pods to finish starting"]') + update_check_result --status "warning" --evidence "$EVIDENCE" else - # Some pods not ready and no clear sign of deployment in progress - this is a problem print_error "Pods not ready: $READY_PODS/$TOTAL_PODS ready (pods have issues)" - update_check_result --status "failed" --evidence "{\"ready\":$READY_PODS,\"total\":$TOTAL_PODS}" -fi \ No newline at end of file + EVIDENCE=$(evidence_json \ + "$READY_PODS/$TOTAL_PODS pods ready — $NOT_READY_PODS pod(s) have issues" \ + "critical" \ + "$(set_to_json_array AFFECTED_PODS)" \ + "$DETAILS_BASE" \ + '["Check application health endpoint and ensure dependencies are available"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/scope/resource_availability b/k8s/diagnose/scope/resource_availability index 53e99f97..55fa4ea5 100644 --- a/k8s/diagnose/scope/resource_availability +++ b/k8s/diagnose/scope/resource_availability @@ -2,16 +2,17 @@ # Check: Resource Availability # Validates pod can be scheduled with requested resources 
-# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +UNSCHEDULABLE_FACTS=() +AFFECTED_PODS="" +HAS_INSUFFICIENT_CPU=0 +HAS_INSUFFICIENT_MEMORY=0 + for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) POD_PHASE=$(echo "$POD_INFO" | jq -r '.status.phase') @@ -19,27 +20,73 @@ for POD_NAME in $PODS; do UNSCHEDULABLE=$(echo "$POD_INFO" | jq -r '.status.conditions[] | select(.reason=="Unschedulable") | .message') if [[ -n "$UNSCHEDULABLE" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: Cannot be scheduled" print_warning " Reason: $UNSCHEDULABLE" + POD_HAS_CPU_ISSUE=0 + POD_HAS_MEMORY_ISSUE=0 if echo "$UNSCHEDULABLE" | grep -qi "insufficient cpu"; then print_warning " Issue: Insufficient CPU in cluster" + HAS_INSUFFICIENT_CPU=1 + POD_HAS_CPU_ISSUE=1 fi - if echo "$UNSCHEDULABLE" | grep -qi "insufficient memory"; then print_warning " Issue: Insufficient memory in cluster" + HAS_INSUFFICIENT_MEMORY=1 + POD_HAS_MEMORY_ISSUE=1 fi print_action "Reduce resource requests or add more nodes to cluster" + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg reason "$UNSCHEDULABLE" \ + --argjson insufficient_cpu "$POD_HAS_CPU_ISSUE" \ + --argjson insufficient_memory "$POD_HAS_MEMORY_ISSUE" \ + '{pod: $pod, reason: $reason, insufficient_cpu: ($insufficient_cpu == 1), insufficient_memory: ($insufficient_memory == 1)}') + add_fact UNSCHEDULABLE_FACTS "$FACT" fi fi done -if [[ $HAS_ISSUES -eq 0 ]]; then - POD_COUNT=$(echo "$PODS" | wc -w) +POD_COUNT=$(echo "$PODS" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$UNSCHEDULABLE_FACTS" | jq 'length') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "All $POD_COUNT pod(s) successfully scheduled with sufficient resources" - update_check_result --status 
"success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All $POD_COUNT pod(s) successfully scheduled" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$POD_COUNT" '{pods_checked: $count}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY_PARTS=() + [[ $HAS_INSUFFICIENT_CPU -eq 1 ]] && SUMMARY_PARTS+=("insufficient CPU") + [[ $HAS_INSUFFICIENT_MEMORY -eq 1 ]] && SUMMARY_PARTS+=("insufficient memory") + if [[ ${#SUMMARY_PARTS[@]} -gt 0 ]]; then + SUMMARY_DETAIL=$(IFS=", "; echo "${SUMMARY_PARTS[*]}") + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) unschedulable — $SUMMARY_DETAIL" + else + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) unschedulable" + fi + + DETAILS=$(jq -nc \ + --argjson unscheduled "$(facts_to_json_array UNSCHEDULABLE_FACTS)" \ + --argjson pod_count "$POD_COUNT" \ + --argjson cpu "$HAS_INSUFFICIENT_CPU" \ + --argjson mem "$HAS_INSUFFICIENT_MEMORY" \ + '{ + pod_count: $pod_count, + unschedulable_count: ($unscheduled | length), + cluster_insufficient_cpu: ($cpu == 1), + cluster_insufficient_memory: ($mem == 1), + unschedulable: $unscheduled + }') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Reduce resource requests or add more nodes to cluster"]') + update_check_result --status "failed" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/scope/storage_mounting b/k8s/diagnose/scope/storage_mounting index 09e17d16..c3c9a2dc 100644 --- a/k8s/diagnose/scope/storage_mounting +++ b/k8s/diagnose/scope/storage_mounting @@ -2,16 +2,15 @@ # Check: Storage Mounting # Verifies persistent volumes are bound and mounted -# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_STORAGE_ISSUES=0 +PVC_FACTS=() +AFFECTED_PODS="" + for POD_NAME 
in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) PVCS=$(echo "$POD_INFO" | jq -r '.spec.volumes[]? | select(.persistentVolumeClaim) | .persistentVolumeClaim.claimName') @@ -21,20 +20,36 @@ for POD_NAME in $PODS; do PVC_STATUS=$(kubectl get pvc "$PVC" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null) if [[ "$PVC_STATUS" == "Pending" ]]; then - HAS_STORAGE_ISSUES=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: PVC $PVC is in Pending state" PVC_INFO=$(kubectl get pvc "$PVC" -n "$NAMESPACE" -o json 2>/dev/null) STORAGE_CLASS=$(echo "$PVC_INFO" | jq -r '.spec.storageClassName // "default"') - REQUESTED_SIZE=$(echo "$PVC_INFO" | jq -r '.spec.resources.requests.storage') + REQUESTED_SIZE=$(echo "$PVC_INFO" | jq -r '.spec.resources.requests.storage // ""') print_warning " Storage Class: $STORAGE_CLASS" print_warning " Requested Size: $REQUESTED_SIZE" print_action "Check if StorageClass exists and has available capacity" + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg pvc "$PVC" \ + --arg status "$PVC_STATUS" \ + --arg storage_class "$STORAGE_CLASS" \ + --arg requested_size "$REQUESTED_SIZE" \ + '{pod: $pod, pvc: $pvc, status: $status, storage_class: $storage_class, requested_size: $requested_size}') + add_fact PVC_FACTS "$FACT" elif [[ "$PVC_STATUS" == "Bound" ]]; then print_success "Pod $POD_NAME: PVC $PVC is Bound" else print_warning "Pod $POD_NAME: PVC $PVC status is $PVC_STATUS" + mark_affected AFFECTED_PODS "$POD_NAME" + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg pvc "$PVC" \ + --arg status "${PVC_STATUS:-Unknown}" \ + '{pod: $pod, pvc: $pvc, status: $status}') + add_fact PVC_FACTS "$FACT" fi done fi @@ -47,10 +62,26 @@ for POD_NAME in $PODS; do fi done -if [[ $HAS_STORAGE_ISSUES -eq 0 ]]; then - POD_COUNT=$(echo "$PODS" | wc -w) +POD_COUNT=$(echo "$PODS" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$PVC_FACTS" | 
jq 'length') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "All volumes mounted successfully for $POD_COUNT pod(s)" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All volumes mounted successfully for $POD_COUNT pod(s)" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$POD_COUNT" '{pods_checked: $count}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) have storage issues" + DETAILS=$(jq -nc \ + --argjson pvcs "$(facts_to_json_array PVC_FACTS)" \ + --argjson pod_count "$POD_COUNT" \ + '{pod_count: $pod_count, pvc_issue_count: ($pvcs | length), pvc_issues: $pvcs}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Check if StorageClass exists and has available capacity"]') + update_check_result --status "failed" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/service/service_endpoints b/k8s/diagnose/service/service_endpoints index a6fe12c5..72e72a6a 100644 --- a/k8s/diagnose/service/service_endpoints +++ b/k8s/diagnose/service/service_endpoints @@ -2,50 +2,53 @@ # Check: Service Endpoints # Checks if service has healthy endpoints -# Validate services exist require_services || return 0 -# Read services from pre-collected data SERVICES=$(jq -r '.items[].metadata.name' "$SERVICES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ENDPOINT_FACTS=() +AFFECTED_SERVICES="" +NO_ENDPOINTS_RESOURCE=0 +NO_READY_COUNT=0 + for SERVICE_NAME in $SERVICES; do - # Get endpoints from pre-collected data ENDPOINTS_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$ENDPOINTS_FILE" 2>/dev/null) if [[ -z "$ENDPOINTS_INFO" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" + NO_ENDPOINTS_RESOURCE=$((NO_ENDPOINTS_RESOURCE + 1)) print_error 
"Service $SERVICE_NAME: No endpoints resource found" + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" \ + '{service: $svc, issue: "no_endpoints_resource", ready_count: 0, not_ready_count: 0}') + add_fact ENDPOINT_FACTS "$FACT" continue fi - # Check if endpoints has any addresses with detailed info - # Get ports for this subset PORTS=$(echo "$ENDPOINTS_INFO" | jq -r '.subsets[0]?.ports[]? | "\(.port):\(.name // "unnamed")"' 2>/dev/null | head -1) PORT_NUMBER=$(echo "$PORTS" | cut -d':' -f1) - PORT_NAME=$(echo "$PORTS" | cut -d':' -f2) - READY_ENDPOINTS=$(echo "$ENDPOINTS_INFO" | jq -r '.subsets[]?.addresses[]? | "\(.targetRef.name // "unknown"):\(.ip)"' 2>/dev/null) - NOT_READY_ENDPOINTS=$(echo "$ENDPOINTS_INFO" | jq -r '.subsets[]?.notReadyAddresses[]? | "\(.targetRef.name // "unknown"):\(.ip)"' 2>/dev/null) + READY_COUNT=$(echo "$ENDPOINTS_INFO" | jq -r '[.subsets[]?.addresses[]?] | length' 2>/dev/null) + NOT_READY_COUNT=$(echo "$ENDPOINTS_INFO" | jq -r '[.subsets[]?.notReadyAddresses[]?] | length' 2>/dev/null) + READY_COUNT=${READY_COUNT:-0} + NOT_READY_COUNT=${NOT_READY_COUNT:-0} - READY_COUNT=$(echo "$READY_ENDPOINTS" | grep -c '^' 2>/dev/null || echo 0) - NOT_READY_COUNT=$(echo "$NOT_READY_ENDPOINTS" | grep -c '^' 2>/dev/null || echo 0) + READY_LIST=$(echo "$ENDPOINTS_INFO" | jq -c '[.subsets[]?.addresses[]? | {pod: (.targetRef.name // "unknown"), ip: .ip}]' 2>/dev/null) + NOT_READY_LIST=$(echo "$ENDPOINTS_INFO" | jq -c '[.subsets[]?.notReadyAddresses[]? 
| {pod: (.targetRef.name // "unknown"), ip: .ip}]' 2>/dev/null) if [[ $READY_COUNT -eq 0 ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" + NO_READY_COUNT=$((NO_READY_COUNT + 1)) print_error "Service $SERVICE_NAME: No ready endpoints available" if [[ $NOT_READY_COUNT -gt 0 ]]; then - print_warning " Not ready endpoints: $NOT_READY_COUNT" - # Show details of not ready endpoints - echo "$NOT_READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - if [[ -n "$IP" ]]; then - if [[ -n "$PORT_NUMBER" ]]; then - print_warning " - $POD_NAME -> $IP:$PORT_NUMBER" - else - print_warning " - $POD_NAME -> $IP" - fi + print_warning " $NOT_READY_COUNT not ready endpoint(s):" + echo "$ENDPOINTS_INFO" | jq -r '.subsets[]?.notReadyAddresses[]? | " - \(.targetRef.name // "unknown") -> \(.ip)"' | while IFS= read -r line; do + if [[ -n "$PORT_NUMBER" ]]; then + print_warning "${line}:${PORT_NUMBER}" + else + print_warning "$line" fi done print_action "Check pod readiness probes and pod status" @@ -53,39 +56,61 @@ for SERVICE_NAME in $SERVICES; do print_warning " No endpoints at all" print_action "Verify service selector matches pod labels" fi + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" \ + --argjson ready_count "$READY_COUNT" --argjson not_ready_count "$NOT_READY_COUNT" \ + --argjson not_ready "$NOT_READY_LIST" \ + '{service: $svc, issue: "no_ready_endpoints", ready_count: $ready_count, not_ready_count: $not_ready_count, not_ready_endpoints: $not_ready}') + add_fact ENDPOINT_FACTS "$FACT" else print_success "Service $SERVICE_NAME: $READY_COUNT ready endpoint(s)" - - # Show details of ready endpoints - echo "$READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - if [[ -n "$IP" ]]; then - if [[ -n "$PORT_NUMBER" ]]; then - print_success " - $POD_NAME -> $IP:$PORT_NUMBER" - else - print_success " - $POD_NAME -> $IP" - fi + echo "$ENDPOINTS_INFO" | jq -r '.subsets[]?.addresses[]? 
| " - \(.targetRef.name // "unknown") -> \(.ip)"' | while IFS= read -r line; do + if [[ -n "$PORT_NUMBER" ]]; then + print_success "${line}:${PORT_NUMBER}" + else + print_success "$line" fi done if [[ $NOT_READY_COUNT -gt 0 ]]; then print_warning " Also has $NOT_READY_COUNT not ready endpoint(s)" - # Show details of not ready endpoints - echo "$NOT_READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - if [[ -n "$IP" ]]; then - if [[ -n "$PORT_NUMBER" ]]; then - print_warning " - $POD_NAME -> $IP:$PORT_NUMBER" - else - print_warning " - $POD_NAME -> $IP" - fi - fi - done print_action "Check pod readiness probes and pod status" fi + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" \ + --argjson ready_count "$READY_COUNT" --argjson not_ready_count "$NOT_READY_COUNT" \ + --argjson ready "$READY_LIST" --argjson not_ready "$NOT_READY_LIST" \ + '{service: $svc, ready_count: $ready_count, not_ready_count: $not_ready_count, ready_endpoints: $ready, not_ready_endpoints: $not_ready}') + add_fact ENDPOINT_FACTS "$FACT" fi done -if [[ $HAS_ISSUES -eq 0 ]]; then - update_check_result --status "success" --evidence "{}" +SERVICE_COUNT=$(echo "$SERVICES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_SERVICES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then + EVIDENCE=$(evidence_json \ + "All $SERVICE_COUNT service(s) have ready endpoints" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array ENDPOINT_FACTS)" --argjson count "$SERVICE_COUNT" '{service_count: $count, services: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $SERVICE_COUNT service(s) without ready endpoints" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array ENDPOINT_FACTS)" \ + --argjson count "$SERVICE_COUNT" \ + --argjson no_resource "$NO_ENDPOINTS_RESOURCE" \ + --argjson no_ready "$NO_READY_COUNT" \ + '{ + 
service_count: $count, + issue_count: ($facts | map(select(.issue != null)) | length), + no_endpoints_resource_count: $no_resource, + no_ready_endpoints_count: $no_ready, + services: $facts + }') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_SERVICES)" "$DETAILS" \ + '["Check pod readiness probes and verify service selector matches pod labels"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/service/service_existence b/k8s/diagnose/service/service_existence index 2ee8783f..d57e3d4f 100644 --- a/k8s/diagnose/service/service_existence +++ b/k8s/diagnose/service/service_existence @@ -7,10 +7,26 @@ SERVICES=$(jq -r '.items[].metadata.name' "$SERVICES_FILE" 2>/dev/null | tr '\n' if [[ -z "$SERVICES" ]]; then print_error "No services found with labels $LABEL_SELECTOR in namespace $NAMESPACE" print_action "Create service resource or verify label selectors" - update_check_result --status "failed" --evidence "{}" + + EVIDENCE=$(evidence_json \ + "No services found in namespace $NAMESPACE" \ + "critical" \ + "[]" \ + "$(jq -nc --arg ls "$LABEL_SELECTOR" --arg ns "$NAMESPACE" '{label_selector: $ls, namespace: $ns}')" \ + '["Create service resource or verify label selectors"]') + update_check_result --status "failed" --evidence "$EVIDENCE" return 1 fi -SERVICE_COUNT=$(echo "$SERVICES" | wc -w) +SERVICE_NAMES_JSON=$(jq -c '[.items[].metadata.name]' "$SERVICES_FILE" 2>/dev/null) +SERVICE_COUNT=$(echo "$SERVICES" | wc -w | tr -d ' ') print_success "Found $SERVICE_COUNT service(s): $SERVICES" -update_check_result --status "success" --evidence "{}" + +EVIDENCE=$(evidence_json \ + "Found $SERVICE_COUNT service(s) in namespace $NAMESPACE" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$SERVICE_COUNT" --argjson names "$SERVICE_NAMES_JSON" --arg ns "$NAMESPACE" \ + '{service_count: $count, service_names: $names, namespace: $ns}')" \ + "[]") +update_check_result --status "success" --evidence "$EVIDENCE" 
diff --git a/k8s/diagnose/service/service_port_configuration b/k8s/diagnose/service/service_port_configuration index 79baa675..098dd4e4 100644 --- a/k8s/diagnose/service/service_port_configuration +++ b/k8s/diagnose/service/service_port_configuration @@ -2,36 +2,36 @@ # Check: Service Port Configuration # Validates service and container port alignment -# Validate services exist require_services || return 0 -# Read services from pre-collected data SERVICES=$(jq -r '.items[].metadata.name' "$SERVICES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_PORT_ISSUES=0 +PORT_FACTS=() +AFFECTED_SERVICES="" + for SERVICE_NAME in $SERVICES; do - # Get service info from pre-collected data SERVICE_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null) - # Get service ports and targetPorts SERVICE_PORTS=$(echo "$SERVICE_INFO" | jq -r '.spec.ports[] | "\(.port):\(.targetPort):\(.name // "unnamed")"') if [[ -z "$SERVICE_PORTS" ]]; then - HAS_PORT_ISSUES=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" print_error "Service $SERVICE_NAME: No ports defined" + FACT=$(jq -nc --arg svc "$SERVICE_NAME" '{service: $svc, issue: "no_ports_defined", ports: []}') + add_fact PORT_FACTS "$FACT" continue fi - # Get service selector to find pods SERVICE_SELECTORS=$(echo "$SERVICE_INFO" | jq -c '.spec.selector') if [[ -z "$SERVICE_SELECTORS" || "$SERVICE_SELECTORS" == "null" ]]; then print_warning "Service $SERVICE_NAME: No selector, skipping port validation" + FACT=$(jq -nc --arg svc "$SERVICE_NAME" '{service: $svc, issue: "no_selector_skipped"}') + add_fact PORT_FACTS "$FACT" continue fi - # Find pods from pre-collected data that match service selectors PODS=$(jq -r --argjson selectors "$SERVICE_SELECTORS" ' .items[] | . 
as $pod | @@ -45,24 +45,24 @@ for SERVICE_NAME in $SERVICES; do if [[ -z "$PODS" ]]; then print_warning "Service $SERVICE_NAME: No pods found to validate ports" + FACT=$(jq -nc --arg svc "$SERVICE_NAME" '{service: $svc, issue: "no_pods_for_validation"}') + add_fact PORT_FACTS "$FACT" continue fi - # Check first pod for port validation FIRST_POD=$(echo "$PODS" | awk '{print $1}') POD_INFO=$(jq --arg name "$FIRST_POD" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) print_info "Service $SERVICE_NAME port configuration:" - # Validate configuration and test connectivity - # Use process substitution to avoid subshell and preserve HAS_PORT_ISSUES updates + PORT_RESULTS=() + SERVICE_HAS_ISSUE=0 + while IFS=':' read -r SERVICE_PORT TARGET_PORT PORT_NAME; do ACTUAL_TARGET_PORT="$TARGET_PORT" CONTAINER_NAME="" - # Check if targetPort is numeric or named if [[ "$TARGET_PORT" =~ ^[0-9]+$ ]]; then - # Numeric targetPort - find which container has this port CONTAINER_INFO=$(echo "$POD_INFO" | jq -r --arg port "$TARGET_PORT" ' .spec.containers[] | select(.ports[]?.containerPort == ($port | tonumber)) | @@ -72,17 +72,22 @@ for SERVICE_NAME in $SERVICES; do if [[ -n "$CONTAINER_INFO" ]]; then CONTAINER_NAME=$(echo "$CONTAINER_INFO" | cut -d':' -f1) print_success " Port $SERVICE_PORT -> $TARGET_PORT ($PORT_NAME): Configuration OK [container: $CONTAINER_NAME]" + PORT_RESULT=$(jq -nc --arg svc_port "$SERVICE_PORT" --arg target "$TARGET_PORT" --arg name "$PORT_NAME" --arg cont "$CONTAINER_NAME" \ + '{service_port: $svc_port, target_port: $target, port_name: $name, container: $cont, status: "ok"}') else - HAS_PORT_ISSUES=1 - # Show available ports per container - AVAILABLE_PORTS=$(echo "$POD_INFO" | jq -r '.spec.containers[] | "\(.name): \([.ports[]?.containerPort] | join(","))"' | tr '\n' '; ') + SERVICE_HAS_ISSUE=1 + AVAILABLE_PORTS=$(echo "$POD_INFO" | jq -c '[.spec.containers[] | {container: .name, ports: [.ports[]?.containerPort]}]') print_error " Port 
$SERVICE_PORT -> $TARGET_PORT ($PORT_NAME): Container port $TARGET_PORT not found" - print_warning " Available ports by container: $AVAILABLE_PORTS" + AVAILABLE_HUMAN=$(echo "$POD_INFO" | jq -r '.spec.containers[] | "\(.name): \([.ports[]?.containerPort] | join(","))"' | tr '\n' '; ') + print_warning " Available ports by container: $AVAILABLE_HUMAN" print_action "Update service targetPort to match container port or fix container port" + PORT_RESULT=$(jq -nc --arg svc_port "$SERVICE_PORT" --arg target "$TARGET_PORT" --arg name "$PORT_NAME" \ + --argjson available "$AVAILABLE_PORTS" \ + '{service_port: $svc_port, target_port: $target, port_name: $name, status: "container_port_not_found", available_ports_by_container: $available}') + add_fact PORT_RESULTS "$PORT_RESULT" continue fi else - # Named port - find which container has this named port CONTAINER_INFO=$(echo "$POD_INFO" | jq -r --arg portname "$TARGET_PORT" ' .spec.containers[] | select(.ports[]? | select(.name == $portname)) | @@ -93,34 +98,63 @@ for SERVICE_NAME in $SERVICES; do CONTAINER_NAME=$(echo "$CONTAINER_INFO" | cut -d':' -f1) ACTUAL_TARGET_PORT=$(echo "$CONTAINER_INFO" | cut -d':' -f2) print_success " Port $SERVICE_PORT -> $TARGET_PORT ($PORT_NAME): Resolves to $ACTUAL_TARGET_PORT [container: $CONTAINER_NAME]" + PORT_RESULT=$(jq -nc --arg svc_port "$SERVICE_PORT" --arg target "$TARGET_PORT" --arg name "$PORT_NAME" --arg cont "$CONTAINER_NAME" --arg actual "$ACTUAL_TARGET_PORT" \ + '{service_port: $svc_port, target_port: $target, port_name: $name, container: $cont, resolved_port: $actual, status: "ok"}') else - HAS_PORT_ISSUES=1 + SERVICE_HAS_ISSUE=1 print_error " Port $SERVICE_PORT -> $TARGET_PORT ($PORT_NAME): Named port not found in containers" print_action "Define named port in container spec or use numeric targetPort" + PORT_RESULT=$(jq -nc --arg svc_port "$SERVICE_PORT" --arg target "$TARGET_PORT" --arg name "$PORT_NAME" \ + '{service_port: $svc_port, target_port: $target, port_name: $name, 
status: "named_port_not_found"}') + add_fact PORT_RESULTS "$PORT_RESULT" continue fi fi - # Active connectivity check - verify application is listening on the port + # Active connectivity check print_info " Testing connectivity to port $ACTUAL_TARGET_PORT in container '$CONTAINER_NAME'..." - - # Try to connect to the port from inside the specific container - CONNECTIVITY_TEST=$(kubectl exec "$FIRST_POD" -n "$NAMESPACE" -c "$CONTAINER_NAME" -- timeout 2 sh -c "command -v nc >/dev/null 2>&1 && nc -z localhost $ACTUAL_TARGET_PORT || (command -v curl >/dev/null 2>&1 && curl -s --max-time 1 localhost:$ACTUAL_TARGET_PORT >/dev/null)" 2>&1) + kubectl exec "$FIRST_POD" -n "$NAMESPACE" -c "$CONTAINER_NAME" -- timeout 2 sh -c "command -v nc >/dev/null 2>&1 && nc -z localhost $ACTUAL_TARGET_PORT || (command -v curl >/dev/null 2>&1 && curl -s --max-time 1 localhost:$ACTUAL_TARGET_PORT >/dev/null)" >/dev/null 2>&1 CONNECTIVITY_EXIT_CODE=$? if [[ $CONNECTIVITY_EXIT_CODE -eq 0 ]]; then print_success " ✓ Port $ACTUAL_TARGET_PORT is accepting connections" + PORT_RESULT=$(echo "$PORT_RESULT" | jq '. + {connectivity: "ok"}') else - HAS_PORT_ISSUES=1 + SERVICE_HAS_ISSUE=1 print_error " ✗ Port $ACTUAL_TARGET_PORT is NOT accepting connections" print_warning " Configuration is correct but application may not be listening on port $ACTUAL_TARGET_PORT" print_info " Check logs: kubectl logs $FIRST_POD -n $NAMESPACE -c $CONTAINER_NAME" + PORT_RESULT=$(echo "$PORT_RESULT" | jq '. 
+ {connectivity: "refused", status: "not_listening"}') fi + + add_fact PORT_RESULTS "$PORT_RESULT" done < <(echo "$SERVICE_PORTS") + + [[ $SERVICE_HAS_ISSUE -eq 1 ]] && mark_affected AFFECTED_SERVICES "$SERVICE_NAME" + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" --argjson ports "$(facts_to_json_array PORT_RESULTS)" \ + '{service: $svc, ports: $ports}') + add_fact PORT_FACTS "$FACT" done -if [[ $HAS_PORT_ISSUES -eq 0 ]]; then - update_check_result --status "success" --evidence "{}" +SERVICE_COUNT=$(echo "$SERVICES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_SERVICES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then + EVIDENCE=$(evidence_json \ + "All $SERVICE_COUNT service(s) have valid port configuration" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array PORT_FACTS)" --argjson count "$SERVICE_COUNT" '{service_count: $count, services: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $SERVICE_COUNT service(s) have port configuration issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array PORT_FACTS)" \ + --argjson count "$SERVICE_COUNT" \ + '{service_count: $count, issue_count: ($facts | map(select(.ports[]? 
| .status != "ok")) | length), services: $facts}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_SERVICES)" "$DETAILS" \ + '["Verify container is listening on targetPort and that ports/protocols match"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/service/service_selector_match b/k8s/diagnose/service/service_selector_match index 84667a7e..a69a02a8 100644 --- a/k8s/diagnose/service/service_selector_match +++ b/k8s/diagnose/service/service_selector_match @@ -2,32 +2,29 @@ # Check: Service Selector Match # Validates service selectors match pod labels -# Validate services exist require_services || return 0 -# Read services from pre-collected data SERVICES=$(jq -r '.items[].metadata.name' "$SERVICES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_MISMATCH=0 +SELECTOR_FACTS=() +AFFECTED_SERVICES="" + for SERVICE_NAME in $SERVICES; do - # Get service info from pre-collected data SERVICE_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null) - # Get service selectors SERVICE_SELECTORS=$(echo "$SERVICE_INFO" | jq -r '.spec.selector | to_entries | map("\(.key)=\(.value)") | join(",")') + SELECTOR_OBJECT=$(echo "$SERVICE_INFO" | jq -c '.spec.selector') if [[ -z "$SERVICE_SELECTORS" || "$SERVICE_SELECTORS" == "null" ]]; then - HAS_MISMATCH=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" print_error "Service $SERVICE_NAME: No selector defined" + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" '{service: $svc, issue: "no_selector"}') + add_fact SELECTOR_FACTS "$FACT" continue fi - # Find pods that match service selector from pre-collected data - # Get service selector as a proper object - SELECTOR_OBJECT=$(echo "$SERVICE_INFO" | jq -c '.spec.selector') - - # Match pods where all service selectors are present in pod labels MATCHING_PODS=$(jq -r --argjson selectors "$SELECTOR_OBJECT" ' .items[] | . 
as $pod | @@ -40,30 +37,33 @@ for SERVICE_NAME in $SERVICES; do ' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') if [[ -z "$MATCHING_PODS" ]]; then - HAS_MISMATCH=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" print_error "Service $SERVICE_NAME: No pods match selector ($SERVICE_SELECTORS)" - # Show what pods exist with deployment_id from pre-collected data EXISTING_PODS=$(jq -r --arg dep_id "$DEPLOYMENT_ID" '.items[] | select(.metadata.labels.deployment_id == $dep_id) | .metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') + MISMATCH_FACTS=() + MISMATCH_FACTS_JSON="[]" + if [[ -n "$EXISTING_PODS" ]]; then print_warning " Existing pods with deployment_id: $EXISTING_PODS" - # Show first pod's labels for comparison FIRST_POD=$(echo "$EXISTING_PODS" | awk '{print $1}') POD_LABELS=$(jq -r --arg pod "$FIRST_POD" '.items[] | select(.metadata.name == $pod) | .metadata.labels | to_entries | map("\(.key)=\(.value)") | join(",")' "$PODS_FILE" 2>/dev/null) print_info " Pod labels: $POD_LABELS" - # Check each selector against pod labels and show only mismatches - MISMATCHES="" MATCH_COUNT=0 SELECTOR_COUNT=0 while IFS='=' read -r key value; do SELECTOR_COUNT=$((SELECTOR_COUNT + 1)) POD_VALUE=$(jq -r --arg pod "$FIRST_POD" --arg key "$key" '.items[] | select(.metadata.name == $pod) | .metadata.labels[$key] // "MISSING"' "$PODS_FILE" 2>/dev/null) if [[ "$POD_VALUE" == "MISSING" ]]; then - MISMATCHES="${MISMATCHES} ✗ $key: selector='$value', pod=MISSING\n" + MM=$(jq -nc --arg key "$key" --arg expected "$value" --arg actual "MISSING" \ + '{key: $key, selector_value: $expected, pod_value: $actual, kind: "missing"}') + add_fact MISMATCH_FACTS "$MM" elif [[ "$POD_VALUE" != "$value" ]]; then - MISMATCHES="${MISMATCHES} ✗ $key: selector='$value', pod='$POD_VALUE'\n" + MM=$(jq -nc --arg key "$key" --arg expected "$value" --arg actual "$POD_VALUE" \ + '{key: $key, selector_value: $expected, pod_value: $actual, kind: "mismatch"}') + add_fact MISMATCH_FACTS "$MM" else 
MATCH_COUNT=$((MATCH_COUNT + 1)) fi @@ -71,35 +71,50 @@ for SERVICE_NAME in $SERVICES; do print_info " Selector check: $MATCH_COUNT/$SELECTOR_COUNT labels match" - if [[ -n "$MISMATCHES" ]]; then + MISMATCH_FACTS_JSON=$(facts_to_json_array MISMATCH_FACTS) + if [[ ${#MISMATCH_FACTS[@]} -gt 0 ]]; then print_warning " Selector mismatches:" - echo -e "$MISMATCHES" - else - print_warning " All selectors match but jq query failed - checking jq logic..." - # Debug: try the query manually to see what happens - DEBUG_RESULT=$(jq --argjson selectors "$SELECTOR_OBJECT" --arg pod "$FIRST_POD" ' - .items[] | select(.metadata.name == $pod) | - . as $p | - { - pod: .metadata.name, - matches: ($selectors | to_entries | all(.key as $k | .value as $v | - $p.metadata.labels[$k] == $v - )) - } - ' "$PODS_FILE" 2>&1) - print_info " Debug result: $DEBUG_RESULT" + echo "$MISMATCH_FACTS_JSON" | jq -r '.[] | " ✗ " + .key + ": selector=" + .selector_value + ", pod=" + .pod_value' | while IFS= read -r line; do + print_warning "$line" + done fi print_action "Verify pod labels match service selector" fi + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" --argjson sel "$SELECTOR_OBJECT" \ + --argjson mismatches "$MISMATCH_FACTS_JSON" \ + --arg existing_pods "$EXISTING_PODS" \ + '{service: $svc, issue: "no_matching_pods", selector: $sel, existing_pods_with_deployment_id: ($existing_pods | split(" ") | map(select(length > 0))), label_mismatches: $mismatches}') + add_fact SELECTOR_FACTS "$FACT" else - POD_COUNT=$(echo "$MATCHING_PODS" | wc -w) + POD_COUNT=$(echo "$MATCHING_PODS" | wc -w | tr -d ' ') print_success "Service $SERVICE_NAME: Selector matches $POD_COUNT pod(s)" + FACT=$(jq -nc --arg svc "$SERVICE_NAME" --argjson sel "$SELECTOR_OBJECT" \ + --argjson matched "$POD_COUNT" \ + '{service: $svc, selector: $sel, matched_pod_count: $matched}') + add_fact SELECTOR_FACTS "$FACT" fi done -if [[ $HAS_MISMATCH -eq 0 ]]; then - update_check_result --status "success" --evidence "{}" +SERVICE_COUNT=$(echo 
"$SERVICES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_SERVICES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then + EVIDENCE=$(evidence_json \ + "All $SERVICE_COUNT service(s) match at least one pod" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array SELECTOR_FACTS)" --argjson count "$SERVICE_COUNT" '{service_count: $count, services: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $SERVICE_COUNT service(s) have selector issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array SELECTOR_FACTS)" \ + --argjson count "$SERVICE_COUNT" \ + '{service_count: $count, issue_count: ($facts | map(select(.issue != null)) | length), services: $facts}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_SERVICES)" "$DETAILS" \ + '["Verify pod labels match service selector"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/service/service_type_validation b/k8s/diagnose/service/service_type_validation index 8381c2d7..feacbbbe 100644 --- a/k8s/diagnose/service/service_type_validation +++ b/k8s/diagnose/service/service_type_validation @@ -2,75 +2,111 @@ # Check: Service Type Validation # Verifies service type is correctly configured -# Validate services exist require_services || return 0 -# Read services from pre-collected data SERVICES=$(jq -r '.items[].metadata.name' "$SERVICES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +SERVICE_FACTS=() +AFFECTED_SERVICES="" + for SERVICE_NAME in $SERVICES; do - # Get service info from pre-collected data SERVICE_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null) - SERVICE_TYPE=$(echo "$SERVICE_INFO" | jq -r '.spec.type') print_info "Service $SERVICE_NAME: Type=$SERVICE_TYPE" + # Per-service 
fact (regardless of issue) case "$SERVICE_TYPE" in ClusterIP) CLUSTER_IP=$(echo "$SERVICE_INFO" | jq -r '.spec.clusterIP') if [[ "$CLUSTER_IP" == "None" ]]; then print_success " Headless service (ClusterIP: None)" + SVC_FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" \ + '{service: $svc, type: $type, cluster_ip: "None", headless: true}') else print_success " Internal service with ClusterIP: $CLUSTER_IP" + SVC_FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" --arg ip "$CLUSTER_IP" \ + '{service: $svc, type: $type, cluster_ip: $ip, headless: false}') fi ;; NodePort) - NODE_PORTS=$(echo "$SERVICE_INFO" | jq -r '.spec.ports[] | "\(.port):\(.nodePort)"') + NODE_PORTS_LIST=$(echo "$SERVICE_INFO" | jq -c '[.spec.ports[] | {port: .port, node_port: .nodePort}]') print_success " NodePort service exposed on:" - echo "$NODE_PORTS" | while IFS=':' read -r PORT NODE_PORT; do - print_info " Port $PORT -> NodePort $NODE_PORT" + echo "$SERVICE_INFO" | jq -r '.spec.ports[] | " Port \(.port) -> NodePort \(.nodePort)"' | while IFS= read -r line; do + print_info "$line" done + SVC_FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" --argjson ports "$NODE_PORTS_LIST" \ + '{service: $svc, type: $type, node_ports: $ports}') ;; LoadBalancer) EXTERNAL_IP=$(echo "$SERVICE_INFO" | jq -r '.status.loadBalancer.ingress[0].ip // .status.loadBalancer.ingress[0].hostname // "Pending"') if [[ "$EXTERNAL_IP" == "Pending" || "$EXTERNAL_IP" == "null" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" print_warning " LoadBalancer IP/Hostname is Pending" print_info " This may take a few minutes to provision" - # Check for events related to LoadBalancer from pre-collected data LB_EVENTS=$(jq -r --arg name "$SERVICE_NAME" '.items[] | select(.involvedObject.name == $name and (.message | test("loadbalancer|external"; "i"))) | "\(.lastTimestamp) \(.message)"' "$EVENTS_FILE" 2>/dev/null | tail -n 3) if [[ -n "$LB_EVENTS" ]]; then 
print_info " Recent events:" echo "$LB_EVENTS" | sed 's/^/ /' fi print_action "Wait for provisioning or check cloud provider logs for errors" + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" \ + '{service: $svc, type: $type, issue: "loadbalancer_pending"}') + add_fact ISSUE_FACTS "$FACT" + SVC_FACT="$FACT" else print_success " LoadBalancer available at: $EXTERNAL_IP" + SVC_FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" --arg addr "$EXTERNAL_IP" \ + '{service: $svc, type: $type, external_address: $addr}') fi ;; ExternalName) EXTERNAL_NAME=$(echo "$SERVICE_INFO" | jq -r '.spec.externalName') print_success " ExternalName service pointing to: $EXTERNAL_NAME" + SVC_FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" --arg ext "$EXTERNAL_NAME" \ + '{service: $svc, type: $type, external_name: $ext}') ;; *) - HAS_ISSUES=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" print_error " Unknown service type: $SERVICE_TYPE" print_action "Use valid service type (ClusterIP, NodePort, LoadBalancer, or ExternalName)" + FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "${SERVICE_TYPE:-null}" \ + '{service: $svc, type: $type, issue: "unknown_service_type"}') + add_fact ISSUE_FACTS "$FACT" + SVC_FACT="$FACT" ;; esac + add_fact SERVICE_FACTS "$SVC_FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - update_check_result --status "success" --evidence "{}" +ISSUE_COUNT=$(echo "$AFFECTED_SERVICES" | wc -w | tr -d ' ') +SERVICE_COUNT=$(echo "$SERVICES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then + EVIDENCE=$(evidence_json \ + "All $SERVICE_COUNT service(s) have valid types" \ + "info" \ + "[]" \ + "$(jq -nc --argjson services "$(facts_to_json_array SERVICE_FACTS)" --argjson count "$SERVICE_COUNT" '{service_count: $count, services: $services}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of 
$SERVICE_COUNT service(s) have type issues" + DETAILS=$(jq -nc \ + --argjson services "$(facts_to_json_array SERVICE_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$SERVICE_COUNT" \ + '{service_count: $count, issue_count: ($issues | length), services: $services, issues: $issues}') + EVIDENCE=$(evidence_json "$SUMMARY" "warning" "$(set_to_json_array AFFECTED_SERVICES)" "$DETAILS" \ + '["Wait for provisioning or check cloud provider logs for errors"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/tests/build_context.bats b/k8s/diagnose/tests/build_context.bats index 46eaa5e2..dfa68081 100644 --- a/k8s/diagnose/tests/build_context.bats +++ b/k8s/diagnose/tests/build_context.bats @@ -25,12 +25,15 @@ setup() { *"app.kubernetes.io/name=aws-load-balancer-controller"*) echo '{"items":[]}' ;; *"app=aws-alb-ingress-controller"*) echo '{"items":[]}' ;; *"get pods"*) echo '{"items":[{"metadata":{"name":"test-pod"}}]}' ;; + *"get deployment"*) echo '{"items":[{"metadata":{"name":"test-deployment"}}]}' ;; + *"get rs"*) echo '{"items":[{"metadata":{"name":"test-rs"}}]}' ;; *"get services"*) echo '{"items":[{"metadata":{"name":"test-service"}}]}' ;; *"get endpoints"*) echo '{"items":[]}' ;; *"get ingress"*) echo '{"items":[]}' ;; *"get secrets"*) echo '{"items":[]}' ;; *"get ingressclass"*) echo '{"items":[]}' ;; *"get events"*) echo '{"items":[]}' ;; + *"describe pod"*) echo "Pod describe output" ;; *"logs"*) echo "log line 1" ;; *) echo '{"items":[]}' ;; esac @@ -99,13 +102,19 @@ run_build_context() { assert_directory_exists "$NP_OUTPUT_DIR/data" assert_directory_exists "$NP_OUTPUT_DIR/data/alb_controller_logs" + assert_directory_exists "$POD_LOGS_DIR" + assert_directory_exists "$POD_DESCRIBE_DIR" # All resource files should exist and be valid JSON - for file in "$PODS_FILE" "$SERVICES_FILE" "$ENDPOINTS_FILE" "$INGRESSES_FILE" \ - "$SECRETS_FILE" "$INGRESSCLASSES_FILE" "$EVENTS_FILE" 
"$ALB_CONTROLLER_PODS_FILE"; do + for file in "$PODS_FILE" "$DEPLOYMENTS_FILE" "$REPLICASETS_FILE" "$SERVICES_FILE" \ + "$ENDPOINTS_FILE" "$INGRESSES_FILE" "$SECRETS_FILE" "$INGRESSCLASSES_FILE" \ + "$EVENTS_FILE" "$ALB_CONTROLLER_PODS_FILE"; do assert_file_exists "$file" jq . "$file" >/dev/null done + + # problematic_pods.txt is plain text, just assert it exists + assert_file_exists "$PROBLEMATIC_PODS_FILE" } @test "build_context: secrets.json excludes sensitive data field" { @@ -183,3 +192,264 @@ run_build_context() { log_content=$(cat "$ALB_CONTROLLER_LOGS_DIR/alb-controller-pod.log") assert_contains "$log_content" "controller log line" } + +# ============================================================================= +# Problematic Pod Detection +# ============================================================================= +@test "build_context: healthy running pod is not flagged as problematic" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"healthy-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{ + "phase":"Running", + "conditions":[{"type":"Ready","status":"True"}], + "containerStatuses":[{"name":"app","restartCount":0,"state":{"running":{}},"lastState":{}}] + } + }]}' + ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + problematic=$(cat "$PROBLEMATIC_PODS_FILE") + assert_empty "$problematic" +} + +@test "build_context: pod in CrashLoopBackOff is flagged as problematic" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"crash-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{ + "phase":"Running", + "conditions":[{"type":"Ready","status":"False"}], + "containerStatuses":[{"name":"app","restartCount":5,"state":{"waiting":{"reason":"CrashLoopBackOff"}},"lastState":{"terminated":{"exitCode":1}}}] + } + }]}' + ;; + *"describe pod crash-pod"*) echo "describe output for crash-pod" ;; + *"logs"*"crash-pod"*"--previous"*) 
echo "previous crash log" ;; + *"logs"*"crash-pod"*) echo "current log" ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + problematic=$(cat "$PROBLEMATIC_PODS_FILE") + assert_contains "$problematic" "crash-pod" +} + +@test "build_context: pod in Pending phase is flagged as problematic" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"pending-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Pending"} + }]}' + ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + problematic=$(cat "$PROBLEMATIC_PODS_FILE") + assert_contains "$problematic" "pending-pod" +} + +@test "build_context: pod with terminating deletionTimestamp is flagged as problematic" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"terminating-pod","deletionTimestamp":"2026-01-01T00:00:00Z"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Running","conditions":[{"type":"Ready","status":"True"}],"containerStatuses":[{"name":"app","restartCount":0,"state":{"running":{}},"lastState":{}}]} + }]}' + ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + problematic=$(cat "$PROBLEMATIC_PODS_FILE") + assert_contains "$problematic" "terminating-pod" +} + +@test "build_context: pod with failed init container is flagged as problematic" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"init-fail-pod"}, + "spec":{"initContainers":[{"name":"init-db"}],"containers":[{"name":"app"}]}, + "status":{ + "phase":"Pending", + "initContainerStatuses":[{"name":"init-db","restartCount":3,"state":{"waiting":{"reason":"CrashLoopBackOff"}},"lastState":{"terminated":{"exitCode":1}}}] + } + }]}' + ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + problematic=$(cat "$PROBLEMATIC_PODS_FILE") + assert_contains "$problematic" "init-fail-pod" +} 
+ +# ============================================================================= +# Pod Logs and Describe Capture +# ============================================================================= +@test "build_context: captures describe and current logs for problematic pod" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"crash-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{ + "phase":"Running", + "containerStatuses":[{"name":"app","restartCount":2,"state":{"waiting":{"reason":"CrashLoopBackOff"}},"lastState":{"terminated":{"exitCode":1}}}] + } + }]}' + ;; + *"describe pod crash-pod"*) echo "describe output for crash-pod" ;; + *"logs"*"crash-pod"*"--previous"*) echo "previous crash log" ;; + *"logs"*"crash-pod"*) echo "current log line" ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + assert_file_exists "$POD_DESCRIBE_DIR/crash-pod.txt" + describe_content=$(cat "$POD_DESCRIBE_DIR/crash-pod.txt") + assert_contains "$describe_content" "describe output for crash-pod" + + assert_file_exists "$POD_LOGS_DIR/crash-pod.app.log" + current_log=$(cat "$POD_LOGS_DIR/crash-pod.app.log") + assert_contains "$current_log" "current log line" + + assert_file_exists "$POD_LOGS_DIR/crash-pod.app.previous.log" + previous_log=$(cat "$POD_LOGS_DIR/crash-pod.app.previous.log") + assert_contains "$previous_log" "previous crash log" +} + +@test "build_context: skips empty previous logs (container never crashed before)" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"new-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{ + "phase":"Pending", + "containerStatuses":[{"name":"app","restartCount":0,"state":{"waiting":{"reason":"ImagePullBackOff"}},"lastState":{}}] + } + }]}' + ;; + *"describe pod new-pod"*) echo "describe output" ;; + *"logs"*"new-pod"*"--previous"*) return 1 ;; + *"logs"*"new-pod"*) echo "current log" ;; + *) echo '{"items":[]}' ;; + 
esac + } + export -f kubectl + + run_build_context + + # Current log should be saved + assert_file_exists "$POD_LOGS_DIR/new-pod.app.log" + + # Previous log should NOT exist (kubectl returned no output) + [ ! -f "$POD_LOGS_DIR/new-pod.app.previous.log" ] +} + +@test "build_context: captures logs for all containers including init containers" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"multi-container-pod"}, + "spec":{ + "initContainers":[{"name":"init-db"}], + "containers":[{"name":"app"},{"name":"sidecar"}] + }, + "status":{ + "phase":"Pending", + "initContainerStatuses":[{"name":"init-db","restartCount":1,"state":{"waiting":{"reason":"CrashLoopBackOff"}},"lastState":{"terminated":{"exitCode":1}}}] + } + }]}' + ;; + *"describe pod multi-container-pod"*) echo "describe output" ;; + *"logs"*"-c init-db"*"--previous"*) echo "init previous" ;; + *"logs"*"-c init-db"*) echo "init current" ;; + *"logs"*"-c app"*"--previous"*) return 1 ;; + *"logs"*"-c app"*) echo "app current" ;; + *"logs"*"-c sidecar"*"--previous"*) return 1 ;; + *"logs"*"-c sidecar"*) echo "sidecar current" ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + # All three containers' current logs should exist + assert_file_exists "$POD_LOGS_DIR/multi-container-pod.init-db.log" + assert_file_exists "$POD_LOGS_DIR/multi-container-pod.app.log" + assert_file_exists "$POD_LOGS_DIR/multi-container-pod.sidecar.log" + + # Only init-db has a previous log + assert_file_exists "$POD_LOGS_DIR/multi-container-pod.init-db.previous.log" + [ ! -f "$POD_LOGS_DIR/multi-container-pod.app.previous.log" ] + [ ! 
-f "$POD_LOGS_DIR/multi-container-pod.sidecar.previous.log" ] +} + +@test "build_context: respects POD_LOG_TAIL_LINES env var" { + export POD_LOG_TAIL_LINES=42 + + # Capture the kubectl invocation to verify --tail value + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"crash-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Pending","containerStatuses":[{"name":"app","restartCount":1,"state":{"waiting":{"reason":"ImagePullBackOff"}},"lastState":{}}]} + }]}' + ;; + *"logs"*"--tail=42"*) echo "tail-42-honored" ;; + *"logs"*) echo "WRONG: tail value was not 42" ;; + *"describe"*) echo "describe" ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + log_content=$(cat "$POD_LOGS_DIR/crash-pod.app.log") + assert_contains "$log_content" "tail-42-honored" + + unset POD_LOG_TAIL_LINES +} diff --git a/k8s/diagnose/tests/diagnose_utils.bats b/k8s/diagnose/tests/diagnose_utils.bats index 4080bd72..bb218b81 100644 --- a/k8s/diagnose/tests/diagnose_utils.bats +++ b/k8s/diagnose/tests/diagnose_utils.bats @@ -77,6 +77,66 @@ strip_ansi() { assert_contains "$clean" "🔧 Action message" } +# ============================================================================= +# evidence_json +# ============================================================================= +@test "evidence_json: builds full schema from all arguments" { + result=$(evidence_json "Test summary" "critical" '["pod-1","pod-2"]' '{"k":"v"}' '["fix it"]') + + summary=$(echo "$result" | jq -r '.summary') + assert_equal "$summary" "Test summary" + + severity=$(echo "$result" | jq -r '.severity') + assert_equal "$severity" "critical" + + affected=$(echo "$result" | jq -c '.affected') + assert_equal "$affected" '["pod-1","pod-2"]' + + details_k=$(echo "$result" | jq -r '.details.k') + assert_equal "$details_k" "v" + + action_0=$(echo "$result" | jq -r '.suggested_actions[0]') + assert_equal "$action_0" "fix it" +} + +@test 
"evidence_json: applies sane defaults for empty optional fields" { + result=$(evidence_json "Quick check" "info" "" "" "") + + affected=$(echo "$result" | jq -c '.affected') + assert_equal "$affected" "[]" + + details=$(echo "$result" | jq -c '.details') + assert_equal "$details" "{}" + + actions=$(echo "$result" | jq -c '.suggested_actions') + assert_equal "$actions" "[]" +} + +@test "evidence_json: emits valid JSON consumable by update_check_result" { + result=$(evidence_json "S" "warning" '["x"]' '{"a":1}' '["b"]') + + # Should be parseable by jq without errors + parsed=$(echo "$result" | jq -c .) + assert_equal "$parsed" '{"summary":"S","severity":"warning","affected":["x"],"details":{"a":1},"suggested_actions":["b"]}' +} + +# ============================================================================= +# exit_code_meaning +# ============================================================================= +@test "exit_code_meaning: maps known codes" { + assert_equal "$(exit_code_meaning 0)" "Clean exit (container finished successfully)" + assert_equal "$(exit_code_meaning 1)" "Application error" + assert_equal "$(exit_code_meaning 137)" "OOMKilled (out of memory)" + assert_equal "$(exit_code_meaning 139)" "SIGSEGV (segmentation fault)" + assert_equal "$(exit_code_meaning 143)" "SIGTERM (graceful termination)" +} + +@test "exit_code_meaning: returns Unknown for unmapped codes" { + assert_equal "$(exit_code_meaning 42)" "Unknown" + assert_equal "$(exit_code_meaning N/A)" "Unknown" + assert_equal "$(exit_code_meaning '')" "Unknown" +} + # ============================================================================= # require_resources # ============================================================================= @@ -97,6 +157,41 @@ strip_ansi() { assert_contains "$clean" "⚠ No pods found with labels app=test in namespace default, check was skipped." 
} +@test "require_resources: emits skipped evidence following the schema" { + # Capture the evidence passed to update_check_result + local captured_evidence_file="$(mktemp)" + update_check_result() { + while [[ $# -gt 0 ]]; do + case "$1" in + --evidence) echo "$2" > "$captured_evidence_file"; shift 2 ;; + *) shift ;; + esac + done + return 0 + } + export -f update_check_result + + require_resources "pods" "" "scope_id=999" "production" || true + + # Validate the schema of the captured evidence + local evidence + evidence=$(cat "$captured_evidence_file") + + severity=$(echo "$evidence" | jq -r '.severity') + assert_equal "$severity" "info" + + summary=$(echo "$evidence" | jq -r '.summary') + assert_contains "$summary" "skipped" + + resource_type=$(echo "$evidence" | jq -r '.details.resource_type') + assert_equal "$resource_type" "pods" + + label_selector=$(echo "$evidence" | jq -r '.details.label_selector') + assert_equal "$label_selector" "scope_id=999" + + rm -f "$captured_evidence_file" +} + # ============================================================================= # require_pods / require_services / require_ingresses # ============================================================================= @@ -255,7 +350,7 @@ strip_ansi() { # ============================================================================= # update_check_result - Log Limits # ============================================================================= -@test "update_check_result: limits logs to 20 lines" { +@test "update_check_result: limits logs to 20 lines by default" { for i in {1..30}; do echo "log line $i" >> "$SCRIPT_LOG_FILE" done @@ -266,6 +361,33 @@ strip_ansi() { [ "$logs_count" -le 20 ] } +@test "update_check_result: --log-tail-lines overrides the default cap" { + for i in {1..100}; do + echo "log line $i" >> "$SCRIPT_LOG_FILE" + done + + update_check_result --status "success" --evidence "{}" --log-tail-lines 80 + + logs_count=$(jq -r '.logs | length' "$SCRIPT_OUTPUT_FILE") + 
[ "$logs_count" = "80" ] + # Last entry should be the most recent line (line 100), oldest in window is line 21 + [ "$(jq -r '.logs[-1]' "$SCRIPT_OUTPUT_FILE")" = "log line 100" ] + [ "$(jq -r '.logs[0]' "$SCRIPT_OUTPUT_FILE")" = "log line 21" ] +} + +@test "update_check_result: --log-tail-lines below total preserves the most recent N lines" { + for i in {1..10}; do + echo "log line $i" >> "$SCRIPT_LOG_FILE" + done + + update_check_result --status "success" --evidence "{}" --log-tail-lines 5 + + logs_count=$(jq -r '.logs | length' "$SCRIPT_OUTPUT_FILE") + [ "$logs_count" = "5" ] + [ "$(jq -r '.logs[0]' "$SCRIPT_OUTPUT_FILE")" = "log line 6" ] + [ "$(jq -r '.logs[-1]' "$SCRIPT_OUTPUT_FILE")" = "log line 10" ] +} + # ============================================================================= # notify_results # ============================================================================= diff --git a/k8s/diagnose/tests/evidence_schema.bats b/k8s/diagnose/tests/evidence_schema.bats new file mode 100644 index 00000000..5309ba2d --- /dev/null +++ b/k8s/diagnose/tests/evidence_schema.bats @@ -0,0 +1,453 @@ +#!/usr/bin/env bats +# ============================================================================= +# Cross-cutting schema validation for all migrated checks. +# Verifies every check writes evidence following the documented schema: +# { summary, severity, affected, details, suggested_actions } +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + source "$BATS_TEST_DIRNAME/../utils/diagnose_utils" + + export NAMESPACE="test-ns" + export LABEL_SELECTOR="app=test" + export SCOPE_LABEL_SELECTOR="scope_id=123" + export DEPLOYMENT_ID="deploy-1" + export NP_OUTPUT_DIR="$(mktemp -d)" + export SCRIPT_OUTPUT_FILE="$(mktemp)" + export SCRIPT_LOG_FILE="$(mktemp)" + echo '{"status":"pending","evidence":{},"logs":[]}' > "$SCRIPT_OUTPUT_FILE" + + # Set up empty data files so every check can require_* + export PODS_FILE="$(mktemp)" + export SERVICES_FILE="$(mktemp)" + export ENDPOINTS_FILE="$(mktemp)" + export INGRESSES_FILE="$(mktemp)" + export SECRETS_FILE="$(mktemp)" + export INGRESSCLASSES_FILE="$(mktemp)" + export EVENTS_FILE="$(mktemp)" + export ALB_CONTROLLER_PODS_FILE="$(mktemp)" + export ALB_CONTROLLER_LOGS_DIR="$(mktemp -d)" + for f in "$PODS_FILE" "$SERVICES_FILE" "$ENDPOINTS_FILE" "$INGRESSES_FILE" \ + "$SECRETS_FILE" "$INGRESSCLASSES_FILE" "$EVENTS_FILE" "$ALB_CONTROLLER_PODS_FILE"; do + echo '{"items":[]}' > "$f" + done + + kubectl() { return 0; } + export -f kubectl +} + +teardown() { + rm -rf "$NP_OUTPUT_DIR" "$ALB_CONTROLLER_LOGS_DIR" + rm -f "$SCRIPT_OUTPUT_FILE" "$SCRIPT_LOG_FILE" "$PODS_FILE" "$SERVICES_FILE" \ + "$ENDPOINTS_FILE" "$INGRESSES_FILE" "$SECRETS_FILE" "$INGRESSCLASSES_FILE" \ + "$EVENTS_FILE" "$ALB_CONTROLLER_PODS_FILE" + unset -f kubectl +} + +reset_output() { + echo '{"status":"pending","evidence":{},"logs":[]}' > "$SCRIPT_OUTPUT_FILE" +} + +# Assert that the evidence object on $SCRIPT_OUTPUT_FILE has the canonical schema: +# summary (string), severity in {critical, warning, info}, +# affected (array), details (object), suggested_actions (array) +assert_evidence_schema() { + local check_name="$1" + + local summary severity affected_kind details_kind actions_kind + summary=$(jq -r '.evidence.summary // empty' "$SCRIPT_OUTPUT_FILE") + severity=$(jq -r '.evidence.severity // empty' "$SCRIPT_OUTPUT_FILE") + affected_kind=$(jq -r 
'.evidence.affected | type' "$SCRIPT_OUTPUT_FILE") + details_kind=$(jq -r '.evidence.details | type' "$SCRIPT_OUTPUT_FILE") + actions_kind=$(jq -r '.evidence.suggested_actions | type' "$SCRIPT_OUTPUT_FILE") + + [[ -n "$summary" ]] || { + echo "[$check_name] missing evidence.summary" + cat "$SCRIPT_OUTPUT_FILE" + return 1 + } + + case "$severity" in + critical|warning|info) ;; + *) echo "[$check_name] invalid severity: '$severity'"; return 1 ;; + esac + + [[ "$affected_kind" == "array" ]] || { echo "[$check_name] evidence.affected must be array, got $affected_kind"; return 1; } + [[ "$details_kind" == "object" ]] || { echo "[$check_name] evidence.details must be object, got $details_kind"; return 1; } + [[ "$actions_kind" == "array" ]] || { echo "[$check_name] evidence.suggested_actions must be array, got $actions_kind"; return 1; } +} + +# ============================================================================= +# Schema validation: skipped path (require_*) +# All checks that call require_pods/services/ingresses must produce schema +# evidence when the resource list is empty. 
+# ============================================================================= +SCOPE_CHECKS_REQUIRE_PODS=( + image_pull_status memory_limits_check resource_availability storage_mounting + container_port_health health_probe_endpoints pod_readiness container_crash_detection +) + +SERVICE_CHECKS_REQUIRE_SERVICES=( + service_selector_match service_endpoints service_port_configuration service_type_validation +) + +NETWORKING_CHECKS_REQUIRE_INGRESSES=( + ingress_class_validation ingress_host_rules ingress_backend_service + ingress_tls_configuration ingress_controller_sync alb_capacity_check +) + +@test "schema: scope checks emit valid skipped evidence when no pods" { + for check in "${SCOPE_CHECKS_REQUIRE_PODS[@]}"; do + reset_output + source "$BATS_TEST_DIRNAME/../scope/$check" || true + assert_evidence_schema "scope/$check (skipped)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "skipped" ]] || { echo "scope/$check expected status=skipped, got $status"; return 1; } + done +} + +@test "schema: service checks emit valid skipped evidence when no services" { + for check in "${SERVICE_CHECKS_REQUIRE_SERVICES[@]}"; do + reset_output + source "$BATS_TEST_DIRNAME/../service/$check" || true + assert_evidence_schema "service/$check (skipped)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "skipped" ]] || { echo "service/$check expected status=skipped, got $status"; return 1; } + done +} + +@test "schema: networking checks emit valid skipped evidence when no ingresses" { + for check in "${NETWORKING_CHECKS_REQUIRE_INGRESSES[@]}"; do + reset_output + source "$BATS_TEST_DIRNAME/../networking/$check" || true + assert_evidence_schema "networking/$check (skipped)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "skipped" ]] || { echo "networking/$check expected status=skipped, got $status"; return 1; } + done +} + +@test "schema: logs/application_log_evidence emits valid skipped evidence when no snapshot" { + 
reset_output + # PROBLEMATIC_PODS_FILE intentionally not set — the check must degrade gracefully. + unset PROBLEMATIC_PODS_FILE + source "$BATS_TEST_DIRNAME/../logs/application_log_evidence" || true + assert_evidence_schema "logs/application_log_evidence (skipped)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "skipped" ]] || { echo "expected skipped, got $status"; return 1; } +} + +# ============================================================================= +# Schema validation: failed path for "no resources" existence checks +# (these don't use require_*; they emit failed evidence directly) +# ============================================================================= +@test "schema: pod_existence emits valid failed evidence when no pods" { + reset_output + echo '{"items":[]}' > "$PODS_FILE" + source "$BATS_TEST_DIRNAME/../scope/pod_existence" || true + assert_evidence_schema "scope/pod_existence (failed)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "failed" ]] || { echo "expected failed, got $status"; return 1; } + + severity=$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE") + [[ "$severity" == "critical" ]] || { echo "expected critical, got $severity"; return 1; } +} + +@test "schema: service_existence emits valid failed evidence when no services" { + reset_output + echo '{"items":[]}' > "$SERVICES_FILE" + source "$BATS_TEST_DIRNAME/../service/service_existence" || true + assert_evidence_schema "service/service_existence (failed)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "failed" ]] || return 1 +} + +@test "schema: ingress_existence emits valid failed evidence when no ingresses" { + reset_output + echo '{"items":[]}' > "$INGRESSES_FILE" + source "$BATS_TEST_DIRNAME/../networking/ingress_existence" || true + assert_evidence_schema "networking/ingress_existence (failed)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "failed" ]] || return 1 +} + +# 
============================================================================= +# Schema validation: success path for existence checks +# ============================================================================= +@test "schema: existence checks emit valid info evidence when resources exist" { + # pod_existence + reset_output + echo '{"items":[{"metadata":{"name":"p1"}}]}' > "$PODS_FILE" + source "$BATS_TEST_DIRNAME/../scope/pod_existence" || true + assert_evidence_schema "scope/pod_existence (success)" + [[ "$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE")" == "info" ]] || return 1 + + # service_existence + reset_output + echo '{"items":[{"metadata":{"name":"s1"}}]}' > "$SERVICES_FILE" + source "$BATS_TEST_DIRNAME/../service/service_existence" || true + assert_evidence_schema "service/service_existence (success)" + [[ "$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE")" == "info" ]] || return 1 + + # ingress_existence + reset_output + echo '{"items":[{"metadata":{"name":"i1"},"spec":{"rules":[]}}]}' > "$INGRESSES_FILE" + source "$BATS_TEST_DIRNAME/../networking/ingress_existence" || true + assert_evidence_schema "networking/ingress_existence (success)" + [[ "$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE")" == "info" ]] || return 1 +} + +# ============================================================================= +# A few targeted "critical" path checks with realistic failure data +# ============================================================================= +@test "schema: image_pull_status emits valid critical evidence with affected pods" { + reset_output + cat > "$PODS_FILE" << 'EOF' +{ + "items": [{ + "metadata": {"name": "p1"}, + "spec": {"containers":[{"name":"app","image":"foo:bar"}]}, + "status": {"containerStatuses":[{"name":"app","state":{"waiting":{"reason":"ImagePullBackOff","message":"pull failed"}}}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/image_pull_status" || true + assert_evidence_schema "scope/image_pull_status (failed)" 
+ + [[ "$(jq -r '.status' "$SCRIPT_OUTPUT_FILE")" == "failed" ]] || return 1 + [[ "$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE")" == "critical" ]] || return 1 + affected=$(jq -c '.evidence.affected' "$SCRIPT_OUTPUT_FILE") + [[ "$affected" == '["p1"]' ]] || { echo "expected affected=[p1], got $affected"; return 1; } +} + +@test "schema: memory_limits_check emits valid critical evidence on OOMKilled" { + reset_output + cat > "$PODS_FILE" << 'EOF' +{ + "items": [{ + "metadata": {"name": "oom-pod"}, + "spec": {"containers":[{"name":"app","resources":{"limits":{"memory":"128Mi"},"requests":{"memory":"64Mi"}}}]}, + "status": {"containerStatuses":[{"name":"app","lastState":{"terminated":{"reason":"OOMKilled","exitCode":137}}}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/memory_limits_check" || true + assert_evidence_schema "scope/memory_limits_check (failed)" + + [[ "$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE")" == "critical" ]] || return 1 + oom=$(jq -r '.evidence.details.oom_killed[0].memory_limit' "$SCRIPT_OUTPUT_FILE") + [[ "$oom" == "128Mi" ]] || { echo "expected memory_limit=128Mi, got $oom"; return 1; } +} + +@test "schema: resource_availability emits valid critical evidence with insufficient_cpu flag" { + reset_output + cat > "$PODS_FILE" << 'EOF' +{ + "items": [{ + "metadata": {"name": "unsched"}, + "status": {"phase":"Pending","conditions":[{"type":"PodScheduled","status":"False","reason":"Unschedulable","message":"0/3 nodes available: insufficient cpu"}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/resource_availability" || true + assert_evidence_schema "scope/resource_availability (failed)" + + cpu=$(jq -r '.evidence.details.cluster_insufficient_cpu' "$SCRIPT_OUTPUT_FILE") + [[ "$cpu" == "true" ]] || { echo "expected insufficient_cpu=true, got $cpu"; return 1; } +} + +@test "schema: ingress_class_validation emits valid critical evidence on missing class" { + reset_output + cat > "$INGRESSES_FILE" << 'EOF' +{ + 
"items":[{"metadata":{"name":"my-ing"},"spec":{"ingressClassName":"missing-class"}}] +} +EOF + echo '{"items":[]}' > "$INGRESSCLASSES_FILE" + source "$BATS_TEST_DIRNAME/../networking/ingress_class_validation" || true + assert_evidence_schema "networking/ingress_class_validation (failed)" + + affected=$(jq -c '.evidence.affected' "$SCRIPT_OUTPUT_FILE") + [[ "$affected" == '["my-ing"]' ]] || return 1 +} + +# ============================================================================= +# Embedded logs in evidence (Fase C — for AI post-mortem consumption) +# ============================================================================= + +# Helper: prepare a fake POD_LOGS_DIR with current/previous logs for a given pod+container +setup_pod_logs() { + local pod="$1" container="$2" current="$3" previous="$4" + export POD_LOGS_DIR="${POD_LOGS_DIR:-$(mktemp -d)}" + echo "$current" > "$POD_LOGS_DIR/${pod}.${container}.log" + if [[ -n "$previous" ]]; then + echo "$previous" > "$POD_LOGS_DIR/${pod}.${container}.previous.log" + fi +} + +@test "logs: read_log_tail returns [] when POD_LOGS_DIR unset or file missing" { + unset POD_LOGS_DIR + result=$(read_log_tail "any-pod" "any-container" "current") + [[ "$result" == "[]" ]] || { echo "expected [], got $result"; return 1; } + + export POD_LOGS_DIR="$(mktemp -d)" + result=$(read_log_tail "missing-pod" "missing-container" "previous") + [[ "$result" == "[]" ]] || { echo "expected [], got $result"; return 1; } +} + +@test "logs: read_log_tail returns lines as JSON array when log exists" { + export POD_LOGS_DIR="$(mktemp -d)" + printf 'line1\nline2\nline3\n' > "$POD_LOGS_DIR/p.c.log" + + result=$(read_log_tail "p" "c" "current") + count=$(echo "$result" | jq 'length') + [[ "$count" == "3" ]] || { echo "expected 3 lines, got $count"; return 1; } + + first=$(echo "$result" | jq -r '.[0]') + [[ "$first" == "line1" ]] || { echo "expected line1, got $first"; return 1; } +} + +@test "logs: read_log_tail respects EVIDENCE_LOG_TAIL_LINES" { + 
export POD_LOGS_DIR="$(mktemp -d)" + for i in $(seq 1 100); do echo "line $i"; done > "$POD_LOGS_DIR/p.c.log" + + EVIDENCE_LOG_TAIL_LINES=5 result=$(read_log_tail "p" "c" "current") + count=$(echo "$result" | jq 'length') + [[ "$count" == "5" ]] || { echo "expected 5 lines, got $count"; return 1; } + + # Last 5 lines should be 96..100 + last=$(echo "$result" | jq -r '.[-1]') + [[ "$last" == "line 100" ]] || { echo "expected 'line 100', got '$last'"; return 1; } +} + +@test "logs: container_crash_detection embeds previous logs in CrashLoopBackOff fact" { + reset_output + setup_pod_logs "crash-pod" "app" "" "Caused by: NullPointerException at line 42" + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"crash-pod"}, + "status":{"containerStatuses":[{"name":"app","restartCount":5,"state":{"waiting":{"reason":"CrashLoopBackOff"}},"lastState":{"terminated":{"exitCode":1,"reason":"Error"}}}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/container_crash_detection" || true + + prev_logs=$(jq -c '.evidence.details.crash_loop_back_off[0].previous_logs' "$SCRIPT_OUTPUT_FILE") + [[ "$prev_logs" != "[]" && "$prev_logs" != "null" ]] || { echo "expected non-empty previous_logs, got $prev_logs"; return 1; } + + contains=$(echo "$prev_logs" | jq -r '.[] | select(test("NullPointerException"))' | head -1) + [[ -n "$contains" ]] || { echo "expected NullPointerException in logs, got $prev_logs"; return 1; } +} + +@test "logs: memory_limits_check embeds previous logs in OOMKilled fact" { + reset_output + setup_pod_logs "oom-pod" "app" "" "java.lang.OutOfMemoryError: Java heap space" + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"oom-pod"}, + "spec":{"containers":[{"name":"app","resources":{"limits":{"memory":"128Mi"}}}]}, + "status":{"containerStatuses":[{"name":"app","lastState":{"terminated":{"reason":"OOMKilled","exitCode":137}}}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/memory_limits_check" || true + + prev_logs=$(jq -c 
'.evidence.details.oom_killed[0].previous_logs' "$SCRIPT_OUTPUT_FILE") + contains=$(echo "$prev_logs" | jq -r '.[] | select(test("OutOfMemoryError"))' | head -1) + [[ -n "$contains" ]] || { echo "expected OutOfMemoryError in OOM logs, got $prev_logs"; return 1; } +} + +@test "logs: container_port_health embeds current logs in port_not_listening issue" { + # The script needs nc + timeout to exercise the connectivity-fail path. On + # macOS dev hosts timeout (GNU coreutils) is not in PATH by default — skip. + command -v nc >/dev/null 2>&1 && command -v timeout >/dev/null 2>&1 || \ + skip "nc + timeout required to exercise this path" + + reset_output + setup_pod_logs "broken-pod" "app" "ERROR: failed to bind to 0.0.0.0:8080: permission denied" "" + # Pick a port that's almost certainly not listening on 127.0.0.1 + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"broken-pod"}, + "spec":{"containers":[{"name":"app","ports":[{"containerPort":59999}]}]}, + "status":{"phase":"Running","podIP":"127.0.0.1","containerStatuses":[{"name":"app","ready":true,"state":{"running":{}},"restartCount":0}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/container_port_health" || true + + issue_logs=$(jq -c '.evidence.details.issues[0].container_logs // empty' "$SCRIPT_OUTPUT_FILE") + [[ -n "$issue_logs" && "$issue_logs" != "null" ]] || { echo "expected container_logs in issue"; cat "$SCRIPT_OUTPUT_FILE"; return 1; } + + contains=$(echo "$issue_logs" | jq -r '.[] | select(test("failed to bind"))' | head -1) + [[ -n "$contains" ]] || { echo "expected 'failed to bind' in logs, got $issue_logs"; return 1; } +} + +@test "logs: pod_readiness embeds current logs only for stuck (not_ready) pods" { + # not_ready (failure path) → should embed logs + reset_output + setup_pod_logs "stuck-pod" "app" "INFO: connecting to db... 
still trying" "" + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"stuck-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Running","conditions":[{"type":"Ready","status":"False","reason":"ContainersNotReady"}],"containerStatuses":[{"name":"app","ready":false,"state":{"running":{}},"restartCount":0}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/pod_readiness" || true + + pod_state=$(jq -r '.evidence.details.pods[0].state' "$SCRIPT_OUTPUT_FILE") + [[ "$pod_state" == "not_ready" ]] || { echo "expected state=not_ready, got $pod_state"; return 1; } + + pod_logs=$(jq -c '.evidence.details.pods[0].container_logs' "$SCRIPT_OUTPUT_FILE") + contains=$(echo "$pod_logs" | jq -r '.[].current_logs[] | select(test("connecting to db"))' | head -1) + [[ -n "$contains" ]] || { echo "expected 'connecting to db' in logs, got $pod_logs"; return 1; } + + # starting (warning path) → should NOT embed logs (avoid noise during normal startup) + reset_output + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"starting-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Pending","conditions":[{"type":"Ready","status":"False"}],"containerStatuses":[{"name":"app","ready":false,"state":{"waiting":{"reason":"ContainerCreating"}}}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/pod_readiness" || true + + starting_logs=$(jq -c '.evidence.details.pods[0].container_logs' "$SCRIPT_OUTPUT_FILE") + [[ "$starting_logs" == "[]" ]] || { echo "expected empty container_logs for starting pod, got $starting_logs"; return 1; } +} + +@test "logs: success path does not embed logs (keeps payload light)" { + reset_output + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"happy-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Running","conditions":[{"type":"Ready","status":"True"}],"containerStatuses":[{"name":"app","ready":true,"state":{"running":{}},"restartCount":0}]} + }] +} +EOF + source 
"$BATS_TEST_DIRNAME/../scope/container_crash_detection" || true + + details=$(jq -c '.evidence.details' "$SCRIPT_OUTPUT_FILE") + # Success details should not have crash_loop_back_off populated with logs + has_logs=$(echo "$details" | jq -r '.. | objects | select(has("previous_logs") or has("current_logs")) | "yes"' | head -1) + [[ -z "$has_logs" ]] || { echo "expected no logs in success evidence, got: $details"; return 1; } +} diff --git a/k8s/diagnose/tests/logs/application_log_evidence.bats b/k8s/diagnose/tests/logs/application_log_evidence.bats new file mode 100644 index 00000000..6daaff8c --- /dev/null +++ b/k8s/diagnose/tests/logs/application_log_evidence.bats @@ -0,0 +1,242 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for diagnose/logs/application_log_evidence +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + source "$BATS_TEST_DIRNAME/../../utils/diagnose_utils" + + export NAMESPACE="test-ns" + export LABEL_SELECTOR="app=test" + export NP_OUTPUT_DIR="$(mktemp -d)" + export SCRIPT_OUTPUT_FILE="$(mktemp)" + export SCRIPT_LOG_FILE="$(mktemp)" + echo '{"status":"pending","evidence":{},"logs":[]}' > "$SCRIPT_OUTPUT_FILE" + + export PODS_FILE="$(mktemp)" + export DATA_DIR="$(mktemp -d)" + export POD_LOGS_DIR="$DATA_DIR/pod_logs" + export PROBLEMATIC_PODS_FILE="$DATA_DIR/problematic_pods.txt" + mkdir -p "$POD_LOGS_DIR" + export EVIDENCE_LOG_TAIL_LINES=50 +} + +teardown() { + rm -rf "$NP_OUTPUT_DIR" "$DATA_DIR" + rm -f "$SCRIPT_OUTPUT_FILE" "$SCRIPT_LOG_FILE" "$PODS_FILE" +} + +evidence() { + jq -r "$1" "$SCRIPT_OUTPUT_FILE" +} + +# ============================================================================= +# Snapshot-unavailable path +# ============================================================================= +@test "logs/application_log_evidence: skipped when PROBLEMATIC_PODS_FILE missing" { + rm -f "$PROBLEMATIC_PODS_FILE" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + [ "$(evidence '.status')" = "skipped" ] + [ "$(evidence '.evidence.severity')" = "info" ] + [ "$(evidence '.evidence.details.pods_with_logs')" = "0" ] +} + +# ============================================================================= +# No problematic pods +# ============================================================================= +@test "logs/application_log_evidence: success with zero counters when no problematic pods" { + : > "$PROBLEMATIC_PODS_FILE" + echo '{"items":[]}' > "$PODS_FILE" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + [ "$(evidence '.status')" = "success" ] + [ "$(evidence 
'.evidence.details.pods_with_logs')" = "0" ] + [ "$(evidence '.evidence.details.problematic_pod_count')" = "0" ] + assert_contains "$(evidence '.evidence.summary')" "No problematic pods" +} + +# ============================================================================= +# Focuses on the application container only — sidecars are not echoed +# ============================================================================= +@test "logs/application_log_evidence: echoes only application logs (ignores sidecars)" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{ + "metadata":{"name":"pod-1"}, + "spec":{"containers":[{"name":"http"},{"name":"application"}]} +}]} +EOF + echo "nginx sidecar noise" > "$POD_LOGS_DIR/pod-1.http.log" + printf 'starting...\nERROR: missing DATABASE_URL\n' > "$POD_LOGS_DIR/pod-1.application.log" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + # Header + lines prefixed with "| " appear in stdout (captured by UI logs[]) + assert_contains "$output" "application log tail from pod-1" + assert_contains "$output" "| starting..." 
+ assert_contains "$output" "| ERROR: missing DATABASE_URL" + # Sidecar must NOT leak + if [[ "$output" == *"nginx sidecar noise"* ]]; then + echo "Sidecar log leaked into stdout" + return 1 + fi + # Affected lists the pod, counters reflect success + [ "$(evidence '.evidence.affected[0]')" = "pod-1" ] + [ "$(evidence '.evidence.details.pods_with_logs')" = "1" ] + [ "$(evidence '.evidence.details.problematic_pod_count')" = "1" ] +} + +# ============================================================================= +# Evidence has NO log text — only counters +# ============================================================================= +@test "logs/application_log_evidence: evidence.details exposes only counters, never log text" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{"metadata":{"name":"pod-1"},"spec":{"containers":[{"name":"application"}]}}]} +EOF + echo "secret log line that must not appear in evidence" > "$POD_LOGS_DIR/pod-1.application.log" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + # details has only the two counters, no pods array, no logs field + local keys + keys=$(jq -r '.evidence.details | keys | sort | join(",")' "$SCRIPT_OUTPUT_FILE") + [ "$keys" = "pods_with_logs,problematic_pod_count" ] + # The log text must not appear anywhere in the evidence object + if [[ "$(jq -c '.evidence' "$SCRIPT_OUTPUT_FILE")" == *"secret log line"* ]]; then + echo "Log text leaked into evidence" + return 1 + fi + # But it MUST appear in stdout + assert_contains "$output" "| secret log line" +} + +# ============================================================================= +# Previous + current are merged into a single chronological stream on stdout +# ============================================================================= +@test "logs/application_log_evidence: stdout shows previous logs first, then 
current" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{"metadata":{"name":"pod-1"},"spec":{"containers":[{"name":"application"}]}}]} +EOF + echo "current run" > "$POD_LOGS_DIR/pod-1.application.log" + echo "previous crash output" > "$POD_LOGS_DIR/pod-1.application.previous.log" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + # Both lines must appear in stdout + assert_contains "$output" "| previous crash output" + assert_contains "$output" "| current run" + # And previous must come before current + local prev_line current_line + prev_line=$(printf '%s\n' "$output" | grep -n "previous crash output" | head -1 | cut -d: -f1) + current_line=$(printf '%s\n' "$output" | grep -n "current run" | head -1 | cut -d: -f1) + [ "$prev_line" -lt "$current_line" ] || { echo "Expected previous to print before current"; return 1; } +} + +# ============================================================================= +# Caps logs to last 50 lines (EVIDENCE_LOG_TAIL_LINES default) +# ============================================================================= +@test "logs/application_log_evidence: caps echoed logs to the last 50 lines" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{"metadata":{"name":"pod-1"},"spec":{"containers":[{"name":"application"}]}}]} +EOF + for i in $(seq 1 30); do echo "prev-$i" >> "$POD_LOGS_DIR/pod-1.application.previous.log"; done + for i in $(seq 1 30); do echo "curr-$i" >> "$POD_LOGS_DIR/pod-1.application.log"; done + + export EVIDENCE_LOG_TAIL_LINES=50 + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + # 60 input lines, capped at 50 → the first 10 previous lines must drop off + if [[ "$output" == *"| prev-1"$'\n'* || "$output" == *"| prev-1 "* ]]; 
then + : # 'prev-1' is a prefix of 'prev-10', need stricter match + fi + # Stricter check: 'prev-10' should not appear because only prev-11..30 + curr-1..30 fit + if printf '%s\n' "$output" | grep -qE '\| prev-10$'; then + echo "Expected prev-10 to be dropped (out of tail-50 window)" + return 1 + fi + # But prev-11 should be there (first survivor) + printf '%s\n' "$output" | grep -qE '\| prev-11$' || { echo "Expected prev-11 to survive"; return 1; } + # And the latest current line is the last visible + printf '%s\n' "$output" | grep -qE '\| curr-30$' || { echo "Expected curr-30 to survive"; return 1; } +} + +# ============================================================================= +# Pod without application container is skipped +# ============================================================================= +@test "logs/application_log_evidence: skips pods that have no application container" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{"metadata":{"name":"pod-1"},"spec":{"containers":[{"name":"sidecar-only"}]}}]} +EOF + echo "irrelevant" > "$POD_LOGS_DIR/pod-1.sidecar-only.log" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + [ "$(evidence '.evidence.details.pods_with_logs')" = "0" ] + [ "$(evidence '.evidence.details.problematic_pod_count')" = "1" ] + assert_contains "$(evidence '.evidence.summary')" "No application logs available" +} + +# ============================================================================= +# Pod has application container but it produced no logs +# ============================================================================= +@test "logs/application_log_evidence: drops pod whose application container has no logs" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{"metadata":{"name":"pod-1"},"spec":{"containers":[{"name":"application"}]}}]} +EOF 
+ # No log files + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + [ "$(evidence '.evidence.details.pods_with_logs')" = "0" ] + assert_contains "$(evidence '.evidence.summary')" "image may never have started" +} + +# ============================================================================= +# Multiple pods aggregated +# ============================================================================= +@test "logs/application_log_evidence: aggregates affected across multiple pods" { + printf 'pod-a\npod-b\npod-c\n' > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{ + "items":[ + {"metadata":{"name":"pod-a"},"spec":{"containers":[{"name":"application"}]}}, + {"metadata":{"name":"pod-b"},"spec":{"containers":[{"name":"application"}]}}, + {"metadata":{"name":"pod-c"},"spec":{"containers":[{"name":"application"}]}} + ] +} +EOF + echo "log of A" > "$POD_LOGS_DIR/pod-a.application.log" + echo "log of C" > "$POD_LOGS_DIR/pod-c.application.log" + # pod-b has no log file + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + [ "$(evidence '.evidence.details.pods_with_logs')" = "2" ] + [ "$(evidence '.evidence.details.problematic_pod_count')" = "3" ] + local affected + affected=$(evidence '.evidence.affected | sort | join(",")') + [ "$affected" = "pod-a,pod-c" ] + # Both visible in stdout, pod-b absent + assert_contains "$output" "| log of A" + assert_contains "$output" "| log of C" +} diff --git a/k8s/diagnose/tests/scope/container_crash_detection.bats b/k8s/diagnose/tests/scope/container_crash_detection.bats index c0a17c44..2ee31844 100644 --- a/k8s/diagnose/tests/scope/container_crash_detection.bats +++ b/k8s/diagnose/tests/scope/container_crash_detection.bats @@ -268,3 +268,103 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") 
assert_equal "$result" "failed" } + +# ============================================================================= +# Evidence Schema Tests +# ============================================================================= +@test "scope/container_crash_detection: success evidence follows schema" { + cat > "$PODS_FILE" << 'EOF' +{ + "items": [{ + "metadata": {"name": "healthy-pod"}, + "status": {"containerStatuses": [{"name": "app", "ready": true, "restartCount": 0, "state": {"running": {}}}]} + }] +} +EOF + + source "$BATS_TEST_DIRNAME/../../scope/container_crash_detection" + + severity=$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE") + assert_equal "$severity" "info" + + summary=$(jq -r '.evidence.summary' "$SCRIPT_OUTPUT_FILE") + assert_contains "$summary" "running without crashes" + + affected=$(jq -c '.evidence.affected' "$SCRIPT_OUTPUT_FILE") + assert_equal "$affected" "[]" + + pods_checked=$(jq -r '.evidence.details.pods_checked' "$SCRIPT_OUTPUT_FILE") + assert_equal "$pods_checked" "1" +} + +@test "scope/container_crash_detection: failed evidence includes affected pods and crash details" { + cat > "$PODS_FILE" << 'EOF' +{ + "items": [ + { + "metadata": {"name": "crash-1"}, + "status": {"containerStatuses": [{"name": "app", "restartCount": 5, "state": {"waiting": {"reason": "CrashLoopBackOff"}}, "lastState": {"terminated": {"exitCode": 137, "reason": "OOMKilled"}}}]} + }, + { + "metadata": {"name": "healthy"}, + "status": {"containerStatuses": [{"name": "app", "ready": true, "restartCount": 0, "state": {"running": {}}}]} + } + ] +} +EOF + + source "$BATS_TEST_DIRNAME/../../scope/container_crash_detection" + + severity=$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE") + assert_equal "$severity" "critical" + + affected=$(jq -c '.evidence.affected' "$SCRIPT_OUTPUT_FILE") + assert_contains "$affected" "crash-1" + + oom_count=$(jq -r '.evidence.details.counts.oom_killed' "$SCRIPT_OUTPUT_FILE") + assert_equal "$oom_count" "1" + + crash_pod=$(jq -r 
'.evidence.details.crash_loop_back_off[0].pod' "$SCRIPT_OUTPUT_FILE") + assert_equal "$crash_pod" "crash-1" + + exit_code=$(jq -r '.evidence.details.crash_loop_back_off[0].exit_code' "$SCRIPT_OUTPUT_FILE") + assert_equal "$exit_code" "137" + + exit_meaning=$(jq -r '.evidence.details.crash_loop_back_off[0].exit_code_meaning' "$SCRIPT_OUTPUT_FILE") + assert_contains "$exit_meaning" "OOMKilled" + + # Suggested actions should not be empty + actions_count=$(jq -r '.evidence.suggested_actions | length' "$SCRIPT_OUTPUT_FILE") + [ "$actions_count" -gt 0 ] +} + +@test "scope/container_crash_detection: summary highlights OOM count when present" { + cat > "$PODS_FILE" << 'EOF' +{ + "items": [{ + "metadata": {"name": "oom-pod"}, + "status": {"containerStatuses": [{"name": "app", "restartCount": 1, "state": {"waiting": {"reason": "CrashLoopBackOff"}}, "lastState": {"terminated": {"exitCode": 137, "reason": "OOMKilled"}}}]} + }] +} +EOF + + source "$BATS_TEST_DIRNAME/../../scope/container_crash_detection" + + summary=$(jq -r '.evidence.summary' "$SCRIPT_OUTPUT_FILE") + assert_contains "$summary" "OOMKilled" +} + +@test "scope/container_crash_detection: skipped evidence follows schema with info severity" { + echo '{"items":[]}' > "$PODS_FILE" + + source "$BATS_TEST_DIRNAME/../../scope/container_crash_detection" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + assert_equal "$status" "skipped" + + severity=$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE") + assert_equal "$severity" "info" + + summary=$(jq -r '.evidence.summary' "$SCRIPT_OUTPUT_FILE") + assert_contains "$summary" "skipped" +} diff --git a/k8s/diagnose/tests/scope/container_port_health.bats b/k8s/diagnose/tests/scope/container_port_health.bats index fe60c920..b6605ff2 100644 --- a/k8s/diagnose/tests/scope/container_port_health.bats +++ b/k8s/diagnose/tests/scope/container_port_health.bats @@ -187,7 +187,7 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") assert_equal "$result" "failed" - tested=$(jq -r 
'.evidence.tested' "$SCRIPT_OUTPUT_FILE") + tested=$(jq -r '.evidence.details.containers_tested' "$SCRIPT_OUTPUT_FILE") assert_equal "$tested" "1" } @@ -429,7 +429,7 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") assert_equal "$result" "skipped" - skipped=$(jq -r '.evidence.skipped' "$SCRIPT_OUTPUT_FILE") + skipped=$(jq -r '.evidence.details.containers_skipped' "$SCRIPT_OUTPUT_FILE") assert_equal "$skipped" "1" } @@ -498,7 +498,7 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") assert_equal "$result" "success" - tested=$(jq -r '.evidence.tested' "$SCRIPT_OUTPUT_FILE") + tested=$(jq -r '.evidence.details.containers_tested' "$SCRIPT_OUTPUT_FILE") assert_equal "$tested" "1" unset -f nc timeout diff --git a/k8s/diagnose/tests/scope/health_probe_endpoints.bats b/k8s/diagnose/tests/scope/health_probe_endpoints.bats index 8a53364b..3621480d 100644 --- a/k8s/diagnose/tests/scope/health_probe_endpoints.bats +++ b/k8s/diagnose/tests/scope/health_probe_endpoints.bats @@ -562,7 +562,7 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") assert_equal "$result" "skipped" - skipped=$(jq -r '.evidence.skipped' "$SCRIPT_OUTPUT_FILE") + skipped=$(jq -r '.evidence.details.containers_skipped' "$SCRIPT_OUTPUT_FILE") assert_equal "$skipped" "1" } @@ -675,7 +675,7 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") assert_equal "$result" "success" - tested=$(jq -r '.evidence.tested' "$SCRIPT_OUTPUT_FILE") + tested=$(jq -r '.evidence.details.containers_tested' "$SCRIPT_OUTPUT_FILE") assert_equal "$tested" "1" } diff --git a/k8s/diagnose/tests/scope/pod_readiness.bats b/k8s/diagnose/tests/scope/pod_readiness.bats index 01625e29..c3f459da 100644 --- a/k8s/diagnose/tests/scope/pod_readiness.bats +++ b/k8s/diagnose/tests/scope/pod_readiness.bats @@ -223,8 +223,8 @@ EOF source "$BATS_TEST_DIRNAME/../../scope/pod_readiness" - ready=$(jq -r '.evidence.ready' "$SCRIPT_OUTPUT_FILE") - total=$(jq -r '.evidence.total' "$SCRIPT_OUTPUT_FILE") + ready=$(jq -r 
'.evidence.details.ready' "$SCRIPT_OUTPUT_FILE") + total=$(jq -r '.evidence.details.total' "$SCRIPT_OUTPUT_FILE") assert_equal "$ready" "1" assert_equal "$total" "1" } diff --git a/k8s/diagnose/utils/diagnose_utils b/k8s/diagnose/utils/diagnose_utils index 836bc67e..94bac3b8 100644 --- a/k8s/diagnose/utils/diagnose_utils +++ b/k8s/diagnose/utils/diagnose_utils @@ -1,5 +1,27 @@ #!/bin/bash +# ============================================================================= +# Evidence schema (passed to update_check_result --evidence) +# ============================================================================= +# All checks must emit evidence following this schema so the backend / AI summarizer +# can consume results uniformly: +# +# { +# "summary": "string — one-line human-readable summary of findings", +# "severity": "critical" | "warning" | "info", +# "affected": ["resource-name", ...], +# "details": { /* check-specific structured data */ }, +# "suggested_actions": ["actionable guidance items"] +# } +# +# severity mapping: +# - critical: status=failed with actionable data (e.g. pods OOMKilled) +# - warning: status=warning, or partial issues +# - info: status=success or skipped (no action required) +# +# Helper `evidence_json` below builds this schema from primitives. +# ============================================================================= + # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' @@ -7,6 +29,153 @@ YELLOW='\033[1;33m' CYAN='\033[0;36m' NC='\033[0m' # No Color +# Build a JSON evidence object following the schema documented above. +# Usage: +# evidence_json +# Where: +# - summary: plain string +# - severity: "critical" | "warning" | "info" +# - affected_json: JSON array of resource names, e.g. '["pod-1","pod-2"]' +# - details_json: JSON object, check-specific. Pass '{}' for none. +# - actions_json: JSON array of strings. Pass '[]' for none. 
+evidence_json() { + local summary="$1" + local severity="$2" + local affected="$3" + local details="$4" + local actions="$5" + + # Explicit defaults: bash's ${var:-{}} mis-parses the closing brace, so we + # branch instead of using parameter substitution. + [[ -z "$affected" ]] && affected="[]" + [[ -z "$details" ]] && details="{}" + [[ -z "$actions" ]] && actions="[]" + + jq -n \ + --arg summary "$summary" \ + --arg severity "$severity" \ + --argjson affected "$affected" \ + --argjson details "$details" \ + --argjson actions "$actions" \ + '{ + summary: $summary, + severity: $severity, + affected: $affected, + details: $details, + suggested_actions: $actions + }' +} + +# Translate a Linux container exit code into a human-readable meaning. +# Returns "Unknown" for codes we don't classify. +exit_code_meaning() { + case "$1" in + 0) echo "Clean exit (container finished successfully)" ;; + 1) echo "Application error" ;; + 137) echo "OOMKilled (out of memory)" ;; + 139) echo "SIGSEGV (segmentation fault)" ;; + 143) echo "SIGTERM (graceful termination)" ;; + *) echo "Unknown" ;; + esac +} + +# Read the tail of a pre-collected pod log into a JSON array of lines (one +# line per element). Reads from the pod_logs/ snapshot collected by +# build_context — never makes live kubectl calls. +# +# Usage: +# read_log_tail [lines] +# which: "current" or "previous" +# lines: how many lines to take from the tail (default: $EVIDENCE_LOG_TAIL_LINES, fallback 50) +# +# Returns "[]" if: +# - POD_LOGS_DIR is unset (build_context did not run, or this is a unit test) +# - the log file does not exist (most containers do not have a previous log) +# - the file is empty (container produced no output yet) +# +# Why a snapshot rather than live kubectl: build_context took the snapshot +# point-in-time at fail-time (diagnose runs before rollback). By the time the +# AI summarizer reads evidence, the cluster state has likely moved on, so live +# logs would be misleading or missing. 
+read_log_tail() { + local pod="$1" + local container="$2" + local which="$3" + local lines="${4:-${EVIDENCE_LOG_TAIL_LINES:-50}}" + + [[ -z "$POD_LOGS_DIR" ]] && { echo "[]"; return 0; } + + local log_file + case "$which" in + previous) log_file="$POD_LOGS_DIR/${pod}.${container}.previous.log" ;; + current) log_file="$POD_LOGS_DIR/${pod}.${container}.log" ;; + *) echo "[]"; return 0 ;; + esac + + if [[ ! -s "$log_file" ]]; then + echo "[]" + return 0 + fi + + tail -n "$lines" "$log_file" | lines_to_json_array +} + +# Convert newline-delimited stdin into a JSON array of non-empty strings. +# Used by read_log_tail and update_check_result to share one canonical +# tail-text-to-JSON pipeline. +lines_to_json_array() { + jq -R -s 'split("\n") | map(select(length > 0))' +} + +# Append a JSON object to a bash indexed array (passed by name). Avoids the +# O(N²) jq round-trip of `arr=$(echo "$arr" | jq --argjson f "$x" '. + [$f]')`. +# Convert with `facts_to_json_array ` once at end of accumulation. +# Uses eval so it works on bash 3.2 (macOS dev) — declare -n requires 4.3+. +add_fact() { + local arr_name="$1" + local value="$2" + eval "${arr_name}+=(\"\$value\")" +} + +# Convert a bash indexed array of compact JSON strings into a single JSON +# array. Empty arrays correctly become "[]". +facts_to_json_array() { + local arr_name="$1" + local count + eval "count=\${#${arr_name}[@]}" + if [[ "$count" -eq 0 ]]; then + echo "[]" + else + eval "printf '%s\n' \"\${${arr_name}[@]}\"" | jq -s '.' + fi +} + +# Mark a resource as affected by an issue. Stores names in a bash variable +# (passed by name) as a space-separated set, deduplicating on add. Replaces +# the per-call jq dedup that was duplicated in every check. 
+mark_affected() { + local set_name="$1" + local value="$2" + local current + eval "current=\"\${$set_name}\"" + case " $current " in + *" $value "*) ;; + *) eval "$set_name=\"\${current:+\$current }\$value\"" ;; + esac +} + +# Convert a space-separated set (built by mark_affected) into a JSON array. +set_to_json_array() { + local set_name="$1" + local values + eval "values=\"\${$set_name}\"" + if [[ -z "$values" ]]; then + echo "[]" + else + printf '%s\n' $values | jq -R . | jq -s . + fi +} + print_success() { echo -e "${GREEN}✓${NC} $1" } @@ -37,7 +206,15 @@ require_resources() { if [[ -z "$resource_names" ]]; then print_warning "No ${resource_type} found with labels $label_selector in namespace $namespace, check was skipped." - update_check_result --status "skipped" --evidence "{}" + local skip_evidence + skip_evidence=$(evidence_json \ + "No ${resource_type} found, check skipped" \ + "info" \ + "[]" \ + "$(jq -nc --arg rt "$resource_type" --arg ls "$label_selector" --arg ns "$namespace" \ + '{resource_type: $rt, label_selector: $ls, namespace: $ns}')" \ + "[]") + update_check_result --status "skipped" --evidence "$skip_evidence" return 1 fi @@ -66,7 +243,14 @@ update_check_result() { # Usage: # update_check_result "new-status" '{"new":"evidence"}' # or: - # update_check_result --status "new-status" --evidence '{"new":"evidence"}' + # update_check_result --status "new-status" --evidence '{"new":"evidence"}' [--log-tail-lines N] + # + # --log-tail-lines overrides the default 20-line cap on the captured stdout + # tail. Checks that publish application output (e.g. logs/application_log_evidence) + # need a higher cap to fit the log payload alongside their own diagnostic + # prints. 
+ + local log_tail_lines=20 # Argument parsing if [[ "$1" == --* ]]; then @@ -74,6 +258,7 @@ update_check_result() { case $1 in --status) status="$2"; shift 2 ;; --evidence) evidence="$2"; shift 2 ;; + --log-tail-lines) log_tail_lines="$2"; shift 2 ;; *) echo "Unknown parameter: $1" >&2; return 1 ;; esac done @@ -95,11 +280,10 @@ update_check_result() { return 1 fi - # Check if log file exists and read it into an array + # Read script log tail (non-blank lines, capped at $log_tail_lines) into a JSON array local log_array="[]" if [[ -n "$SCRIPT_LOG_FILE" && -f "$SCRIPT_LOG_FILE" ]]; then - # Read log file, remove empty lines, take last 20 lines, and convert to JSON array - log_array=$(grep -v '^[[:space:]]*$' "$SCRIPT_LOG_FILE" | tail -n 20 | jq -R -s 'split("\n") | map(select(length > 0))') + log_array=$(grep -v '^[[:space:]]*$' "$SCRIPT_LOG_FILE" | tail -n "$log_tail_lines" | lines_to_json_array) if [[ $? -ne 0 ]]; then echo "Error: Failed to read log file: $SCRIPT_LOG_FILE" >&2 return 1 diff --git a/k8s/scope/workflows/diagnose.yaml b/k8s/scope/workflows/diagnose.yaml index 66223726..45d837c3 100644 --- a/k8s/scope/workflows/diagnose.yaml +++ b/k8s/scope/workflows/diagnose.yaml @@ -34,4 +34,5 @@ steps: folders: - "$SERVICE_PATH/diagnose/service" - "$SERVICE_PATH/diagnose/scope" - - "$SERVICE_PATH/diagnose/networking" \ No newline at end of file + - "$SERVICE_PATH/diagnose/networking" + - "$SERVICE_PATH/diagnose/logs" \ No newline at end of file From d08ca797efb6bb133fa1b732bb9f25c44aebcede Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 22 May 2026 13:25:07 -0300 Subject: [PATCH 41/56] docs: align ram_memory_limit description with cpu_millicores_limit phrasing --- k8s/specs/service-spec.json.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/specs/service-spec.json.tpl b/k8s/specs/service-spec.json.tpl index 3032936f..f2cd6507 100644 --- a/k8s/specs/service-spec.json.tpl +++ b/k8s/specs/service-spec.json.tpl @@ -387,7 +387,7 @@ 
"minimum":{ "$data":"1/ram_memory" }, - "description":"Maximum memory the container can use (in MB). Setting this higher than the request increases the chance the scheduler kills the pod under pressure." + "description":"Maximum memory the container can use (in MB). Pick 'Same as request' to leave it equal to the request value." }, "visibility":{ "type":"string", From 88f65ee73bfaf61c8bee4ca690dba8daafa29ffd Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 22 May 2026 13:25:08 -0300 Subject: [PATCH 42/56] chore: move design spec and plan to .claude (untracked working notes) --- .../2026-05-21-clien-781-memory-cpu-limits.md | 525 ------------------ ...5-21-clien-781-memory-cpu-limits-design.md | 171 ------ 2 files changed, 696 deletions(-) delete mode 100644 docs/superpowers/plans/2026-05-21-clien-781-memory-cpu-limits.md delete mode 100644 docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md diff --git a/docs/superpowers/plans/2026-05-21-clien-781-memory-cpu-limits.md b/docs/superpowers/plans/2026-05-21-clien-781-memory-cpu-limits.md deleted file mode 100644 index f954d73b..00000000 --- a/docs/superpowers/plans/2026-05-21-clien-781-memory-cpu-limits.md +++ /dev/null @@ -1,525 +0,0 @@ -# CLIEN-781 — Memory & CPU Limits Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add optional `cpu_millicores_limit` and `ram_memory_limit` capabilities to the k8s scope so the Spin client can set Kubernetes `resources.limits` independently from `resources.requests`, with safe back-compat defaults. - -**Architecture:** Add two new optional properties to the k8s scope spec. Normalize them inside `build_context` (limit defaults to request when null/missing) so the deployment template stays trivial. 
Render the normalized values into the application container's `resources.limits` while keeping `resources.requests` bound to the original `cpu_millicores` / `ram_memory` fields. - -**Tech Stack:** JSON Schema (with JSONForms uiSchema), bash + jq for context normalization, gomplate for template rendering, BATS for tests. - -**Spec:** [`docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md`](../specs/2026-05-21-clien-781-memory-cpu-limits-design.md) - ---- - -## File Structure - -**Modified files:** - -- `k8s/specs/service-spec.json.tpl` — add two `properties` and update the `Categorization`/`Category` to rename "Processor" → "Resources" and add two new `Control` entries. -- `k8s/deployment/build_context` — add a `normalize_capability_limits` function that mutates `$CONTEXT` to fill `.scope.capabilities.cpu_millicores_limit` and `.scope.capabilities.ram_memory_limit` with the request value when null/missing. Call it before the final context write. -- `k8s/deployment/templates/deployment.yaml.tpl` — application container only (lines 313–319): keep `requests.cpu/memory` bound to `cpu_millicores` / `ram_memory`, change `limits.cpu/memory` to read `cpu_millicores_limit` / `ram_memory_limit`. Sidecars (lines 148–153, 201–206, 255–260) are NOT touched — they use `container_cpu_in_millicores` / `container_memory_in_memory` from a ConfigMap. - -**New tests:** - -- `k8s/deployment/tests/build_context.bats` — add a section for `normalize_capability_limits` covering the four matrix cells (limit set / limit null, for CPU and RAM) plus the "field absent" case. -- `k8s/deployment/tests/deployment_template_shape.bats` (new file) — grep-based structural assertions that the application container `resources` block uses the right field for request vs limit. Mirrors `tests/ingress_template_shape.bats`. - -**Not modified:** sidecar resource blocks, CLI, docsite, API spec. 
- ---- - -## Task 1: Add `cpu_millicores_limit` and `ram_memory_limit` properties to the JSON schema - -**Files:** -- Modify: `k8s/specs/service-spec.json.tpl` (properties block, lines 485–492 area for CPU; lines 315–358 area for RAM) - -There is no JSON-schema test harness in this repo, so this task has no automated test. The schema is validated implicitly by the deployment workflow and by manual `jq` sanity checks in step 2. - -- [ ] **Step 1: Add the two new properties to `attributes.schema.properties`** - -After the existing `cpu_millicores` property block (end at line 492), add `cpu_millicores_limit`: - -```json -, -"cpu_millicores_limit":{ - "type":["integer","null"], - "title":"CPU Millicores Limit", - "default":null, - "maximum":4000, - "minimum":{ - "$data":"1/cpu_millicores" - }, - "description":"Maximum CPU the container can use (in millicores). Leave empty to use the same value as the request." -} -``` - -After the existing `ram_memory` property block (end at line 358), add `ram_memory_limit`: - -```json -, -"ram_memory_limit":{ - "type":["integer","null"], - "oneOf":[ - {"const":null, "title":"Same as request"}, - {"const":64, "title":"64 MB"}, - {"const":128, "title":"128 MB"}, - {"const":256, "title":"256 MB"}, - {"const":512, "title":"512 MB"}, - {"const":1024, "title":"1 GB"}, - {"const":2048, "title":"2 GB"}, - {"const":4096, "title":"4 GB"}, - {"const":8192, "title":"8 GB"}, - {"const":16384, "title":"16 GB"} - ], - "title":"RAM Memory Limit", - "default":null, - "minimum":{ - "$data":"1/ram_memory" - }, - "description":"Maximum memory the container can use (in MB). Setting this higher than the request increases the chance the scheduler kills the pod under pressure." -} -``` - -Do NOT add either field to the top-level `required` array — both stay optional. - -- [ ] **Step 2: Validate the JSON is still well-formed** - -Run: -```bash -jq empty k8s/specs/service-spec.json.tpl -``` -Expected: no output, exit code 0. 
- -If gomplate is available locally, also confirm the template renders to valid JSON: -```bash -NRN="nrn:test" gomplate -f k8s/specs/service-spec.json.tpl | jq empty -``` -Expected: no output, exit code 0. - -- [ ] **Step 3: Commit** - -```bash -git add k8s/specs/service-spec.json.tpl -git commit -m "feat: add cpu_millicores_limit and ram_memory_limit properties to k8s scope spec" -``` - ---- - -## Task 2: Rename "Processor" → "Resources" and add the limit Controls to the uiSchema - -**Files:** -- Modify: `k8s/specs/service-spec.json.tpl` (uiSchema `Category` block, lines 46–55) - -No automated test — uiSchema is rendered by the frontend. We validate by grep-based assertion in step 2 and visual smoke later. - -- [ ] **Step 1: Rename the Category label and add two Controls** - -Locate the `Category` whose label is `"Processor"` (line 47). Replace the whole block (lines 46–55) with: - -```json -{ - "type":"Category", - "label":"Resources", - "elements":[ - { - "type":"Control", - "label":"CPU Millicores", - "scope":"#/properties/cpu_millicores" - }, - { - "type":"Control", - "label":"CPU Millicores Limit", - "scope":"#/properties/cpu_millicores_limit" - }, - { - "type":"Control", - "label":"RAM Memory Limit", - "scope":"#/properties/ram_memory_limit" - } - ] -} -``` - -- [ ] **Step 2: Sanity-check the uiSchema is well-formed and has the expected shape** - -Run: -```bash -jq -e ' - .attributes.schema.uiSchema - | .. | objects | select(.label? == "Resources") - | .elements | map(.scope) as $scopes - | ($scopes | length) == 3 - and ($scopes | index("#/properties/cpu_millicores") != null) - and ($scopes | index("#/properties/cpu_millicores_limit") != null) - and ($scopes | index("#/properties/ram_memory_limit") != null) -' k8s/specs/service-spec.json.tpl >/dev/null && echo OK -``` -Expected: `OK`. - -Also confirm "Processor" is gone: -```bash -! grep -q '"Processor"' k8s/specs/service-spec.json.tpl && echo OK -``` -Expected: `OK`. 
- -- [ ] **Step 3: Commit** - -```bash -git add k8s/specs/service-spec.json.tpl -git commit -m "feat: rename Processor tab to Resources and surface CPU/RAM limit controls" -``` - ---- - -## Task 3: Add `normalize_capability_limits` to `build_context` (TDD) - -**Files:** -- Modify: `k8s/deployment/build_context` -- Modify: `k8s/deployment/tests/build_context.bats` - -This is the back-compat heart of the change. The function takes `$CONTEXT` (JSON) and fills `.scope.capabilities.cpu_millicores_limit` and `.scope.capabilities.ram_memory_limit` with the corresponding request value when the field is `null` or missing. Existing values pass through unchanged. - -- [ ] **Step 1: Write failing tests in `tests/build_context.bats`** - -Append at the end of `k8s/deployment/tests/build_context.bats`: - -```bash -# ============================================================================= -# normalize_capability_limits Function Tests (CLIEN-781) -# Fills in *_limit with the corresponding request value when null or missing, -# leaves explicit values untouched. 
-# ============================================================================= - -setup_normalize_limits_fn() { - eval "$(sed -n '/^normalize_capability_limits()/,/^}/p' "$PROJECT_ROOT/k8s/deployment/build_context")" -} - -@test "normalize_capability_limits: fills CPU limit from request when limit is absent" { - setup_normalize_limits_fn - local in='{"scope":{"capabilities":{"cpu_millicores":500,"ram_memory":1024,"ram_memory_limit":2048}}}' - local out - out=$(normalize_capability_limits "$in") - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" -} - -@test "normalize_capability_limits: fills RAM limit from request when limit is absent" { - setup_normalize_limits_fn - local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":700,"ram_memory":1024}}}' - local out - out=$(normalize_capability_limits "$in") - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" -} - -@test "normalize_capability_limits: fills both limits when both are absent" { - setup_normalize_limits_fn - local in='{"scope":{"capabilities":{"cpu_millicores":500,"ram_memory":1024}}}' - local out - out=$(normalize_capability_limits "$in") - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" -} - -@test "normalize_capability_limits: fills both limits when both are explicit null" { - setup_normalize_limits_fn - local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":null,"ram_memory":1024,"ram_memory_limit":null}}}' - local out - out=$(normalize_capability_limits "$in") - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" -} - -@test "normalize_capability_limits: preserves explicit non-null limits" { - setup_normalize_limits_fn - local 
in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":2000,"ram_memory":1024,"ram_memory_limit":4096}}}' - local out - out=$(normalize_capability_limits "$in") - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "2000" - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "4096" -} -``` - -- [ ] **Step 2: Run the new tests, confirm they fail** - -Run: -```bash -bats k8s/deployment/tests/build_context.bats -f normalize_capability_limits -``` -Expected: 5 failures, message about `normalize_capability_limits: command not found` (or similar — function does not exist yet). - -- [ ] **Step 3: Implement `normalize_capability_limits` in `build_context`** - -Open `k8s/deployment/build_context`. Above the `validate_status()` function (search for `^validate_status\(\)`), insert: - -```bash -# Fill in *_limit capability fields with the corresponding request value when -# the limit is missing or explicitly null. Idempotent. CLIEN-781. -normalize_capability_limits() { - echo "$1" | jq ' - .scope.capabilities.cpu_millicores_limit = (.scope.capabilities.cpu_millicores_limit // .scope.capabilities.cpu_millicores) - | .scope.capabilities.ram_memory_limit = (.scope.capabilities.ram_memory_limit // .scope.capabilities.ram_memory) - ' -} -``` - -Then wire it into the final context assembly. Find the block ending at line 314 (the big `jq '. + { ... }')` invocation around lines 285–314 that produces the final `$CONTEXT`). Immediately after that block (i.e., right before the `DEPLOYMENT_ID=$(echo "$CONTEXT" | jq -r '.deployment.id')` line at 316), add: - -```bash -CONTEXT=$(normalize_capability_limits "$CONTEXT") -``` - -- [ ] **Step 4: Run the new tests, confirm they pass** - -Run: -```bash -bats k8s/deployment/tests/build_context.bats -f normalize_capability_limits -``` -Expected: 5 tests pass. 
- -- [ ] **Step 5: Run the full build_context test suite to ensure no regressions** - -Run: -```bash -bats k8s/deployment/tests/build_context.bats -``` -Expected: all tests pass (baseline of this file is currently green per the existing CI; we are only adding tests). - -- [ ] **Step 6: Commit** - -```bash -git add k8s/deployment/build_context k8s/deployment/tests/build_context.bats -git commit -m "feat: normalize cpu/ram limit capabilities to request value when unset" -``` - ---- - -## Task 4: Render limits from normalized fields in the application container (TDD via template-shape test) - -**Files:** -- Create: `k8s/deployment/tests/deployment_template_shape.bats` -- Modify: `k8s/deployment/templates/deployment.yaml.tpl` (lines 313–319 only — the application container, NOT the sidecars) - -We assert the template shape with grep (same approach as `ingress_template_shape.bats`). End-to-end rendering through gomplate is exercised by the existing build pipeline; the shape test catches regressions like accidentally rebinding `limits.cpu` back to `cpu_millicores`. - -- [ ] **Step 1: Write the failing template-shape test** - -Create `k8s/deployment/tests/deployment_template_shape.bats`: - -```bash -#!/usr/bin/env bats -# ============================================================================= -# Structural tests for the deployment template. -# Verifies the application container's resources block uses the right -# capability for request vs limit. CLIEN-781. -# ============================================================================= - -setup() { - export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" - source "$PROJECT_ROOT/testing/assertions.sh" - export TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/deployment.yaml.tpl" -} - -# Slice the file from "name: application" to the next container header, -# isolating the application container's block from the sidecars (which keep -# using container_cpu_in_millicores / container_memory_in_memory). 
-app_container_block() { - awk ' - /^[[:space:]]+- name: application[[:space:]]*$/ { in_app=1 } - in_app { print } - /^[[:space:]]+terminationMessagePolicy:/ && in_app { exit } - ' "$TEMPLATE" -} - -@test "deployment template: application container limits.cpu uses cpu_millicores_limit" { - block=$(app_container_block) - echo "$block" | grep -E 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores_limit[[:space:]]*\}\}m' >/dev/null -} - -@test "deployment template: application container limits.memory uses ram_memory_limit" { - block=$(app_container_block) - echo "$block" | grep -E 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory_limit[[:space:]]*\}\}Mi' >/dev/null -} - -@test "deployment template: application container requests.cpu still uses cpu_millicores" { - block=$(app_container_block) - echo "$block" | grep -E 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores[[:space:]]*\}\}m' >/dev/null -} - -@test "deployment template: application container requests.memory still uses ram_memory" { - block=$(app_container_block) - echo "$block" | grep -E 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory[[:space:]]*\}\}Mi' >/dev/null -} - -@test "deployment template: sidecars still use container_cpu_in_millicores / container_memory_in_memory" { - # Sidecars are everything BEFORE the application container block. - before=$(awk '/^[[:space:]]+- name: application[[:space:]]*$/ {exit} {print}' "$TEMPLATE") - echo "$before" | grep -F '{{ .container_cpu_in_millicores }}m' >/dev/null - echo "$before" | grep -F '{{ .container_memory_in_memory }}Mi' >/dev/null - # And sidecars must NOT have been switched to the new fields. - ! echo "$before" | grep -F 'cpu_millicores_limit' >/dev/null - ! 
echo "$before" | grep -F 'ram_memory_limit' >/dev/null -} -``` - -- [ ] **Step 2: Run the new tests, confirm they fail** - -Run: -```bash -bats k8s/deployment/tests/deployment_template_shape.bats -``` -Expected: at least the first two tests fail (limits.cpu / limits.memory still pointing at `cpu_millicores` / `ram_memory` — request fields). - -- [ ] **Step 3: Edit the application container's resource block** - -Open `k8s/deployment/templates/deployment.yaml.tpl`. Locate lines 313–319 (the `- name: application` container's `resources` block). Replace those exact lines with: - -```yaml - resources: - limits: - cpu: {{ .scope.capabilities.cpu_millicores_limit }}m - memory: {{ .scope.capabilities.ram_memory_limit }}Mi - requests: - cpu: {{ .scope.capabilities.cpu_millicores }}m - memory: {{ .scope.capabilities.ram_memory }}Mi -``` - -Do NOT touch the sidecar `resources:` blocks at lines 148–153, 201–206, or 255–260. - -- [ ] **Step 4: Run the template-shape tests, confirm they pass** - -Run: -```bash -bats k8s/deployment/tests/deployment_template_shape.bats -``` -Expected: all 5 tests pass. - -- [ ] **Step 5: Commit** - -```bash -git add k8s/deployment/templates/deployment.yaml.tpl k8s/deployment/tests/deployment_template_shape.bats -git commit -m "feat: render application container limits from normalized capability fields" -``` - ---- - -## Task 5: End-to-end smoke (manual) - -This is a sanity check, not a test — the project has no automated gomplate-render harness for `deployment.yaml.tpl`. Skip if `gomplate` is not installed locally. 
- -- [ ] **Step 1: Render the deployment template with a sample CONTEXT and inspect the output** - -```bash -cat > /tmp/clien781_ctx.json <<'JSON' -{ - "scope": { - "id": "scope-test", - "capabilities": { - "cpu_millicores": 500, - "cpu_millicores_limit": 1000, - "ram_memory": 1024, - "ram_memory_limit": 2048, - "health_check": {"enabled": true, "type": "HTTP", "path": "/health", "initial_delay_seconds": 10}, - "additional_ports": [] - } - }, - "deployment": {"id": "deploy-test"}, - "asset": {"url": "example.com/app:1.0"}, - "container_cpu_in_millicores": "93", - "container_memory_in_memory": "64", - "main_http_port": 8080, - "traffic_image": "example.com/traffic:1.0", - "blue_replicas": "0", - "green_replicas": "1", - "total_replicas": "1", - "blue_deployment_id": "", - "pull_secrets": [], - "pdb_enabled": "false", - "pdb_max_unavailable": "1", - "service_account_name": "default", - "traffic_manager_config_map": "tm-config", - "blue_additional_port_services": {} -} -JSON - -gomplate -c .=/tmp/clien781_ctx.json -f k8s/deployment/templates/deployment.yaml.tpl \ - | grep -A4 'name: application' \ - | grep -A3 'resources:' \ - | sed -n '1,8p' -``` - -Expected output should include: -``` - resources: - limits: - cpu: 1000m - memory: 2048Mi - requests: - cpu: 500m - memory: 1024Mi -``` - -- [ ] **Step 2: Render again with the limit fields omitted (back-compat case)** - -Edit `/tmp/clien781_ctx.json` and remove `cpu_millicores_limit` and `ram_memory_limit`. Then re-run the same `gomplate ... | grep` chain. - -**Wait** — gomplate will error on missing keys. This step illustrates that the back-compat path MUST go through `build_context` (which normalizes), not raw template rendering. The build pipeline always runs `build_context` first, so in production this is fine. The manual smoke here just confirms that the normalized context produces the right output; the "missing keys" path is covered by the BATS tests in Task 3. 
- -- [ ] **Step 3: Clean up** - -```bash -rm /tmp/clien781_ctx.json -``` - ---- - -## Task 6: Run the full k8s test suite and push the branch - -- [ ] **Step 1: Run all k8s BATS tests in batches** (per the project memory rule about avoiding BATS temp-dir collisions) - -Run: -```bash -bats k8s/deployment/tests/build_context.bats -bats k8s/deployment/tests/build_deployment.bats -bats k8s/deployment/tests/deployment_template_shape.bats -bats k8s/deployment/tests/ingress_template_shape.bats -bats k8s/deployment/tests/apply_templates.bats -``` -Expected: all green. - -- [ ] **Step 2: Confirm git status is clean and on the right branch** - -Run: -```bash -git status -git log --oneline beta..HEAD -``` -Expected: clean tree; four feature commits (Tasks 1–4) on top of beta. - -- [ ] **Step 3: Push the branch** - -Run: -```bash -git push -u origin feature/clien-781-memory-cpu-limits -``` - -- [ ] **Step 4: Run the quality-gate skill before opening a PR** - -Per the user's global `CLAUDE.md`, run `quality-gate` after non-trivial coding tasks and before claiming work is done. The skill orchestrates code-review, security audit, and simplification checks. - ---- - -## Out of scope (for follow-up tickets) - -- Docsite documentation for the new capabilities. -- CLI / OpenAPI changes — none required, the capability schema is consumed dynamically. -- Symmetric treatment for other resource dimensions (ephemeral storage, GPUs). -- Sidecar resource overrides — sidecars keep using `container_cpu_in_millicores` / `container_memory_in_memory` from the ConfigMap. - ---- - -## Self-review checklist (done by plan author) - -- [x] **Spec coverage:** every section of the spec (schema, uiSchema, render, back-compat, validation, testing) maps to a task. -- [x] **No placeholders:** every step has concrete code, paths, and expected output. 
-- [x] **Type consistency:** `normalize_capability_limits` is referenced consistently; field names match the schema (`cpu_millicores_limit`, `ram_memory_limit`). -- [x] **Scope:** single coherent change, one branch, one PR. diff --git a/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md b/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md deleted file mode 100644 index 24741172..00000000 --- a/docs/superpowers/specs/2026-05-21-clien-781-memory-cpu-limits-design.md +++ /dev/null @@ -1,171 +0,0 @@ -# CLIEN-781 — Configurable CPU & RAM limits for k8s scope - -Status: design approved (2026-05-21) -Ticket: https://nullplatform.atlassian.net/browse/CLIEN-781 -Client: Spin -Assignee: Federico Maleh - -## Context - -Today the k8s scope exposes two capabilities — `ram_memory` and `cpu_millicores` — that are used as **both** the Kubernetes request and the Kubernetes limit. The Spin team needs to decouple them so that limits can be set higher than requests when desired, while keeping the default behavior unchanged for existing scopes. - -The risk that drives the UI shape: a memory `limit > request` increases the chance the scheduler/OOMKiller kills a pod under pressure. So memory limit is a sharp tool that should be hidden behind an "advanced" surface, not the main form. - -## Goals - -1. Add `cpu_millicores_limit` and `ram_memory_limit` as optional capabilities. -2. Keep the main form intact — `ram_memory` (request) stays at the top, untouched. -3. Group the new fields with the existing `cpu_millicores` in a renamed `Resources` tab inside the collapsable "ADVANCED" categorization. -4. Validate `limit >= request` at the JSON schema layer. -5. Be backwards compatible: missing or null limit ⇒ fall back to the request value, matching today's render. - -## Non-goals - -- No change to `ram_memory` or `cpu_millicores` themselves (same field types, same defaults). -- No cross-scope validation. 
-- No docsite update in this ticket (separate PR if requested). -- No CLI/API change beyond what naturally happens by adding properties to the scope spec. - -## UI design - -### Form layout (after the change) - -``` -Main form -├─ RAM Memory (request, dropdown — unchanged) -└─ Visibility - -▼ ADVANCED -├─ Resources ← renamed from "Processor" -│ ├─ CPU Millicores (request — existing) -│ ├─ CPU Millicores Limit ← NEW (optional integer) -│ └─ RAM Memory Limit ← NEW (dropdown with "Same as request") -├─ Size & Scaling -├─ Exposed Ports -├─ Scheduled Stop -├─ Protocol -├─ Continuous deployment -└─ Health Check -``` - -Asymmetry between RAM and CPU is intentional: RAM request stays in the main form (everyone tunes it), RAM limit lives in `Resources` (sharp tool). CPU request and CPU limit both live in `Resources` (CPU was already advanced). - -### Tab rename rationale - -`Resources` follows Kubernetes vocabulary (`resources: requests/limits`) and is generic enough to host both CPU and memory tuning. Alternatives considered (`Compute`, `Compute & Limits`) were rejected as less standard. - -## Schema changes — `k8s/specs/service-spec.json.tpl` - -### New properties (siblings of the existing ones) - -```json -"cpu_millicores_limit": { - "type": ["integer", "null"], - "title": "CPU Millicores Limit", - "default": null, - "oneOf": [ - { "const": null, "title": "Same as request" }, - { "const": 100, "title": "100 m" }, - { "const": 250, "title": "250 m" }, - { "const": 500, "title": "500 m" }, - { "const": 1000, "title": "1000 m" }, - { "const": 2000, "title": "2000 m" }, - { "const": 4000, "title": "4000 m" } - ], - "minimum": { "$data": "1/cpu_millicores" }, - "description": "Maximum CPU the container can use (in millicores). Pick 'Same as request' to leave it equal to the request value." 
-}, -"ram_memory_limit": { - "type": ["integer", "null"], - "title": "RAM Memory Limit", - "default": null, - "oneOf": [ - { "const": null, "title": "Same as request" }, - { "const": 64, "title": "64 MB" }, - { "const": 128, "title": "128 MB" }, - { "const": 256, "title": "256 MB" }, - { "const": 512, "title": "512 MB" }, - { "const": 1024, "title": "1 GB" }, - { "const": 2048, "title": "2 GB" }, - { "const": 4096, "title": "4 GB" }, - { "const": 8192, "title": "8 GB" }, - { "const": 16384, "title": "16 GB" } - ], - "minimum": { "$data": "1/ram_memory" }, - "description": "Maximum memory the container can use. Setting this higher than the request increases OOMKill risk." -} -``` - -Both properties are added to the `required` array of `attributes.schema`. This is the nullplatform UI's contract: the frontend only renders properties that appear in `required` (established during CLIEN-739). Defaults of `null` keep this non-breaking — existing scopes materialize the default, and `normalize_capability_limits` collapses `null` back to the request value before the deployment template renders. - -### uiSchema changes - -Two edits in the existing `Categorization` block: - -1. Change `"label": "Processor"` → `"label": "Resources"`. -2. Add two `Control` entries inside that category's `elements`: - -```json -{ - "type": "Category", - "label": "Resources", - "elements": [ - { "type": "Control", "label": "CPU Millicores", "scope": "#/properties/cpu_millicores" }, - { "type": "Control", "label": "CPU Millicores Limit", "scope": "#/properties/cpu_millicores_limit" }, - { "type": "Control", "label": "RAM Memory Limit", "scope": "#/properties/ram_memory_limit" } - ] -} -``` - -No SHOW/HIDE rules are needed — the "Same as request" option (RAM) and empty value (CPU) act as the no-op state. - -## Validation - -`minimum` with `$data` references the sibling request field. 
JSON Schema only applies `minimum` to numeric instances, so `null` (or missing) values skip the check naturally — no `if/then` block required. - -The pattern matches the precedent already in this spec: -`health_check.period_seconds.exclusiveMinimum.$data = "1/timeout_seconds"`. - -## Render logic in the deployment template - -The k8s deployment manifest (currently rendering both request and limit from the same capability) must use the new fields with a jq `// fallback`: - -```bash -CPU_REQ=$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores') -CPU_LIM=$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit // .scope.capabilities.cpu_millicores') - -RAM_REQ=$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory') -RAM_LIM=$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit // .scope.capabilities.ram_memory') -``` - -`// .scope.capabilities.cpu_millicores` evaluates to the request value when the limit is `null` or missing, giving the exact retrocompat the ticket asks for. - -The implementation plan will locate the exact file(s) under `k8s/deployment/` that render `resources:` and apply this change. - -## Backwards compatibility - -| Scenario | Behavior | -|---|---| -| Existing scope, no new properties in DB | jq fallback ⇒ limit = request ⇒ identical manifest to today | -| New scope, user does not touch limits | Defaults are `null` ⇒ same as above | -| New scope, user picks a higher limit | Manifest renders the explicit limit; schema validates `limit ≥ request` | -| User tries `limit < request` | JSON schema rejects via `$data` minimum before the workflow runs | - -No data migration needed. - -## Testing plan (high-level) - -- **BATS unit tests** for the deployment script: cover the four matrix cells (limit set / limit null, for both CPU and RAM), asserting the rendered `resources:` block. 
-- **JSON schema validation tests** (if a test harness exists for the spec): assert that `limit < request` is rejected and `limit >= request` is accepted, including the `null` case. -- **Manual smoke** in a dev environment after the implementation lands. - -The testing detail belongs to the implementation plan (writing-plans), not this design doc. - -## Open questions - -- Exact deployment template file location and templating engine (gomplate vs helm vs raw bash + jq) — to be confirmed at implementation time. The render logic above is engine-agnostic in spirit but the syntax will be adapted. - -## Out of scope / follow-ups - -- Docsite documentation (under `~/nullplatform/apps/docsite/`) — separate ticket if Spin needs it user-facing. -- Symmetric treatment for other resource dimensions (ephemeral storage, GPUs) — not requested. From 26aae22fafd544a9044c14b8f877a074f7398bde Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 22 May 2026 13:42:41 -0300 Subject: [PATCH 43/56] docs: add changelog entry for configurable CPU and memory limits --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14165264..d8d93cdf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Public and private scopes now register DNS records in their correct Route53 hosted zone when using `DNS_TYPE=external_dns`, preventing cross-zone record leakage - Add configurable main HTTP port for k8s scopes (default 8080) and HTTP support for additional ports - Improve **wait deployment active** failure logging: consolidate repeated `Unhealthy` probe events per pod into a single human-readable line, emit a progress heartbeat every 10% of timeout, and surface a targeted suggested fix based on the probe failure mode (port not open / HTTP non-2xx / probe timeout) +- Add configurable memory and CPU limits, independent from requests, for k8s scope containers ## 
[1.11.0] - 2026-04-16 - Add unit testing support From 81726e1ca027e3f041b9e55818ae4c53b88d4b76 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 22 May 2026 16:24:31 -0300 Subject: [PATCH 44/56] refactor: drop ticket id and noise from normalize_capability_limits comment --- k8s/deployment/build_context | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/k8s/deployment/build_context b/k8s/deployment/build_context index c89c7928..aedd2b6f 100755 --- a/k8s/deployment/build_context +++ b/k8s/deployment/build_context @@ -24,7 +24,9 @@ MIN_REPLICAS=$(echo "$MIN_REPLICAS" | awk '{printf "%d", ($1 == int($1) ? $1 : i DEPLOYMENT_STATUS=$(echo "$CONTEXT" | jq -r ".deployment.status") # Fill in *_limit capability fields with the corresponding request value when -# the limit is missing or explicitly null. Idempotent. CLIEN-781. +# the limit is missing or explicitly null, so downstream templates can render +# resources.limits unconditionally and stay back-compat for scopes that never +# had these fields. 
normalize_capability_limits() { echo "$1" | jq ' .scope.capabilities.cpu_millicores_limit //= .scope.capabilities.cpu_millicores From 0a3ab0fff6353f199c984eaf0457b0f047976518 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 22 May 2026 16:24:31 -0300 Subject: [PATCH 45/56] test: exercise normalize via full build_context instead of private function --- k8s/deployment/tests/build_context.bats | 85 ++++++++++++++----------- 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/k8s/deployment/tests/build_context.bats b/k8s/deployment/tests/build_context.bats index 0f035d6c..ef54509a 100644 --- a/k8s/deployment/tests/build_context.bats +++ b/k8s/deployment/tests/build_context.bats @@ -948,54 +948,63 @@ set_additional_ports() { } # ============================================================================= -# normalize_capability_limits Function Tests (CLIEN-781) -# Fills in *_limit with the corresponding request value when null or missing, -# leaves explicit values untouched. +# Capability limits normalization +# These tests source the real deployment/build_context and assert on the +# resulting CONTEXT, exercising the full pipeline. Limits default to their +# corresponding request value when missing or explicitly null; explicit values +# pass through. # ============================================================================= -setup_normalize_limits_fn() { - eval "$(sed -n '/^normalize_capability_limits()/,/^}/p' "$PROJECT_ROOT/k8s/deployment/build_context")" +# Patches CONTEXT.scope.capabilities with the given JSON fragment (merged into +# the existing capabilities object). 
+set_capabilities() { + CONTEXT=$(echo "$CONTEXT" | jq --argjson v "$1" '.scope.capabilities = (.scope.capabilities + $v)') } -@test "normalize_capability_limits: fills CPU limit from request when limit is absent" { - setup_normalize_limits_fn - local in='{"scope":{"capabilities":{"cpu_millicores":500,"ram_memory":1024,"ram_memory_limit":2048}}}' - local out - out=$(normalize_capability_limits "$in") - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" +@test "capability limits: cpu limit defaults to cpu_millicores when absent" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"ram_memory":1024,"ram_memory_limit":2048}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" } -@test "normalize_capability_limits: fills RAM limit from request when limit is absent" { - setup_normalize_limits_fn - local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":1000,"ram_memory":1024}}}' - local out - out=$(normalize_capability_limits "$in") - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +@test "capability limits: ram limit defaults to ram_memory when absent" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":1000,"ram_memory":1024}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" } -@test "normalize_capability_limits: fills both limits when both are absent" { - setup_normalize_limits_fn - local in='{"scope":{"capabilities":{"cpu_millicores":500,"ram_memory":1024}}}' - local out - out=$(normalize_capability_limits "$in") - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +@test "capability limits: both limits default to their requests when both absent" { + 
setup_full_build_context + set_capabilities '{"cpu_millicores":500,"ram_memory":1024}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" } -@test "normalize_capability_limits: fills both limits when both are explicit null" { - setup_normalize_limits_fn - local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":null,"ram_memory":1024,"ram_memory_limit":null}}}' - local out - out=$(normalize_capability_limits "$in") - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +@test "capability limits: explicit null limits fall back to their requests" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":null,"ram_memory":1024,"ram_memory_limit":null}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" } -@test "normalize_capability_limits: preserves explicit non-null limits" { - setup_normalize_limits_fn - local in='{"scope":{"capabilities":{"cpu_millicores":500,"cpu_millicores_limit":2000,"ram_memory":1024,"ram_memory_limit":4096}}}' - local out - out=$(normalize_capability_limits "$in") - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.cpu_millicores_limit')" "2000" - assert_equal "$(echo "$out" | jq -r '.scope.capabilities.ram_memory_limit')" "4096" +@test "capability limits: explicit non-null limits pass through unchanged" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":2000,"ram_memory":1024,"ram_memory_limit":4096}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" 
"2000" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "4096" } From 811b607834bd7de1296e3fafb9a3731e705c8ef6 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 22 May 2026 16:27:01 -0300 Subject: [PATCH 46/56] test: remove deployment template shape tests in favor of integration coverage --- .../tests/deployment_template_shape.bats | 54 ------------------- 1 file changed, 54 deletions(-) delete mode 100644 k8s/deployment/tests/deployment_template_shape.bats diff --git a/k8s/deployment/tests/deployment_template_shape.bats b/k8s/deployment/tests/deployment_template_shape.bats deleted file mode 100644 index 6f44ffba..00000000 --- a/k8s/deployment/tests/deployment_template_shape.bats +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bats -# ============================================================================= -# Structural tests for the deployment template. -# Verifies the application container's resources block uses the right -# capability for request vs limit. CLIEN-781. -# ============================================================================= - -setup() { - export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" - source "$PROJECT_ROOT/testing/assertions.sh" - export TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/deployment.yaml.tpl" -} - -# Slice the file from "name: application" up to the application container's -# terminationMessagePolicy, isolating it from the sidecars (which keep using -# container_cpu_in_millicores / container_memory_in_memory). -app_container_block() { - awk ' - /^[[:space:]]+- name: application[[:space:]]*$/ { in_app=1 } - in_app { print } - /^[[:space:]]+terminationMessagePolicy:/ && in_app { exit } - ' "$TEMPLATE" -} - -# Everything BEFORE the application container — the sidecar definitions. 
-sidecars_block() { - awk '/^[[:space:]]+- name: application[[:space:]]*$/ {exit} {print}' "$TEMPLATE" -} - -@test "deployment template: application container limits.cpu uses cpu_millicores_limit" { - grep -qE 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores_limit[[:space:]]*\}\}m' <<<"$(app_container_block)" -} - -@test "deployment template: application container limits.memory uses ram_memory_limit" { - grep -qE 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory_limit[[:space:]]*\}\}Mi' <<<"$(app_container_block)" -} - -@test "deployment template: application container requests.cpu still uses cpu_millicores" { - grep -qE 'cpu:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.cpu_millicores[[:space:]]*\}\}m' <<<"$(app_container_block)" -} - -@test "deployment template: application container requests.memory still uses ram_memory" { - grep -qE 'memory:[[:space:]]*\{\{[[:space:]]*\.scope\.capabilities\.ram_memory[[:space:]]*\}\}Mi' <<<"$(app_container_block)" -} - -@test "deployment template: sidecars still use container_cpu_in_millicores / container_memory_in_memory" { - local sidecars - sidecars=$(sidecars_block) - grep -qF '{{ .container_cpu_in_millicores }}m' <<<"$sidecars" - grep -qF '{{ .container_memory_in_memory }}Mi' <<<"$sidecars" - # And sidecars must NOT have been switched to the new fields. - ! grep -qF 'cpu_millicores_limit' <<<"$sidecars" - ! 
grep -qF 'ram_memory_limit' <<<"$sidecars" -} From 44776c7889c38e0d23f32cf1579b33dfdc1875be Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Fri, 22 May 2026 16:38:06 -0300 Subject: [PATCH 47/56] feat: clamp limit to request when below it as defense-in-depth --- k8s/deployment/build_context | 9 +++++--- k8s/deployment/tests/build_context.bats | 30 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/k8s/deployment/build_context b/k8s/deployment/build_context index aedd2b6f..31880dc7 100755 --- a/k8s/deployment/build_context +++ b/k8s/deployment/build_context @@ -24,13 +24,16 @@ MIN_REPLICAS=$(echo "$MIN_REPLICAS" | awk '{printf "%d", ($1 == int($1) ? $1 : i DEPLOYMENT_STATUS=$(echo "$CONTEXT" | jq -r ".deployment.status") # Fill in *_limit capability fields with the corresponding request value when -# the limit is missing or explicitly null, so downstream templates can render -# resources.limits unconditionally and stay back-compat for scopes that never -# had these fields. +# the limit is missing or explicitly null, then clamp any limit below its +# request up to the request value. The schema rejects limit < request at save +# time; this is defense-in-depth so the script can never produce an invalid +# resources block, regardless of how the context was built. 
normalize_capability_limits() { echo "$1" | jq ' .scope.capabilities.cpu_millicores_limit //= .scope.capabilities.cpu_millicores | .scope.capabilities.ram_memory_limit //= .scope.capabilities.ram_memory + | .scope.capabilities.cpu_millicores_limit = ([.scope.capabilities.cpu_millicores, .scope.capabilities.cpu_millicores_limit] | max) + | .scope.capabilities.ram_memory_limit = ([.scope.capabilities.ram_memory, .scope.capabilities.ram_memory_limit] | max) ' } diff --git a/k8s/deployment/tests/build_context.bats b/k8s/deployment/tests/build_context.bats index ef54509a..72d9d020 100644 --- a/k8s/deployment/tests/build_context.bats +++ b/k8s/deployment/tests/build_context.bats @@ -1008,3 +1008,33 @@ set_capabilities() { assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "2000" assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "4096" } + +@test "capability limits: cpu limit below request is clamped up to request" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":100,"ram_memory":1024,"ram_memory_limit":2048}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "2048" +} + +@test "capability limits: ram limit below request is clamped up to request" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":1000,"ram_memory":1024,"ram_memory_limit":64}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "1000" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "capability limits: both limits below their requests are clamped up" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":100,"ram_memory":1024,"ram_memory_limit":64}' + + source 
"$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} From fd3912757e713c3dd8d3ca78db20927330dc2f20 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 27 May 2026 12:01:02 -0300 Subject: [PATCH 48/56] fix(k8s,scheduled_task): file-type parameter no longer leaks binary as env var --- CHANGELOG.md | 1 + k8s/deployment/templates/deployment.yaml.tpl | 2 +- k8s/deployment/templates/secret.yaml.tpl | 3 +- k8s/deployment/tests/build_deployment.bats | 106 ++++++++++++++++++ .../deployment/templates/deployment.yaml.tpl | 2 +- 5 files changed, 111 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14165264..0248fca4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Fix deployment failure for `file`-type parameters with binary content (e.g., P12 certificates): the env var injected from the deployment Secret now carries the file's destination path instead of the raw content, avoiding `invalid environment variable ... contains nul byte` errors. The file is still mounted at `destination_path` as before. 
- Public and private scopes now register DNS records in their correct Route53 hosted zone when using `DNS_TYPE=external_dns`, preventing cross-zone record leakage - Add configurable main HTTP port for k8s scopes (default 8080) and HTTP support for additional ports - Improve **wait deployment active** failure logging: consolidate repeated `Unhealthy` probe events per pod into a single human-readable line, emit a progress heartbeat every 10% of timeout, and surface a targeted suggested fix based on the probe failure mode (port not open / HTTP non-2xx / probe timeout) diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index 3552c483..74ae8ed5 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -377,7 +377,7 @@ spec: secret: secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }} items: - - key: {{ printf "app-data-%s" (filepath.Base .destination_path) }} + - key: {{ printf "app-file-%s" (filepath.Base .destination_path) }} path: {{ filepath.Base .destination_path }} {{- end }} {{- end }} diff --git a/k8s/deployment/templates/secret.yaml.tpl b/k8s/deployment/templates/secret.yaml.tpl index baa9564d..13a6d8b4 100644 --- a/k8s/deployment/templates/secret.yaml.tpl +++ b/k8s/deployment/templates/secret.yaml.tpl @@ -39,7 +39,8 @@ data: {{- end }} {{- if and (eq .type "file") }} {{- if gt (len .values) 0 }} - {{ printf "app-data-%s" (filepath.Base .destination_path) }}: {{ index .values 0 "value" | regexp.Replace "^data:[^;]+;base64," "" }} + {{ printf "app-data-%s" (filepath.Base .destination_path) }}: {{ .destination_path | base64.Encode }} + {{ printf "app-file-%s" (filepath.Base .destination_path) }}: {{ index .values 0 "value" | regexp.Replace "^data:[^;]+;base64," "" }} {{- end }} {{- end }} {{- end }} diff --git a/k8s/deployment/tests/build_deployment.bats b/k8s/deployment/tests/build_deployment.bats index f010afce..585c8f40 100644 --- 
a/k8s/deployment/tests/build_deployment.bats +++ b/k8s/deployment/tests/build_deployment.bats @@ -170,3 +170,109 @@ teardown() { [ "$status" -eq 0 ] [ ! -f "$OUTPUT_DIR/context-scope-123.json" ] } + +# ============================================================================= +# Rendering Tests — real gomplate, assert on rendered output +# ============================================================================= +# These tests run the actual `gomplate` binary against the templates and +# verify the rendered Secret + Deployment YAML have the right shape. +# +# Regression guard for the file-type parameter bug: binary file content used +# to be stored under Secret key `app-data-` and then leaked into the +# container env block via `envFrom`, which runc rejects with +# `invalid environment variable ... contains nul byte`. The fix splits the +# Secret key into: +# - app-data- -> destination_path (string, env-safe) +# - app-file- -> raw binary (volume-mount-only) +# and updates the volume mount to read from the new key. + +# Minimal context that satisfies all five templates' required fields. +# Includes both an `environment` and a `file` parameter so we can assert on +# the file-specific keys without ignoring the rest of the Secret content. 
+_render_context() { + cat <<'JSON' +{ + "account": {"id": "acc1", "slug": "acct"}, + "namespace": {"id": "ns1", "slug": "nsps"}, + "application": {"id": "app1", "slug": "appslug"}, + "release": {"semver": "1.0.0"}, + "scope": { + "id": "scope-123", + "slug": "scopeslug", + "domain": "x.example.com", + "dimensions": {"env": "dev"}, + "capabilities": { + "cpu_millicores": 100, + "ram_memory": 128, + "additional_ports": [], + "scaling_type": "fixed", + "autoscaling": { + "min_replicas": 1, + "max_replicas": 3, + "target_cpu_utilization": 80, + "target_memory_enabled": false, + "target_memory_utilization": 80 + }, + "health_check": {"path": "/health", "timeout_seconds": 1, "period_seconds": 5, "initial_delay_seconds": 5} + } + }, + "deployment": {"id": "deploy-456"}, + "k8s_namespace": "ns-test", + "k8s_modifiers": {}, + "asset": {"url": "example.com/app:latest"}, + "main_http_port": 8080, + "traffic_image": "example.com/traffic:latest", + "container_cpu_in_millicores": 50, + "container_memory_in_memory": 64, + "pull_secrets": {"ENABLED": false, "SECRETS": []}, + "region": "us-east-1", + "component": "app", + "service_account_name": "", + "traffic_manager_config_map": "", + "pdb_enabled": "false", + "pdb_max_unavailable": "25%", + "parameters": { + "results": [ + {"type": "environment", "variable": "MY_VAR", "values": [{"value": "hello"}]}, + {"type": "file", "destination_path": "/etc/certs/test.p12", "values": [{"value": "data:application/x-pkcs12;base64,QUFBQkJC"}]} + ] + } +} +JSON +} + +@test "build_deployment: file-type parameter renders path env var and separate binary key" { + unset -f gomplate # use the real gomplate binary, not the setup mock + + export CONTEXT="$(_render_context)" + + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + + local secret_file="$OUTPUT_DIR/secret-scope-123-deploy-456.yaml" + local deploy_file="$OUTPUT_DIR/deployment-scope-123-deploy-456.yaml" + + assert_file_exists "$secret_file" + assert_file_exists 
"$deploy_file" + + # Secret: app-data- holds the base64-encoded destination path, + # so envFrom injects a NUL-byte-free env var. + local expected_path_b64 + expected_path_b64=$(printf '%s' '/etc/certs/test.p12' | base64) + assert_contains "$(cat "$secret_file")" "app-data-test.p12: ${expected_path_b64}" + + # Secret: app-file- holds the raw base64 binary content for the + # volume mount. + assert_contains "$(cat "$secret_file")" "app-file-test.p12: QUFBQkJC" + + # Regression guard: the app-data key MUST NEVER carry the raw binary + # (that's the original bug — runc rejects NUL bytes in env vars). + ! grep -E '^[[:space:]]*app-data-test\.p12:[[:space:]]+QUFBQkJC[[:space:]]*$' "$secret_file" || false # '|| false' is required: a bare '!' result is ignored by set -e unless it is the test's last command + + # Deployment: the volume mount items reference the binary key. + assert_contains "$(cat "$deploy_file")" "key: app-file-test.p12" + + # Regression guard: the volume mount must not read from the env-var key, + # otherwise the materialized file would contain the path string, not the cert. + ! grep -F 'key: app-data-test.p12' "$deploy_file" || false # keep '|| false' so the guard still fails the test if assertions are appended below +} diff --git a/scheduled_task/deployment/templates/deployment.yaml.tpl b/scheduled_task/deployment/templates/deployment.yaml.tpl index b5c677d6..926640ea 100644 --- a/scheduled_task/deployment/templates/deployment.yaml.tpl +++ b/scheduled_task/deployment/templates/deployment.yaml.tpl @@ -170,7 +170,7 @@ spec: secret: secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }} items: - - key: {{ printf "app-data-%s" (filepath.Base .destination_path) }} + - key: {{ printf "app-file-%s" (filepath.Base .destination_path) }} path: {{ filepath.Base .destination_path }} {{- end }} {{- end }} From f6118cec2e59d5d555b16587635b865fc2b06d48 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 27 May 2026 12:10:59 -0300 Subject: [PATCH 49/56] 
docs(changelog): tighten file-parameter fix entry --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0248fca4..613e124b 100644 --- a/CHANGELOG.md 
+++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] -- Fix deployment failure for `file`-type parameters with binary content (e.g., P12 certificates): the env var injected from the deployment Secret now carries the file's destination path instead of the raw content, avoiding `invalid environment variable ... contains nul byte` errors. The file is still mounted at `destination_path` as before. +- Fix: do not inject file parameter as env vars - Public and private scopes now register DNS records in their correct Route53 hosted zone when using `DNS_TYPE=external_dns`, preventing cross-zone record leakage - Add configurable main HTTP port for k8s scopes (default 8080) and HTTP support for additional ports - Improve **wait deployment active** failure logging: consolidate repeated `Unhealthy` probe events per pod into a single human-readable line, emit a progress heartbeat every 10% of timeout, and surface a targeted suggested fix based on the probe failure mode (port not open / HTTP non-2xx / probe timeout) From 4c57d1ebbb768db14909626e33e3e8c85f81d2fb Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 27 May 2026 13:57:38 -0300 Subject: [PATCH 50/56] fix(k8s,scheduled_task): isolate file binary in a dedicated Secret --- k8s/deployment/build_deployment | 13 ++++ k8s/deployment/templates/deployment.yaml.tpl | 11 ++- .../templates/secret-files.yaml.tpl | 48 ++++++++++++ k8s/deployment/templates/secret.yaml.tpl | 6 -- k8s/deployment/tests/build_deployment.bats | 76 +++++++++++++------ k8s/values.yaml | 1 + scheduled_task/deployment/build_deployment | 14 ++++ .../deployment/templates/deployment.yaml.tpl | 11 ++- 8 files changed, 148 insertions(+), 32 deletions(-) create mode 100644 k8s/deployment/templates/secret-files.yaml.tpl diff --git a/k8s/deployment/build_deployment 
b/k8s/deployment/build_deployment index a51bf971..6a333b8e 100755 --- a/k8s/deployment/build_deployment +++ b/k8s/deployment/build_deployment @@ -3,6 +3,7 @@ DEPLOYMENT_PATH="$OUTPUT_DIR/deployment-$SCOPE_ID-$DEPLOYMENT_ID.yaml" SECRET_PATH="$OUTPUT_DIR/secret-$SCOPE_ID-$DEPLOYMENT_ID.yaml" +SECRET_FILES_PATH="$OUTPUT_DIR/secret-files-$SCOPE_ID-$DEPLOYMENT_ID.yaml" SCALING_PATH="$OUTPUT_DIR/scaling-$SCOPE_ID-$DEPLOYMENT_ID.yaml" SERVICE_TEMPLATE_PATH="$OUTPUT_DIR/service-$SCOPE_ID-$DEPLOYMENT_ID.yaml" PDB_PATH="$OUTPUT_DIR/pdb-$SCOPE_ID-$DEPLOYMENT_ID.yaml" @@ -38,6 +39,18 @@ if [[ $TEMPLATE_GENERATION_STATUS -ne 0 ]]; then fi log info " ✅ Secret template: $SECRET_PATH" +gomplate -c .="$CONTEXT_PATH" \ + --file "$SECRET_FILES_TEMPLATE" \ + --out "$SECRET_FILES_PATH" + +TEMPLATE_GENERATION_STATUS=$? + +if [[ $TEMPLATE_GENERATION_STATUS -ne 0 ]]; then + log error " ❌ Failed to build secret-files template" + exit 1 +fi +log info " ✅ Secret-files template: $SECRET_FILES_PATH" + gomplate -c .="$CONTEXT_PATH" \ --file "$SCALING_TEMPLATE" \ --out "$SCALING_PATH" diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index 74ae8ed5..2816940c 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -295,6 +295,15 @@ spec: envFrom: - secretRef: name: s-{{ .scope.id }}-d-{{ .deployment.id }} + {{- if .parameters.results }} + env: + {{- range .parameters.results }} + {{- if and (eq .type "file") (gt (len .values) 0) }} + - name: {{ printf "app-data-%s" (filepath.Base .destination_path) }} + value: {{ .destination_path | quote }} + {{- end }} + {{- end }} + {{- end }} image: >- {{ .asset.url }} securityContext: @@ -375,7 +384,7 @@ spec: {{- if gt (len .values) 0 }} - name: {{ printf "file-%s" (filepath.Base .destination_path | strings.ReplaceAll "." 
"-" | strings.ReplaceAll "_" "-") }} secret: - secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }} + secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }}-files items: - key: {{ printf "app-file-%s" (filepath.Base .destination_path) }} path: {{ filepath.Base .destination_path }} diff --git a/k8s/deployment/templates/secret-files.yaml.tpl b/k8s/deployment/templates/secret-files.yaml.tpl new file mode 100644 index 00000000..ea602cba --- /dev/null +++ b/k8s/deployment/templates/secret-files.yaml.tpl @@ -0,0 +1,48 @@ +{{- $hasFile := false -}} +{{- if .parameters.results -}} + {{- range .parameters.results -}} + {{- if and (eq .type "file") (gt (len .values) 0) -}} + {{- $hasFile = true -}} + {{- end -}} + {{- end -}} +{{- end -}} +{{- if $hasFile -}} +apiVersion: v1 +kind: Secret +immutable: true +metadata: + name: s-{{ .scope.id }}-d-{{ .deployment.id }}-files + namespace: {{ .k8s_namespace }} + labels: + nullplatform: "true" + account: {{ .account.slug }} + account_id: "{{ .account.id }}" + namespace: {{ .namespace.slug }} + namespace_id: "{{ .namespace.id }}" + application: {{ .application.slug }} + application_id: "{{ .application.id }}" + scope: {{ .scope.slug }} + scope_id: "{{ .scope.id }}" + deployment_id: "{{ .deployment.id }}" +{{- $global := index .k8s_modifiers "global" }} +{{- if $global }} + {{- $labels := index $global "labels" }} + {{- if $labels }} +{{ data.ToYAML $labels | indent 4 }} + {{- end }} +{{- end }} +{{- $secret := index .k8s_modifiers "secret" }} +{{- if $secret }} + {{- $labels := index $secret "labels" }} + {{- if $labels }} +{{ data.ToYAML $labels | indent 4 }} + {{- end }} +{{- end }} +data: +{{- range .parameters.results }} + {{- if and (eq .type "file") (gt (len .values) 0) }} + {{ printf "app-file-%s" (filepath.Base .destination_path) }}: {{ index .values 0 "value" | regexp.Replace "^data:[^;]+;base64," "" }} + {{- end }} +{{- end }} +type: Opaque +{{- end -}} diff --git a/k8s/deployment/templates/secret.yaml.tpl 
b/k8s/deployment/templates/secret.yaml.tpl index 13a6d8b4..59028c66 100644 --- a/k8s/deployment/templates/secret.yaml.tpl +++ b/k8s/deployment/templates/secret.yaml.tpl @@ -37,12 +37,6 @@ data: {{ .variable }}: {{ index .values 0 "value" | base64.Encode }} {{- end }} {{- end }} - {{- if and (eq .type "file") }} - {{- if gt (len .values) 0 }} - {{ printf "app-data-%s" (filepath.Base .destination_path) }}: {{ .destination_path | base64.Encode }} - {{ printf "app-file-%s" (filepath.Base .destination_path) }}: {{ index .values 0 "value" | regexp.Replace "^data:[^;]+;base64," "" }} - {{- end }} - {{- end }} {{- end }} {{- end }} NP_ACCOUNT: {{ .account.slug | base64.Encode }} diff --git a/k8s/deployment/tests/build_deployment.bats b/k8s/deployment/tests/build_deployment.bats index 585c8f40..1042e572 100644 --- a/k8s/deployment/tests/build_deployment.bats +++ b/k8s/deployment/tests/build_deployment.bats @@ -18,6 +18,7 @@ setup() { # Template paths export DEPLOYMENT_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/deployment.yaml.tpl" export SECRET_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/secret.yaml.tpl" + export SECRET_FILES_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/secret-files.yaml.tpl" export SCALING_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/scaling.yaml.tpl" export SERVICE_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/service.yaml.tpl" export PDB_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/pdb.yaml.tpl" @@ -143,6 +144,13 @@ teardown() { assert_file_exists "$OUTPUT_DIR/secret-scope-123-deploy-456.yaml" } +@test "build_deployment: creates secret-files file with correct name" { + run bash "$BATS_TEST_DIRNAME/../build_deployment" + + [ "$status" -eq 0 ] + assert_file_exists "$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" +} + @test "build_deployment: creates scaling file with correct name" { run bash "$BATS_TEST_DIRNAME/../build_deployment" @@ -178,13 +186,14 @@ teardown() { # verify the rendered Secret + Deployment YAML have the right 
shape. # # Regression guard for the file-type parameter bug: binary file content used -# to be stored under Secret key `app-data-` and then leaked into the -# container env block via `envFrom`, which runc rejects with -# `invalid environment variable ... contains nul byte`. The fix splits the -# Secret key into: -# - app-data- -> destination_path (string, env-safe) -# - app-file- -> raw binary (volume-mount-only) -# and updates the volume mount to read from the new key. +# to be stored under Secret key `app-data-` in the env-var Secret, +# which then leaked into the container env block via `envFrom`, which runc +# rejects with `invalid environment variable ... contains nul byte`. The fix +# splits the storage into two Secrets: +# - s--d- env-only, consumed via envFrom (safe) +# - s--d--files binary-only, consumed only by the volume mount +# Plus a plain `env:` entry on the application container that carries the +# file's destination path under name `app-data-`. # Minimal context that satisfies all five templates' required fields. # Includes both an `environment` and a `file` parameter so we can assert on @@ -241,7 +250,7 @@ _render_context() { JSON } -@test "build_deployment: file-type parameter renders path env var and separate binary key" { +@test "build_deployment: file-type parameter splits binary into a separate Secret" { unset -f gomplate # use the real gomplate binary, not the setup mock export CONTEXT="$(_render_context)" @@ -250,29 +259,48 @@ JSON [ "$status" -eq 0 ] local secret_file="$OUTPUT_DIR/secret-scope-123-deploy-456.yaml" + local secret_files_file="$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" local deploy_file="$OUTPUT_DIR/deployment-scope-123-deploy-456.yaml" assert_file_exists "$secret_file" + assert_file_exists "$secret_files_file" assert_file_exists "$deploy_file" - # Secret: app-data- holds the base64-encoded destination path, - # so envFrom injects a NUL-byte-free env var. 
- local expected_path_b64 - expected_path_b64=$(printf '%s' '/etc/certs/test.p12' | base64) - assert_contains "$(cat "$secret_file")" "app-data-test.p12: ${expected_path_b64}" + # The env-var Secret MUST NOT contain anything that pulls in binary content + # via envFrom. Both app-data-* and app-file-* keys are forbidden here. + ! grep -E 'app-(data|file)-test\.p12' "$secret_file" + + # The files Secret carries only the binary content, named so the volume mount + # can reference it. The Secret is in a separate object so `envFrom` on the + # env-var Secret cannot reach these bytes. + assert_contains "$(cat "$secret_files_file")" "name: s-scope-123-d-deploy-456-files" + assert_contains "$(cat "$secret_files_file")" "app-file-test.p12: QUFBQkJC" + ! grep -E 'app-data-test\.p12' "$secret_files_file" + + # The deployment exposes the destination path to the app via a plain `env:` + # entry on the application container (not via any Secret) — no NUL bytes. + assert_contains "$(cat "$deploy_file")" "- name: app-data-test.p12" + assert_contains "$(cat "$deploy_file")" 'value: "/etc/certs/test.p12"' + + # The volume mount reads bytes from the files Secret, with key matching the + # one produced by secret-files.yaml.tpl. + assert_contains "$(cat "$deploy_file")" "secretName: s-scope-123-d-deploy-456-files" + assert_contains "$(cat "$deploy_file")" "key: app-file-test.p12" +} - # Secret: app-file- holds the raw base64 binary content for the - # volume mount. - assert_contains "$(cat "$secret_file")" "app-file-test.p12: QUFBQkJC" +@test "build_deployment: secret-files renders empty when no file params" { + unset -f gomplate - # Regression guard: the app-data key MUST NEVER carry the raw binary - # (that's the original bug — runc rejects NUL bytes in env vars). - ! grep -E '^[[:space:]]*app-data-test\.p12:[[:space:]]+QUFBQkJC[[:space:]]*$' "$secret_file" + # Same context as _render_context but with the file-type param removed. 
+ export CONTEXT="$(_render_context | jq '.parameters.results |= map(select(.type != "file"))')" - # Deployment: the volume mount items reference the binary key. - assert_contains "$(cat "$deploy_file")" "key: app-file-test.p12" + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] - # Regression guard: the volume mount must not read from the env-var key, - # otherwise the materialized file would contain the path string, not the cert. - ! grep -F 'key: app-data-test.p12' "$deploy_file" + # gomplate skips writing the output file when the template renders empty, + # which is the signal to apply_templates (which iterates the OUTPUT_DIR and + # skips zero-byte/missing files) to not create an empty files-Secret in the + # cluster. + local secret_files_file="$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" + [ ! -f "$secret_files_file" ] || [ ! -s "$secret_files_file" ] } diff --git a/k8s/values.yaml b/k8s/values.yaml index d053bc0a..97c68ce6 100644 --- a/k8s/values.yaml +++ b/k8s/values.yaml @@ -22,6 +22,7 @@ configuration: DEPLOYMENT_MAX_WAIT_IN_SECONDS: 600 DEPLOYMENT_TEMPLATE: "$SERVICE_PATH/deployment/templates/deployment.yaml.tpl" SECRET_TEMPLATE: "$SERVICE_PATH/deployment/templates/secret.yaml.tpl" + SECRET_FILES_TEMPLATE: "$SERVICE_PATH/deployment/templates/secret-files.yaml.tpl" SCALING_TEMPLATE: "$SERVICE_PATH/deployment/templates/scaling.yaml.tpl" SERVICE_TEMPLATE: "$SERVICE_PATH/deployment/templates/service.yaml.tpl" PDB_TEMPLATE: "$SERVICE_PATH/deployment/templates/pdb.yaml.tpl" diff --git a/scheduled_task/deployment/build_deployment b/scheduled_task/deployment/build_deployment index a39f925e..c64f5780 100644 --- a/scheduled_task/deployment/build_deployment +++ b/scheduled_task/deployment/build_deployment @@ -2,6 +2,7 @@ DEPLOYMENT_PATH="$OUTPUT_DIR/deployment-$SCOPE_ID-$DEPLOYMENT_ID.yaml" SECRET_PATH="$OUTPUT_DIR/secret-$SCOPE_ID-$DEPLOYMENT_ID.yaml" +SECRET_FILES_PATH="$OUTPUT_DIR/secret-files-$SCOPE_ID-$DEPLOYMENT_ID.yaml" 
CONTEXT_PATH="$OUTPUT_DIR/context-$SCOPE_ID.json" echo "$CONTEXT" | jq --arg replicas "$REPLICAS" '. + {replicas: $replicas}' > "$CONTEXT_PATH" @@ -32,4 +33,17 @@ if [[ $TEMPLATE_GENERATION_STATUS -ne 0 ]]; then exit 1 fi +echo "Building Template: $SECRET_FILES_TEMPLATE to $SECRET_FILES_PATH" + +gomplate -c .="$CONTEXT_PATH" \ + --file "$SECRET_FILES_TEMPLATE" \ + --out "$SECRET_FILES_PATH" + +TEMPLATE_GENERATION_STATUS=$? + +if [[ $TEMPLATE_GENERATION_STATUS -ne 0 ]]; then + echo "Error building secret-files template" + exit 1 +fi + rm "$CONTEXT_PATH" diff --git a/scheduled_task/deployment/templates/deployment.yaml.tpl b/scheduled_task/deployment/templates/deployment.yaml.tpl index 926640ea..3a8e5712 100644 --- a/scheduled_task/deployment/templates/deployment.yaml.tpl +++ b/scheduled_task/deployment/templates/deployment.yaml.tpl @@ -139,6 +139,15 @@ spec: envFrom: - secretRef: name: s-{{ .scope.id }}-d-{{ .deployment.id }} + {{- if .parameters.results }} + env: + {{- range .parameters.results }} + {{- if and (eq .type "file") (gt (len .values) 0) }} + - name: {{ printf "app-data-%s" (filepath.Base .destination_path) }} + value: {{ .destination_path | quote }} + {{- end }} + {{- end }} + {{- end }} image: {{ .asset.url }} resources: limits: @@ -168,7 +177,7 @@ spec: {{- if gt (len .values) 0 }} - name: {{ printf "file-%s" (filepath.Base .destination_path | strings.ReplaceAll "." 
"-") }} secret: - secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }} + secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }}-files items: - key: {{ printf "app-file-%s" (filepath.Base .destination_path) }} path: {{ filepath.Base .destination_path }} From 8a093eb8aaf70daeb2dafa04f8c5c5f7414ebce9 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 27 May 2026 14:20:27 -0300 Subject: [PATCH 51/56] refactor(k8s,scheduled_task): derive file-param identifiers from .name --- k8s/deployment/templates/deployment.yaml.tpl | 11 ++++++---- .../templates/secret-files.yaml.tpl | 3 ++- k8s/deployment/tests/build_deployment.bats | 21 ++++++++++--------- .../deployment/templates/deployment.yaml.tpl | 11 ++++++---- 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index 2816940c..34752aef 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -299,7 +299,8 @@ spec: env: {{- range .parameters.results }} {{- if and (eq .type "file") (gt (len .values) 0) }} - - name: {{ printf "app-data-%s" (filepath.Base .destination_path) }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "app-data-%s" $key }} value: {{ .destination_path | quote }} {{- end }} {{- end }} @@ -364,7 +365,8 @@ spec: {{- range .parameters.results }} {{- if and (eq .type "file") }} {{- if gt (len .values) 0 }} - - name: {{ printf "file-%s" (filepath.Base .destination_path | strings.ReplaceAll "." 
"-" | strings.ReplaceAll "_" "-") }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "file-%s" $key }} mountPath: {{ .destination_path }} subPath: {{ filepath.Base .destination_path }} readOnly: true @@ -382,11 +384,12 @@ spec: {{- range .parameters.results }} {{- if and (eq .type "file") }} {{- if gt (len .values) 0 }} - - name: {{ printf "file-%s" (filepath.Base .destination_path | strings.ReplaceAll "." "-" | strings.ReplaceAll "_" "-") }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "file-%s" $key }} secret: secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }}-files items: - - key: {{ printf "app-file-%s" (filepath.Base .destination_path) }} + - key: {{ printf "app-file-%s" $key }} path: {{ filepath.Base .destination_path }} {{- end }} {{- end }} diff --git a/k8s/deployment/templates/secret-files.yaml.tpl b/k8s/deployment/templates/secret-files.yaml.tpl index ea602cba..883a3f66 100644 --- a/k8s/deployment/templates/secret-files.yaml.tpl +++ b/k8s/deployment/templates/secret-files.yaml.tpl @@ -41,7 +41,8 @@ metadata: data: {{- range .parameters.results }} {{- if and (eq .type "file") (gt (len .values) 0) }} - {{ printf "app-file-%s" (filepath.Base .destination_path) }}: {{ index .values 0 "value" | regexp.Replace "^data:[^;]+;base64," "" }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + {{ printf "app-file-%s" $key }}: {{ index .values 0 "value" | regexp.Replace "^data:[^;]+;base64," "" }} {{- end }} {{- end }} type: Opaque diff --git a/k8s/deployment/tests/build_deployment.bats b/k8s/deployment/tests/build_deployment.bats index 1042e572..2b2762f6 100644 --- a/k8s/deployment/tests/build_deployment.bats +++ b/k8s/deployment/tests/build_deployment.bats @@ -243,7 +243,7 @@ _render_context() { "parameters": { "results": [ {"type": "environment", "variable": "MY_VAR", "values": 
[{"value": "hello"}]}, - {"type": "file", "destination_path": "/etc/certs/test.p12", "values": [{"value": "data:application/x-pkcs12;base64,QUFBQkJC"}]} + {"type": "file", "name": "API P12 Cert!", "destination_path": "/etc/certs/test.p12", "values": [{"value": "data:application/x-pkcs12;base64,QUFBQkJC"}]} ] } } @@ -268,24 +268,25 @@ JSON # The env-var Secret MUST NOT contain anything that pulls in binary content # via envFrom. Both app-data-* and app-file-* keys are forbidden here. - ! grep -E 'app-(data|file)-test\.p12' "$secret_file" + ! grep -E 'app-(data|file)-' "$secret_file" - # The files Secret carries only the binary content, named so the volume mount - # can reference it. The Secret is in a separate object so `envFrom` on the - # env-var Secret cannot reach these bytes. + # Param name "API P12 Cert!" sanitizes to api-p12-cert (lowercase, runs of + # non-alphanumeric collapse to '-', leading/trailing '-' trimmed). The same + # token is reused as env name suffix, Secret data key, and volume name. assert_contains "$(cat "$secret_files_file")" "name: s-scope-123-d-deploy-456-files" - assert_contains "$(cat "$secret_files_file")" "app-file-test.p12: QUFBQkJC" - ! grep -E 'app-data-test\.p12' "$secret_files_file" + assert_contains "$(cat "$secret_files_file")" "app-file-api-p12-cert: QUFBQkJC" + ! grep -E 'app-data-' "$secret_files_file" # The deployment exposes the destination path to the app via a plain `env:` - # entry on the application container (not via any Secret) — no NUL bytes. - assert_contains "$(cat "$deploy_file")" "- name: app-data-test.p12" + # entry on the application container (not via any Secret) — no NUL bytes, + # and the env var name is derived from the parameter's display name. + assert_contains "$(cat "$deploy_file")" "- name: app-data-api-p12-cert" assert_contains "$(cat "$deploy_file")" 'value: "/etc/certs/test.p12"' # The volume mount reads bytes from the files Secret, with key matching the # one produced by secret-files.yaml.tpl. 
assert_contains "$(cat "$deploy_file")" "secretName: s-scope-123-d-deploy-456-files" - assert_contains "$(cat "$deploy_file")" "key: app-file-test.p12" + assert_contains "$(cat "$deploy_file")" "key: app-file-api-p12-cert" } @test "build_deployment: secret-files renders empty when no file params" { diff --git a/scheduled_task/deployment/templates/deployment.yaml.tpl b/scheduled_task/deployment/templates/deployment.yaml.tpl index 3a8e5712..14e8b123 100644 --- a/scheduled_task/deployment/templates/deployment.yaml.tpl +++ b/scheduled_task/deployment/templates/deployment.yaml.tpl @@ -143,7 +143,8 @@ spec: env: {{- range .parameters.results }} {{- if and (eq .type "file") (gt (len .values) 0) }} - - name: {{ printf "app-data-%s" (filepath.Base .destination_path) }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "app-data-%s" $key }} value: {{ .destination_path | quote }} {{- end }} {{- end }} @@ -162,7 +163,8 @@ spec: {{- range .parameters.results }} {{- if and (eq .type "file") }} {{- if gt (len .values) 0 }} - - name: {{ printf "file-%s" (filepath.Base .destination_path | strings.ReplaceAll "." "-") }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "file-%s" $key }} mountPath: {{ .destination_path }} subPath: {{ filepath.Base .destination_path }} readOnly: true @@ -175,11 +177,12 @@ spec: {{- range .parameters.results }} {{- if and (eq .type "file") }} {{- if gt (len .values) 0 }} - - name: {{ printf "file-%s" (filepath.Base .destination_path | strings.ReplaceAll "." 
"-") }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "file-%s" $key }} secret: secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }}-files items: - - key: {{ printf "app-file-%s" (filepath.Base .destination_path) }} + - key: {{ printf "app-file-%s" $key }} path: {{ filepath.Base .destination_path }} {{- end }} {{- end }} From dc679d2e4c8ef27a300a17c506237327ca324f70 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 27 May 2026 15:03:36 -0300 Subject: [PATCH 52/56] fix(k8s,scheduled_task): omit env: block when no file params --- k8s/deployment/templates/deployment.yaml.tpl | 8 +++++++ k8s/deployment/tests/build_deployment.bats | 23 +++++++++++++++++++ .../deployment/templates/deployment.yaml.tpl | 8 +++++++ 3 files changed, 39 insertions(+) diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index 34752aef..ffa3d6d5 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -295,7 +295,15 @@ spec: envFrom: - secretRef: name: s-{{ .scope.id }}-d-{{ .deployment.id }} + {{- $hasFile := false }} {{- if .parameters.results }} + {{- range .parameters.results }} + {{- if and (eq .type "file") (gt (len .values) 0) }} + {{- $hasFile = true }} + {{- end }} + {{- end }} + {{- end }} + {{- if $hasFile }} env: {{- range .parameters.results }} {{- if and (eq .type "file") (gt (len .values) 0) }} diff --git a/k8s/deployment/tests/build_deployment.bats b/k8s/deployment/tests/build_deployment.bats index 2b2762f6..f9df61f9 100644 --- a/k8s/deployment/tests/build_deployment.bats +++ b/k8s/deployment/tests/build_deployment.bats @@ -305,3 +305,26 @@ JSON local secret_files_file="$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" [ ! -f "$secret_files_file" ] || [ ! 
-s "$secret_files_file" ] } + +@test "build_deployment: deployment omits env: block when no file params" { + unset -f gomplate + + # Env-only param set. An empty `env:` followed by `image:` at the same indent + # is rejected by strict YAML-to-JSON converters (the deployment agent), so + # the block must not be emitted at all when there are no file params. + export CONTEXT="$(_render_context | jq '.parameters.results |= map(select(.type != "file"))')" + + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + + local deploy_file="$OUTPUT_DIR/deployment-scope-123-deploy-456.yaml" + assert_file_exists "$deploy_file" + + # Slice the application container block (from its `- name: application` + # header up to the next sibling `- name:` or `restartPolicy`) and assert + # no `env:` key appears inside it. The traffic-manager sidecar also has + # `env:`, so a file-wide grep would false-positive. + local app_block + app_block=$(awk '/^ - name: application$/{flag=1} flag && /^ restartPolicy:/{flag=0} flag' "$deploy_file") + ! 
grep -qE '^ env:' <<< "$app_block" +} diff --git a/scheduled_task/deployment/templates/deployment.yaml.tpl b/scheduled_task/deployment/templates/deployment.yaml.tpl index 14e8b123..8ec86e3c 100644 --- a/scheduled_task/deployment/templates/deployment.yaml.tpl +++ b/scheduled_task/deployment/templates/deployment.yaml.tpl @@ -139,7 +139,15 @@ spec: envFrom: - secretRef: name: s-{{ .scope.id }}-d-{{ .deployment.id }} + {{- $hasFile := false }} {{- if .parameters.results }} + {{- range .parameters.results }} + {{- if and (eq .type "file") (gt (len .values) 0) }} + {{- $hasFile = true }} + {{- end }} + {{- end }} + {{- end }} + {{- if $hasFile }} env: {{- range .parameters.results }} {{- if and (eq .type "file") (gt (len .values) 0) }} From cd89c5f3528ff1e98e39ad8856656909e50f142d Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 27 May 2026 15:05:41 -0300 Subject: [PATCH 53/56] Revert "fix(k8s,scheduled_task): omit env: block when no file params" This reverts commit dc679d2e4c8ef27a300a17c506237327ca324f70. 
--- k8s/deployment/templates/deployment.yaml.tpl | 8 ------- k8s/deployment/tests/build_deployment.bats | 23 ------------------- .../deployment/templates/deployment.yaml.tpl | 8 ------- 3 files changed, 39 deletions(-) diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index ffa3d6d5..34752aef 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -295,15 +295,7 @@ spec: envFrom: - secretRef: name: s-{{ .scope.id }}-d-{{ .deployment.id }} - {{- $hasFile := false }} {{- if .parameters.results }} - {{- range .parameters.results }} - {{- if and (eq .type "file") (gt (len .values) 0) }} - {{- $hasFile = true }} - {{- end }} - {{- end }} - {{- end }} - {{- if $hasFile }} env: {{- range .parameters.results }} {{- if and (eq .type "file") (gt (len .values) 0) }} diff --git a/k8s/deployment/tests/build_deployment.bats b/k8s/deployment/tests/build_deployment.bats index f9df61f9..2b2762f6 100644 --- a/k8s/deployment/tests/build_deployment.bats +++ b/k8s/deployment/tests/build_deployment.bats @@ -305,26 +305,3 @@ JSON local secret_files_file="$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" [ ! -f "$secret_files_file" ] || [ ! -s "$secret_files_file" ] } - -@test "build_deployment: deployment omits env: block when no file params" { - unset -f gomplate - - # Env-only param set. An empty `env:` followed by `image:` at the same indent - # is rejected by strict YAML-to-JSON converters (the deployment agent), so - # the block must not be emitted at all when there are no file params. 
- export CONTEXT="$(_render_context | jq '.parameters.results |= map(select(.type != "file"))')" - - run bash "$BATS_TEST_DIRNAME/../build_deployment" - [ "$status" -eq 0 ] - - local deploy_file="$OUTPUT_DIR/deployment-scope-123-deploy-456.yaml" - assert_file_exists "$deploy_file" - - # Slice the application container block (from its `- name: application` - # header up to the next sibling `- name:` or `restartPolicy`) and assert - # no `env:` key appears inside it. The traffic-manager sidecar also has - # `env:`, so a file-wide grep would false-positive. - local app_block - app_block=$(awk '/^ - name: application$/{flag=1} flag && /^ restartPolicy:/{flag=0} flag' "$deploy_file") - ! grep -qE '^ env:' <<< "$app_block" -} diff --git a/scheduled_task/deployment/templates/deployment.yaml.tpl b/scheduled_task/deployment/templates/deployment.yaml.tpl index 8ec86e3c..14e8b123 100644 --- a/scheduled_task/deployment/templates/deployment.yaml.tpl +++ b/scheduled_task/deployment/templates/deployment.yaml.tpl @@ -139,15 +139,7 @@ spec: envFrom: - secretRef: name: s-{{ .scope.id }}-d-{{ .deployment.id }} - {{- $hasFile := false }} {{- if .parameters.results }} - {{- range .parameters.results }} - {{- if and (eq .type "file") (gt (len .values) 0) }} - {{- $hasFile = true }} - {{- end }} - {{- end }} - {{- end }} - {{- if $hasFile }} env: {{- range .parameters.results }} {{- if and (eq .type "file") (gt (len .values) 0) }} From b994dfa89a332e38fec2b33b546d75060f7d903e Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 27 May 2026 15:43:38 -0300 Subject: [PATCH 54/56] fix(k8s,scheduled_task): quote destination_path in YAML to escape flow chars --- k8s/deployment/templates/deployment.yaml.tpl | 6 +++--- k8s/deployment/tests/build_deployment.bats | 10 ++++++++-- .../deployment/templates/deployment.yaml.tpl | 6 +++--- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index 
34752aef..e8313a92 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -367,8 +367,8 @@ spec: {{- if gt (len .values) 0 }} {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} - name: {{ printf "file-%s" $key }} - mountPath: {{ .destination_path }} - subPath: {{ filepath.Base .destination_path }} + mountPath: {{ .destination_path | quote }} + subPath: {{ filepath.Base .destination_path | quote }} readOnly: true {{- end }} {{- end }} @@ -390,7 +390,7 @@ spec: secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }}-files items: - key: {{ printf "app-file-%s" $key }} - path: {{ filepath.Base .destination_path }} + path: {{ filepath.Base .destination_path | quote }} {{- end }} {{- end }} {{- end }} diff --git a/k8s/deployment/tests/build_deployment.bats b/k8s/deployment/tests/build_deployment.bats index 2b2762f6..41adfe2a 100644 --- a/k8s/deployment/tests/build_deployment.bats +++ b/k8s/deployment/tests/build_deployment.bats @@ -243,7 +243,7 @@ _render_context() { "parameters": { "results": [ {"type": "environment", "variable": "MY_VAR", "values": [{"value": "hello"}]}, - {"type": "file", "name": "API P12 Cert!", "destination_path": "/etc/certs/test.p12", "values": [{"value": "data:application/x-pkcs12;base64,QUFBQkJC"}]} + {"type": "file", "name": "API P12 Cert!", "destination_path": "/app-data/[2026-05-27] cert.p12", "values": [{"value": "data:application/x-pkcs12;base64,QUFBQkJC"}]} ] } } @@ -281,7 +281,13 @@ JSON # entry on the application container (not via any Secret) — no NUL bytes, # and the env var name is derived from the parameter's display name. assert_contains "$(cat "$deploy_file")" "- name: app-data-api-p12-cert" - assert_contains "$(cat "$deploy_file")" 'value: "/etc/certs/test.p12"' + # The path starts with `[`, which YAML parses as a flow sequence unless the + # value is quoted. 
mountPath, subPath, path, and the env value must all be + # quoted; otherwise the deployment agent fails with `did not find expected key`. + assert_contains "$(cat "$deploy_file")" 'value: "/app-data/[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'mountPath: "/app-data/[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'subPath: "[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'path: "[2026-05-27] cert.p12"' # The volume mount reads bytes from the files Secret, with key matching the # one produced by secret-files.yaml.tpl. diff --git a/scheduled_task/deployment/templates/deployment.yaml.tpl b/scheduled_task/deployment/templates/deployment.yaml.tpl index 14e8b123..a1d9f1f1 100644 --- a/scheduled_task/deployment/templates/deployment.yaml.tpl +++ b/scheduled_task/deployment/templates/deployment.yaml.tpl @@ -165,8 +165,8 @@ spec: {{- if gt (len .values) 0 }} {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} - name: {{ printf "file-%s" $key }} - mountPath: {{ .destination_path }} - subPath: {{ filepath.Base .destination_path }} + mountPath: {{ .destination_path | quote }} + subPath: {{ filepath.Base .destination_path | quote }} readOnly: true {{- end }} {{- end }} @@ -183,7 +183,7 @@ spec: secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }}-files items: - key: {{ printf "app-file-%s" $key }} - path: {{ filepath.Base .destination_path }} + path: {{ filepath.Base .destination_path | quote }} {{- end }} {{- end }} {{- end }} From e43f0f3f4ebde1faef439511755ee05390203645 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Wed, 27 May 2026 16:02:23 -0300 Subject: [PATCH 55/56] test(scheduled_task): add build_deployment render test for file params --- .../deployment/tests/build_deployment.bats | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 scheduled_task/deployment/tests/build_deployment.bats diff --git 
a/scheduled_task/deployment/tests/build_deployment.bats b/scheduled_task/deployment/tests/build_deployment.bats new file mode 100644 index 00000000..aa29dd2b --- /dev/null +++ b/scheduled_task/deployment/tests/build_deployment.bats @@ -0,0 +1,166 @@ +#!/usr/bin/env bats +# ============================================================================= +# Tests for scheduled_task/deployment/build_deployment. +# +# Mirrors k8s/deployment/tests/build_deployment.bats with a scheduled_task +# context (CronJob instead of Deployment). The same file-parameter regressions +# apply because scheduled_task reuses the k8s secret templates and ships its +# own deployment template that follows the same two-Secret + sanitized-name +# pattern. +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + export OUTPUT_DIR="$(mktemp -d)" + export SCOPE_ID="scope-123" + export DEPLOYMENT_ID="deploy-456" + export REPLICAS="1" + + # scheduled_task reuses the k8s secret templates and ships its own + # deployment template under scheduled_task/deployment/templates/. + export DEPLOYMENT_TEMPLATE="$PROJECT_ROOT/scheduled_task/deployment/templates/deployment.yaml.tpl" + export SECRET_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/secret.yaml.tpl" + export SECRET_FILES_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/secret-files.yaml.tpl" + + export CONTEXT='{}' + + # Mock gomplate for orchestration tests (any test that doesn't `unset -f`). 
+ gomplate() { + local out_file="" + while [[ $# -gt 0 ]]; do + case $1 in + --out) out_file="$2"; shift 2 ;; + *) shift ;; + esac + done + echo "apiVersion: v1" > "$out_file" + return 0 + } + export -f gomplate +} + +teardown() { + rm -rf "$OUTPUT_DIR" + unset -f gomplate +} + +# ============================================================================= +# File creation — confirms the script renders deployment + both Secrets +# ============================================================================= +@test "build_deployment: creates deployment file with correct name" { + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + assert_file_exists "$OUTPUT_DIR/deployment-scope-123-deploy-456.yaml" +} + +@test "build_deployment: creates secret file with correct name" { + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + assert_file_exists "$OUTPUT_DIR/secret-scope-123-deploy-456.yaml" +} + +@test "build_deployment: creates secret-files file with correct name" { + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + assert_file_exists "$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" +} + +# ============================================================================= +# Rendering tests — real gomplate, assert on rendered output +# ============================================================================= +# Minimal context that satisfies the scheduled_task deployment template plus +# the shared k8s secret + secret-files templates. Includes a file param with +# (a) a display name that needs sanitizing and (b) a destination_path with a +# leading `[` to lock in YAML quoting at every insertion point. 
+_render_context() { + cat <<'JSON' +{ + "account": {"id": "acc1", "slug": "acct"}, + "namespace": {"id": "ns1", "slug": "nsps"}, + "application": {"id": "app1", "slug": "appslug"}, + "release": {"semver": "1.0.0"}, + "scope": { + "id": "scope-123", + "slug": "scopeslug", + "domain": "x.example.com", + "dimensions": {"env": "dev"}, + "capabilities": { + "cpu_millicores": 100, + "ram_memory": 128, + "cron": "*/5 * * * *", + "concurrency_policy": "Forbid", + "history_limit": {"successful": 3, "failed": 1}, + "retries": 0 + } + }, + "deployment": {"id": "deploy-456"}, + "k8s_namespace": "ns-test", + "k8s_modifiers": {}, + "asset": {"url": "example.com/app:latest"}, + "component": "app", + "service_account_name": "", + "pull_secrets": {"ENABLED": false, "SECRETS": []}, + "parameters": { + "results": [ + {"type": "environment", "variable": "MY_VAR", "values": [{"value": "hello"}]}, + {"type": "file", "name": "API P12 Cert!", "destination_path": "/app-data/[2026-05-27] cert.p12", "values": [{"value": "data:application/x-pkcs12;base64,QUFBQkJC"}]} + ] + } +} +JSON +} + +@test "build_deployment: file-type parameter splits binary into a separate Secret" { + unset -f gomplate # use the real gomplate binary + + export CONTEXT="$(_render_context)" + + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + + local secret_file="$OUTPUT_DIR/secret-scope-123-deploy-456.yaml" + local secret_files_file="$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" + local deploy_file="$OUTPUT_DIR/deployment-scope-123-deploy-456.yaml" + + assert_file_exists "$secret_file" + assert_file_exists "$secret_files_file" + assert_file_exists "$deploy_file" + + # The envFrom Secret must not carry any file-related keys, otherwise the + # binary content would be injected as an env var and runc would reject it. + ! grep -E 'app-(data|file)-' "$secret_file" + + # The files Secret holds only the binary content under a sanitized key. 
+ assert_contains "$(cat "$secret_files_file")" "name: s-scope-123-d-deploy-456-files" + assert_contains "$(cat "$secret_files_file")" "app-file-api-p12-cert: QUFBQkJC" + ! grep -E 'app-data-' "$secret_files_file" + + # The CronJob's application container gets a plain `env:` entry whose value + # is the destination path, plus a volume mount reading from the files Secret. + assert_contains "$(cat "$deploy_file")" "- name: app-data-api-p12-cert" + # Leading `[` in the path makes YAML parse the value as a flow sequence + # unless quoted — the four insertion points below all require quoting. + assert_contains "$(cat "$deploy_file")" 'value: "/app-data/[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'mountPath: "/app-data/[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'subPath: "[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'path: "[2026-05-27] cert.p12"' + + assert_contains "$(cat "$deploy_file")" "secretName: s-scope-123-d-deploy-456-files" + assert_contains "$(cat "$deploy_file")" "key: app-file-api-p12-cert" +} + +@test "build_deployment: secret-files renders empty when no file params" { + unset -f gomplate + + export CONTEXT="$(_render_context | jq '.parameters.results |= map(select(.type != "file"))')" + + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + + # gomplate skips writing the output when the template renders empty; + # apply_templates handles missing/empty files gracefully. + local secret_files_file="$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" + [ ! -f "$secret_files_file" ] || [ ! 
-s "$secret_files_file" ] +} From 6c6284f4bd0ece39eaa4c49a0fd3b9e705a07792 Mon Sep 17 00:00:00 2001 From: Federico Maleh Date: Mon, 8 Jun 2026 14:55:11 -0300 Subject: [PATCH 56/56] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d12793b..39b50a80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] + +## [1.12.0] - 2026-06-08 - Fix: do not inject file parameter as env vars - Public and private scopes now register DNS records in their correct Route53 hosted zone when using `DNS_TYPE=external_dns`, preventing cross-zone record leakage - Add configurable main HTTP port for k8s scopes (default 8080) and HTTP support for additional ports