diff --git a/.gitignore b/.gitignore index 10fe9d5c..10d7b0f7 100644 --- a/.gitignore +++ b/.gitignore @@ -151,3 +151,6 @@ testing/docker/certs/ # Claude Code .claude/ + +# Visual Studio Code +.vscode/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e0f4f23..39b50a80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +## [1.12.0] - 2026-06-08 +- Fix: do not inject file parameter as env vars +- Public and private scopes now register DNS records in their correct Route53 hosted zone when using `DNS_TYPE=external_dns`, preventing cross-zone record leakage +- Add configurable main HTTP port for k8s scopes (default 8080) and HTTP support for additional ports +- Improve **wait deployment active** failure logging: consolidate repeated `Unhealthy` probe events per pod into a single human-readable line, emit a progress heartbeat every 10% of timeout, and surface a targeted suggested fix based on the probe failure mode (port not open / HTTP non-2xx / probe timeout) +- Add configurable memory and CPU limits, independent from requests, for k8s scope containers +- Improve **k8s/diagnose** evidence: every check now emits structured evidence following a documented schema (`summary`, `severity`, `affected`, `details`, `suggested_actions`), failure findings embed the relevant pod log slice (current or previous depending on the failure mode), and a new **Application Logs** category surfaces the user-owned `application` container's log tail directly in the UI + ## [1.11.0] - 2026-04-16 - Add unit testing support - Add scope configuration diff --git a/k8s/deployment/build_context b/k8s/deployment/build_context index 0808681b..31880dc7 100755 --- a/k8s/deployment/build_context +++ 
# Ensures the cpu_millicores_limit / ram_memory_limit capability fields are
# always present and never lower than their corresponding request.
#
# For each of cpu_millicores and ram_memory:
#   - a missing or null *_limit is filled in with the request value;
#   - a *_limit lower than the request is raised to the request value.
# This is defense-in-depth so the rendered resources block can never carry
# limit < request, regardless of how the context document was built.
#
# Values are compared numerically even when they arrive as numeric strings.
# A plain jq `max` orders strings lexicographically ("500" > "1000"), which
# would fail to clamp a low limit or silently lower a valid one. The stored
# value keeps its original JSON type; only the comparison is numeric.
#
# Arguments: $1 - context JSON document
# Outputs:   the normalized JSON document on stdout
# Returns:   the exit status of jq
normalize_capability_limits() {
  printf '%s\n' "$1" | jq '
    # Numeric view of a value, used for comparison only.
    def as_number: if type == "string" then (tonumber? // .) else . end;

    # Limit to store: the request when the limit is missing or null,
    # otherwise whichever of request and limit is numerically larger.
    def effective_limit($request; $limit):
      [$request, ($limit // $request)] | max_by(as_number);

    .scope.capabilities.cpu_millicores_limit =
      effective_limit(.scope.capabilities.cpu_millicores; .scope.capabilities.cpu_millicores_limit)
    | .scope.capabilities.ram_memory_limit =
      effective_limit(.scope.capabilities.ram_memory; .scope.capabilities.ram_memory_limit)
  '
}
+CONTEXT=$(echo "$CONTEXT" | jq ' + if (.scope.capabilities.additional_ports | type) == "array" then + .scope.capabilities.additional_ports |= map(. + {traffic_manager_port: (.port + 10000)}) + else . end +') + # Check if blue deployment has K8s services for additional ports BLUE_ADDITIONAL_PORT_SERVICES="{}" if [ -n "$BLUE_DEPLOYMENT_ID" ] && [ "$BLUE_DEPLOYMENT_ID" != "null" ]; then @@ -280,6 +310,7 @@ CONTEXT=$(echo "$CONTEXT" | jq \ --arg container_memory_in_memory "$CONTAINER_MEMORY_IN_MEMORY" \ --arg container_cpu_in_millicores "$CONTAINER_CPU_IN_MILLICORES" \ --argjson blue_additional_port_services "$BLUE_ADDITIONAL_PORT_SERVICES" \ + --arg main_http_port "$MAIN_HTTP_PORT" \ '. + {blue_deployment_id: $blue_deployment_id, blue_replicas: $blue_replicas, green_replicas: $green_replicas, @@ -292,9 +323,12 @@ CONTEXT=$(echo "$CONTEXT" | jq \ traffic_manager_config_map: $traffic_manager_config_map, container_memory_in_memory: $container_memory_in_memory, container_cpu_in_millicores: $container_cpu_in_millicores, - blue_additional_port_services: $blue_additional_port_services + blue_additional_port_services: $blue_additional_port_services, + main_http_port: ($main_http_port | tonumber) }') +CONTEXT=$(normalize_capability_limits "$CONTEXT") + DEPLOYMENT_ID=$(echo "$CONTEXT" | jq -r '.deployment.id') OUTPUT_DIR="$SERVICE_PATH/output/$SCOPE_ID-$DEPLOYMENT_ID" diff --git a/k8s/deployment/build_deployment b/k8s/deployment/build_deployment index a51bf971..6a333b8e 100755 --- a/k8s/deployment/build_deployment +++ b/k8s/deployment/build_deployment @@ -3,6 +3,7 @@ DEPLOYMENT_PATH="$OUTPUT_DIR/deployment-$SCOPE_ID-$DEPLOYMENT_ID.yaml" SECRET_PATH="$OUTPUT_DIR/secret-$SCOPE_ID-$DEPLOYMENT_ID.yaml" +SECRET_FILES_PATH="$OUTPUT_DIR/secret-files-$SCOPE_ID-$DEPLOYMENT_ID.yaml" SCALING_PATH="$OUTPUT_DIR/scaling-$SCOPE_ID-$DEPLOYMENT_ID.yaml" SERVICE_TEMPLATE_PATH="$OUTPUT_DIR/service-$SCOPE_ID-$DEPLOYMENT_ID.yaml" PDB_PATH="$OUTPUT_DIR/pdb-$SCOPE_ID-$DEPLOYMENT_ID.yaml" @@ -38,6 
+39,18 @@ if [[ $TEMPLATE_GENERATION_STATUS -ne 0 ]]; then fi log info " ✅ Secret template: $SECRET_PATH" +gomplate -c .="$CONTEXT_PATH" \ + --file "$SECRET_FILES_TEMPLATE" \ + --out "$SECRET_FILES_PATH" + +TEMPLATE_GENERATION_STATUS=$? + +if [[ $TEMPLATE_GENERATION_STATUS -ne 0 ]]; then + log error " ❌ Failed to build secret-files template" + exit 1 +fi +log info " ✅ Secret-files template: $SECRET_FILES_PATH" + gomplate -c .="$CONTEXT_PATH" \ --file "$SCALING_TEMPLATE" \ --out "$SCALING_PATH" diff --git a/k8s/deployment/print_failed_deployment_hints b/k8s/deployment/print_failed_deployment_hints index 66ce5d51..33b08ff8 100644 --- a/k8s/deployment/print_failed_deployment_hints +++ b/k8s/deployment/print_failed_deployment_hints @@ -1,23 +1,310 @@ #!/bin/bash +# This file doubles as a sourceable library: +# - Sourced with PRINT_HINTS_LIB_ONLY=true → only function definitions load +# (used by wait_deployment_active to access the probe helpers during polling). +# - Sourced or executed without that flag → also runs the diagnostic main +# below (used by wait_deployment_active in the timeout/non-running paths). -HEALTH_CHECK_PATH=$(echo "$CONTEXT" | jq -r .scope.capabilities.health_check.path) -REQUESTED_MEMORY=$(echo "$CONTEXT" | jq -r .scope.capabilities.ram_memory) -SCOPE_NAME=$(echo "$CONTEXT" | jq -r .scope.name) -SCOPE_DIMENSIONS=$(echo "$CONTEXT" | jq -r .scope.dimensions) - -log error "" -log error "âš ī¸ Application Startup Issue Detected" -log error "" -log error "💡 Possible causes:" -log error " Your application was unable to start within the expected timeframe" -log error "" -log error "🔧 How to fix:" -log error " 1. Port Configuration: Ensure your application listens on port 8080" -log error " 2. Health Check Endpoint: Verify your app responds to: $HEALTH_CHECK_PATH" -log error " 3. Application Logs: Review logs for startup errors (database connections," -log error " missing dependencies, or initialization errors)" -log error " 4. 
# -----------------------------------------------------------------------------
# Probe event helpers
# -----------------------------------------------------------------------------

# Parses a Kubernetes probe-failure event message into its components and
# prints them as pipe-separated fields: kind|path|mode.
#
# Pipe (not whitespace) is used as the separator because `read -r a b c` with
# an IFS containing whitespace would collapse consecutive separators and
# swallow empty fields (path is frequently empty).
#
#   kind - "Startup", "Liveness" or "Readiness"
#   path - request path taken from the probe URL in the message, or empty when
#          the message carries no URL or the URL has no path
#   mode - one of "not yet listening", "responded HTTP <code> (expected 2xx)",
#          "request timed out" or "failed"
#
# Arguments: $1 - raw event message
# Outputs:   "kind|path|mode" on stdout
# Returns:   0 on success, 1 when the message is empty or not a probe failure
parse_probe_message() {
  local msg="$1"
  [[ -z "$msg" ]] && return 1

  local probe_kind=""
  case "$msg" in
    *"Startup probe failed"*) probe_kind="Startup" ;;
    *"Liveness probe failed"*) probe_kind="Liveness" ;;
    *"Readiness probe failed"*) probe_kind="Readiness" ;;
    *) return 1 ;;
  esac

  # The path must start with "/" right after the authority. Without that
  # anchor the regex engine backtracks on a path-less URL such as
  # http://10.0.0.1:8080 and reports the last port digit ("0") as the path.
  # The authority is matched loosely so https and IPv6 ([fd00::1]:8443) probe
  # URLs are covered too.
  local url_re='https?://[^/"[:space:]]+(/[^"[:space:]]*)'
  local probe_path=""
  if [[ "$msg" =~ $url_re ]]; then
    probe_path="${BASH_REMATCH[1]}"
  fi

  local status_re='statuscode:[[:space:]]*([0-9]+)'
  local mode_text=""
  if [[ "$msg" == *"connection refused"* ]]; then
    mode_text="not yet listening"
  elif [[ "$msg" =~ $status_re ]]; then
    mode_text="responded HTTP ${BASH_REMATCH[1]} (expected 2xx)"
  elif [[ "$msg" == *"context deadline exceeded"* || "$msg" == *"Client.Timeout"* || "$msg" == *"i/o timeout"* ]]; then
    mode_text="request timed out"
  else
    mode_text="failed"
  fi

  printf "%s|%s|%s\n" "$probe_kind" "$probe_path" "$mode_text"
}
# Shortens a pod name for display by stripping the "${K8S_DEPLOYMENT_NAME}-"
# prefix, leaving the trailing part of the name that distinguishes replicas.
# A leading "..." marks the truncation so the operator can tell the displayed
# name is a tail, not the real pod name. Falls back to the full name when
# K8S_DEPLOYMENT_NAME is unset/empty or the name does not start with it.
#
# Globals:   K8S_DEPLOYMENT_NAME (read, optional)
# Arguments: $1 - full pod name
# Outputs:   the display name on stdout
short_pod_name() {
  local name="$1"
  local prefix="${K8S_DEPLOYMENT_NAME:-}"
  if [[ -n "$prefix" && "$name" == "$prefix"-* ]]; then
    # The prefix is quoted inside the expansion so it is removed literally and
    # never interpreted as a glob pattern.
    printf '%s\n' "...${name#"${prefix}-"}"
  else
    printf '%s\n' "$name"
  fi
}

# Translates a Kubernetes probe failure message into a single user-friendly
# line.
#
# Arguments: $1 - raw event message
# Outputs:   the translation on stdout
# Returns:   0 on success; 1 when the message is empty or not a probe failure,
#            so callers can fall back to the raw text
translate_probe_message() {
  local msg="$1"
  [[ -z "$msg" ]] && return 1

  local probe_kind=""
  case "$msg" in
    *"Startup probe failed"*) probe_kind="Startup" ;;
    *"Liveness probe failed"*) probe_kind="Liveness" ;;
    *"Readiness probe failed"*) probe_kind="Readiness" ;;
    *) return 1 ;;
  esac

  # The path must start with "/" right after the authority; otherwise a
  # path-less URL such as http://10.0.0.1:8080 backtracks into the port and
  # reports its last digit as the path. The loose authority match also covers
  # https and IPv6 probe URLs.
  local url_re='https?://[^/"[:space:]]+(/[^"[:space:]]*)'
  local path_suffix=""
  if [[ "$msg" =~ $url_re && -n "${BASH_REMATCH[1]}" ]]; then
    path_suffix=" on ${BASH_REMATCH[1]}"
  fi

  local status_re='statuscode:[[:space:]]*([0-9]+)'
  if [[ "$msg" == *"connection refused"* ]]; then
    printf '%s\n' "${probe_kind} probe — app is not yet listening${path_suffix}"
  elif [[ "$msg" =~ $status_re ]]; then
    printf '%s\n' "${probe_kind} probe — app responded with HTTP ${BASH_REMATCH[1]}${path_suffix} (expected 2xx)"
  elif [[ "$msg" == *"context deadline exceeded"* || "$msg" == *"Client.Timeout"* || "$msg" == *"i/o timeout"* ]]; then
    printf '%s\n' "${probe_kind} probe — request timed out${path_suffix}"
  else
    printf '%s\n' "${probe_kind} probe failed${path_suffix}"
  fi
}

# -----------------------------------------------------------------------------
# Diagnostic state and functions
# -----------------------------------------------------------------------------

# Default ALL_EVENTS to an empty event list when the caller did not provide
# one. This cannot be written inline as "${ALL_EVENTS:-{...}}": the first "}"
# of the JSON literal terminates the expansion, so an already-set ALL_EVENTS
# would come back with a stray "}" appended.
if [[ -z "${ALL_EVENTS:-}" ]]; then
  ALL_EVENTS='{"items":[]}'
fi

FAILURE_REASON=""
FAILURE_MESSAGE=""
FAILURE_CONTAINER=""
FAILURE_EXIT_CODE=""
TOP_EVENT_REASONS=""
UNHEALTHY_MESSAGE=""
HUMAN_MESSAGE=""
SUGGESTED_FIX=""

# Inspects pod container statuses and the collected events to work out why the
# deployment failed, and fills the module-level diagnostic variables.
#
# Pod statuses are only queried when K8S_NAMESPACE and DEPLOYMENT_ID are set
# and kubectl is available; otherwise (or when no container reports a reason)
# the most frequent Warning event reason in ALL_EVENTS is used instead.
#
# Globals read:    K8S_NAMESPACE, DEPLOYMENT_ID, ALL_EVENTS, CONTEXT
# Globals written: FAILURE_REASON, FAILURE_MESSAGE, FAILURE_CONTAINER,
#                  FAILURE_EXIT_CODE, TOP_EVENT_REASONS, UNHEALTHY_MESSAGE,
#                  HUMAN_MESSAGE, SUGGESTED_FIX
diagnose_failure() {
  local pods_json=""
  if [[ -n "${K8S_NAMESPACE:-}" && -n "${DEPLOYMENT_ID:-}" ]] && command -v kubectl >/dev/null 2>&1; then
    pods_json=$(kubectl get pods -n "$K8S_NAMESPACE" -l "deployment_id=${DEPLOYMENT_ID}" -o json 2>/dev/null)
  fi

  if [[ -n "$pods_json" ]] && echo "$pods_json" | jq -e . >/dev/null 2>&1; then
    # Most frequent waiting / last-terminated reason across all containers.
    FAILURE_REASON=$(echo "$pods_json" | jq -r '
      [.items[].status.containerStatuses[]?
        | (.state.waiting.reason // .lastState.terminated.reason // empty)
      ] | map(select(. != "" and . != "Completed")) |
      group_by(.) | max_by(length) | .[0] // empty' 2>/dev/null)

    if [[ -n "$FAILURE_REASON" ]]; then
      FAILURE_MESSAGE=$(echo "$pods_json" | jq -r --arg r "$FAILURE_REASON" '
        [.items[].status.containerStatuses[]?
          | select(.state.waiting.reason == $r or .lastState.terminated.reason == $r)
          | (.state.waiting.message // .lastState.terminated.message // empty)
        ] | map(select(. != "")) | .[0] // empty' 2>/dev/null)

      FAILURE_CONTAINER=$(echo "$pods_json" | jq -r --arg r "$FAILURE_REASON" '
        [.items[].status.containerStatuses[]?
          | select(.state.waiting.reason == $r or .lastState.terminated.reason == $r)
          | .name
        ] | .[0] // empty' 2>/dev/null)

      FAILURE_EXIT_CODE=$(echo "$pods_json" | jq -r --arg r "$FAILURE_REASON" '
        [.items[].status.containerStatuses[]?
          | select(.lastState.terminated.reason == $r)
          | .lastState.terminated.exitCode
        ] | map(select(. != null)) | .[0] // empty' 2>/dev/null)
    fi
  fi

  # Up to three most frequent Warning reasons, one bullet line each.
  TOP_EVENT_REASONS=$(echo "${ALL_EVENTS:-}" | jq -r '
    .items | map(select(.type == "Warning")) |
    group_by(.reason) |
    map({reason: .[0].reason, count: length}) |
    sort_by(-.count) | .[0:3][] |
    " • \(.reason) (×\(.count))"' 2>/dev/null)

  # Message of the most recent Unhealthy warning, by the first timestamp
  # field that is present on the event.
  UNHEALTHY_MESSAGE=$(echo "${ALL_EVENTS:-}" | jq -r '
    .items
    | map(select(.type == "Warning" and .reason == "Unhealthy"))
    | sort_by(.eventTime // .lastTimestamp // .firstTimestamp // .metadata.creationTimestamp // "")
    | last
    | .message // empty' 2>/dev/null)

  if [[ -z "$FAILURE_REASON" ]]; then
    FAILURE_REASON=$(echo "${ALL_EVENTS:-}" | jq -r '
      .items | map(select(.type == "Warning")) |
      group_by(.reason) | max_by(length) | .[0].reason // empty' 2>/dev/null)
  fi

  # Default to an empty document when CONTEXT is unset. This cannot be written
  # inline as "${CONTEXT:-{}}": the first "}" closes the expansion, so a set
  # CONTEXT would come back with a stray "}" appended.
  local context_json="${CONTEXT:-}"
  [[ -n "$context_json" ]] || context_json='{}'

  local memory_limit scope_name health_check_path main_http_port
  # The container limit is ram_memory_limit when present; ram_memory is only
  # the request and is used as a fallback.
  memory_limit=$(echo "$context_json" | jq -r '.scope.capabilities.ram_memory_limit // .scope.capabilities.ram_memory // empty' 2>/dev/null)
  scope_name=$(echo "$context_json" | jq -r '.scope.name // empty' 2>/dev/null)
  health_check_path=$(echo "$context_json" | jq -r '.scope.capabilities.health_check.path // "/"' 2>/dev/null)
  # The main HTTP port is configurable; fall back to 8080 when the context
  # does not carry it.
  main_http_port=$(echo "$context_json" | jq -r '.main_http_port // .scope.capabilities.main_http_port // 8080' 2>/dev/null)
  [[ -n "$main_http_port" ]] || main_http_port=8080

  case "$FAILURE_REASON" in
    ImagePullBackOff|ErrImagePull)
      HUMAN_MESSAGE="The container image could not be pulled."
      SUGGESTED_FIX="Verify the image name, tag, and registry credentials are correct." ;;
    CrashLoopBackOff|BackOff)
      HUMAN_MESSAGE="The container started and crashed repeatedly."
      SUGGESTED_FIX="Review application logs for startup errors (failed dependencies, bad config, panics)." ;;
    OOMKilled)
      if [[ -n "$memory_limit" ]]; then
        HUMAN_MESSAGE="The container exceeded its memory limit (${memory_limit}Mi) and was terminated."
      else
        HUMAN_MESSAGE="The container exceeded its memory limit and was terminated."
      fi
      SUGGESTED_FIX="Increase ram_memory for scope '$scope_name' or reduce application memory usage." ;;
    CreateContainerConfigError)
      HUMAN_MESSAGE="The container configuration is invalid."
      SUGGESTED_FIX="Check for missing secrets or configmaps referenced by the deployment." ;;
    CreateContainerError)
      HUMAN_MESSAGE="Kubernetes could not create the container."
      SUGGESTED_FIX="Check volumes, permissions, and the pod spec for errors." ;;
    RunContainerError)
      HUMAN_MESSAGE="The container failed to run its entrypoint."
      SUGGESTED_FIX="Verify the start command and that required binaries exist in the image." ;;
    ContainerCannotRun)
      HUMAN_MESSAGE="The startup binary is missing or not executable inside the image."
      SUGGESTED_FIX="Rebuild the image ensuring the entrypoint exists and has execute permissions." ;;
    FailedScheduling)
      HUMAN_MESSAGE="No node has enough resources or matches the pod's scheduling constraints."
      SUGGESTED_FIX="Reduce requested resources, free cluster capacity, or review nodeSelector/affinity rules." ;;
    FailedMount|FailedAttachVolume)
      HUMAN_MESSAGE="A volume could not be mounted onto the pod."
      SUGGESTED_FIX="Check that the referenced PVC, secret, or configmap exists and is accessible." ;;
    Unhealthy)
      HUMAN_MESSAGE="The application did not pass its health check at $health_check_path."
      if [[ -n "$UNHEALTHY_MESSAGE" ]]; then
        local translated=""
        translated=$(translate_probe_message "$UNHEALTHY_MESSAGE" 2>/dev/null) || translated=""
        if [[ -n "$translated" ]]; then
          HUMAN_MESSAGE="$HUMAN_MESSAGE Detected: $translated."
        else
          # Fallback: surface the raw K8s message so context is not lost
          HUMAN_MESSAGE="$HUMAN_MESSAGE Detected: $UNHEALTHY_MESSAGE"
        fi
      fi
      if [[ "$UNHEALTHY_MESSAGE" == *"connection refused"* ]]; then
        SUGGESTED_FIX="The container is not listening on port ${main_http_port} — verify the start command runs, the process binds to 0.0.0.0:${main_http_port}, and nothing is crashing before it accepts connections."
      elif [[ "$UNHEALTHY_MESSAGE" =~ statuscode:[[:space:]]*([0-9]+) ]]; then
        SUGGESTED_FIX="The app responded with HTTP ${BASH_REMATCH[1]} on $health_check_path — inspect application logs for startup errors; the process is running but $health_check_path is not returning 2xx."
      elif [[ "$UNHEALTHY_MESSAGE" == *"context deadline exceeded"* || "$UNHEALTHY_MESSAGE" == *"Client.Timeout"* || "$UNHEALTHY_MESSAGE" == *"i/o timeout"* ]]; then
        SUGGESTED_FIX="The probe timed out — the app may be slow to start or $health_check_path is blocking. Consider increasing startup probe initialDelaySeconds/timeoutSeconds, or making $health_check_path lighter."
      else
        SUGGESTED_FIX="Ensure the app listens on port ${main_http_port} and returns 2xx on $health_check_path within the readiness window."
      fi ;;
    FailedCreate|FailedCreatePodSandBox)
      HUMAN_MESSAGE="Kubernetes could not create the pod sandbox."
      SUGGESTED_FIX="Check node health, CNI configuration, and pod security policies." ;;
    "")
      HUMAN_MESSAGE=""
      SUGGESTED_FIX="" ;;
    *)
      HUMAN_MESSAGE="Pods are failing with reason: $FAILURE_REASON"
      SUGGESTED_FIX="" ;;
  esac
}

# Logs the findings gathered by diagnose_failure: the human-readable reason,
# the detected reason (with container and exit code when known), the raw
# details, rollout progress, the most frequent warnings and the suggested fix.
# Sections with no data are skipped.
#
# Globals read: HUMAN_MESSAGE, FAILURE_REASON, FAILURE_CONTAINER,
#               FAILURE_EXIT_CODE, FAILURE_MESSAGE, TOP_EVENT_REASONS,
#               SUGGESTED_FIX, and the caller's desired / ready / current
#               counters (optional)
print_specific_diagnostics() {
  if [[ -n "$HUMAN_MESSAGE" ]]; then
    log error ""
    log error "📋 Reason: $HUMAN_MESSAGE"
  fi

  if [[ -n "$FAILURE_REASON" && -n "$FAILURE_CONTAINER" ]]; then
    local detail="📋 Detected: $FAILURE_REASON on container $FAILURE_CONTAINER"
    if [[ -n "$FAILURE_EXIT_CODE" ]]; then
      detail="$detail (exit $FAILURE_EXIT_CODE)"
    fi
    log error "$detail"
  elif [[ -n "$FAILURE_REASON" ]]; then
    log error "📋 Detected: $FAILURE_REASON"
  fi

  if [[ -n "$FAILURE_MESSAGE" ]]; then
    log error "📋 Details: $FAILURE_MESSAGE"
  fi

  if [[ -n "${desired:-}" ]]; then
    log error "📊 Progress at failure: ${ready:-0}/${desired} ready, ${current:-0}/${desired} available"
  fi

  if [[ -n "$TOP_EVENT_REASONS" ]]; then
    log error "📋 Recent warnings:"
    local line
    while IFS= read -r line; do
      [[ -n "$line" ]] && log error "$line"
    done <<< "$TOP_EVENT_REASONS"
  fi

  if [[ -n "$SUGGESTED_FIX" ]]; then
    log error "💡 Suggested fix: $SUGGESTED_FIX"
  fi
}

# Logs the generic startup checklist shown when no targeted fix is available.
# The values quoted in the checklist are read from CONTEXT.
#
# Globals read: CONTEXT
print_generic_hints() {
  local health_check_path requested_memory scope_name scope_dimensions main_http_port
  health_check_path=$(echo "${CONTEXT:-}" | jq -r .scope.capabilities.health_check.path)
  requested_memory=$(echo "${CONTEXT:-}" | jq -r .scope.capabilities.ram_memory)
  scope_name=$(echo "${CONTEXT:-}" | jq -r .scope.name)
  scope_dimensions=$(echo "${CONTEXT:-}" | jq -r .scope.dimensions)
  # The main HTTP port is configurable; fall back to 8080 when the context
  # does not carry it.
  main_http_port=$(echo "${CONTEXT:-}" | jq -r '.main_http_port // .scope.capabilities.main_http_port // 8080' 2>/dev/null)
  [[ -n "$main_http_port" ]] || main_http_port=8080

  log error ""
  log error "⚠️ Application Startup Issue Detected"
  log error ""
  log error "💡 Possible causes:"
  log error " Your application was unable to start within the expected timeframe"
  log error ""
  log error "🔧 How to fix:"
  log error " 1. Port Configuration: Ensure your application listens on port ${main_http_port}"
  log error " 2. Health Check Endpoint: Verify your app responds to: $health_check_path"
  log error " 3. Application Logs: Review logs for startup errors (database connections,"
  log error " missing dependencies, or initialization errors)"
  log error " 4. Memory Allocation: Current allocation is ${requested_memory}Mi - increase if needed"
  log error " 5. Environment Variables: Verify all required variables are configured in"
  log error " parameters for scope '$scope_name' or dimensions: $scope_dimensions"
  log error ""
}

# Run the diagnostic main only when not being sourced just for the helpers.
+if [[ "${PRINT_HINTS_LIB_ONLY:-false}" != "true" ]]; then + diagnose_failure + print_specific_diagnostics + + if [[ -z "$SUGGESTED_FIX" ]]; then + print_generic_hints + fi +fi diff --git a/k8s/deployment/templates/blue-green-ingress.yaml.tpl b/k8s/deployment/templates/blue-green-ingress.yaml.tpl index 20a0a5b0..c35f53fe 100644 --- a/k8s/deployment/templates/blue-green-ingress.yaml.tpl +++ b/k8s/deployment/templates/blue-green-ingress.yaml.tpl @@ -30,8 +30,8 @@ metadata: annotations: alb.ingress.kubernetes.io/actions.bg-deployment: >- {"type":"forward","forwardConfig":{"targetGroups":[ - {"serviceName":"d-{{ .scope.id }}-{{ .blue_deployment_id }}","servicePort":8080,"weight":{{ sub 100 .deployment.strategy_data.desired_switched_traffic }}}, - {"serviceName":"d-{{ .scope.id }}-{{ .deployment.id }}","servicePort":8080,"weight":{{ .deployment.strategy_data.desired_switched_traffic }}} + {"serviceName":"d-{{ .scope.id }}-{{ .blue_deployment_id }}","servicePort":{{ .main_http_port }},"weight":{{ sub 100 .deployment.strategy_data.desired_switched_traffic }}}, + {"serviceName":"d-{{ .scope.id }}-{{ .deployment.id }}","servicePort":{{ .main_http_port }},"weight":{{ .deployment.strategy_data.desired_switched_traffic }}} ]}} alb.ingress.kubernetes.io/actions.response-404: '{"type":"fixed-response","fixedResponseConfig":{"contentType":"text/plain","statusCode":"404","messageBody":"404 scope not found or has not been deployed yet"}}' alb.ingress.kubernetes.io/group.name: {{ .alb_name }} @@ -145,8 +145,7 @@ metadata: alb.ingress.kubernetes.io/target-node-labels: account={{ $.account.slug }},namespace={{ $.namespace.slug }},application={{ $.application.slug }},account_id={{ $.account.id }},namespace_id={{ $.namespace.id }},application_id={{ $.application.id }},scope={{ $.scope.slug }},scope_id={{ $.scope.id }},nullplatform=true alb.ingress.kubernetes.io/target-type: ip {{ if eq .type "HTTP" }} - alb.ingress.kubernetes.io/listen-ports: '[{"HTTP":80},{"HTTPS":443}]' - 
alb.ingress.kubernetes.io/ssl-redirect: "443" + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":{{ .port }}}]' {{ else if eq .type "GRPC" }} alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":{{ .port }}}]' alb.ingress.kubernetes.io/backend-protocol-version: GRPC diff --git a/k8s/deployment/templates/deployment.yaml.tpl b/k8s/deployment/templates/deployment.yaml.tpl index 6acf1c95..cce128bc 100644 --- a/k8s/deployment/templates/deployment.yaml.tpl +++ b/k8s/deployment/templates/deployment.yaml.tpl @@ -135,6 +135,8 @@ spec: - containerPort: 80 protocol: TCP env: + - name: UPSTREAM_PORT + value: '{{ .main_http_port }}' - name: HEALTH_CHECK_TYPE value: http - name: GRACE_PERIOD @@ -151,7 +153,7 @@ spec: cpu: 31m livenessProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" 8080 }} + {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" .main_http_port }} {{- else }} {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 80 }} {{- end }} @@ -159,7 +161,7 @@ spec: failureThreshold: 9 readinessProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" 8080 }} + {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" .main_http_port }} {{- else }} {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 80 }} {{- end }} @@ -167,7 +169,7 @@ spec: failureThreshold: 3 startupProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 
"app_port" 8080 }} + {{- template "probe.tcp" dict "healthCheck" .scope.capabilities.health_check "traffic_port" 80 "app_port" .main_http_port }} {{- else }} {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 80 }} {{- end }} @@ -229,6 +231,63 @@ spec: terminationMessagePath: /dev/termination-log terminationMessagePolicy: File imagePullPolicy: Always + {{ else if eq .type "HTTP" }} + - name: http-{{ .port }} + securityContext: + runAsUser: 0 + image: {{ $.traffic_image }} + ports: + - containerPort: {{ .traffic_manager_port }} + protocol: TCP + env: + - name: UPSTREAM_PORT + value: '{{ .port }}' + - name: HEALTH_CHECK_TYPE + value: http + - name: GRACE_PERIOD + value: '15' + - name: LISTENER_PROTOCOL + value: http + - name: LISTENER_PORT + value: '{{ .traffic_manager_port }}' + - name: HEALTH_CHECK_PATH + value: {{ $.scope.capabilities.health_check.path }} + resources: + limits: + cpu: {{ $.container_cpu_in_millicores }}m + memory: {{ $.container_memory_in_memory }}Mi + requests: + cpu: 31m + livenessProbe: + httpGet: + path: {{ $.scope.capabilities.health_check.path }} + port: {{ .traffic_manager_port }} + timeoutSeconds: 5 + periodSeconds: 10 + initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} + successThreshold: 1 + failureThreshold: 9 + readinessProbe: + httpGet: + path: {{ $.scope.capabilities.health_check.path }} + port: {{ .traffic_manager_port }} + timeoutSeconds: 5 + periodSeconds: 10 + initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} + successThreshold: 1 + failureThreshold: 3 + startupProbe: + httpGet: + path: {{ $.scope.capabilities.health_check.path }} + port: {{ .traffic_manager_port }} + timeoutSeconds: 5 + periodSeconds: 10 + initialDelaySeconds: {{ $.scope.capabilities.health_check.initial_delay_seconds }} + successThreshold: 1 + failureThreshold: 90 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + imagePullPolicy: 
Always {{ end }} {{ end }} {{ end }} @@ -236,47 +295,59 @@ spec: envFrom: - secretRef: name: s-{{ .scope.id }}-d-{{ .deployment.id }} + {{- if .parameters.results }} + env: + {{- range .parameters.results }} + {{- if and (eq .type "file") (gt (len .values) 0) }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "app-data-%s" $key }} + value: {{ .destination_path | quote }} + {{- end }} + {{- end }} + {{- end }} image: >- {{ .asset.url }} securityContext: runAsUser: 0 ports: - - containerPort: 8080 + - containerPort: {{ .main_http_port }} protocol: TCP {{ if .scope.capabilities.additional_ports }} {{ range .scope.capabilities.additional_ports }} + {{ if eq .type "HTTP" }} - containerPort: {{ .port }} protocol: TCP {{ end }} {{ end }} + {{ end }} resources: limits: - cpu: {{ .scope.capabilities.cpu_millicores }}m - memory: {{ .scope.capabilities.ram_memory }}Mi + cpu: {{ .scope.capabilities.cpu_millicores_limit }}m + memory: {{ .scope.capabilities.ram_memory_limit }}Mi requests: cpu: {{ .scope.capabilities.cpu_millicores }}m memory: {{ .scope.capabilities.ram_memory }}Mi livenessProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.app_tcp" dict "port" 8080 }} + {{- template "probe.app_tcp" dict "port" .main_http_port }} {{- else }} - {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 8080 }} + {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" .main_http_port }} {{- end }} {{- template "probe.base" dict "healthCheck" .scope.capabilities.health_check }} failureThreshold: 6 readinessProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.app_tcp" dict "port" 8080 }} + {{- template "probe.app_tcp" dict "port" .main_http_port }} {{- else }} - {{- template "probe.http" dict 
"healthCheck" .scope.capabilities.health_check "port" 8080 }} + {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" .main_http_port }} {{- end }} {{- template "probe.base" dict "healthCheck" .scope.capabilities.health_check }} failureThreshold: 3 startupProbe: {{- if and (has .scope.capabilities.health_check "type") (eq .scope.capabilities.health_check.type "TCP") }} - {{- template "probe.app_tcp" dict "port" 8080 }} + {{- template "probe.app_tcp" dict "port" .main_http_port }} {{- else }} - {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" 8080 }} + {{- template "probe.http" dict "healthCheck" .scope.capabilities.health_check "port" .main_http_port }} {{- end }} {{- template "probe.base" dict "healthCheck" .scope.capabilities.health_check }} failureThreshold: 90 @@ -294,9 +365,10 @@ spec: {{- range .parameters.results }} {{- if and (eq .type "file") }} {{- if gt (len .values) 0 }} - - name: {{ printf "file-%s" (filepath.Base .destination_path | strings.ReplaceAll "." "-" | strings.ReplaceAll "_" "-") }} - mountPath: {{ .destination_path }} - subPath: {{ filepath.Base .destination_path }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "file-%s" $key }} + mountPath: {{ .destination_path | quote }} + subPath: {{ filepath.Base .destination_path | quote }} readOnly: true {{- end }} {{- end }} @@ -312,12 +384,13 @@ spec: {{- range .parameters.results }} {{- if and (eq .type "file") }} {{- if gt (len .values) 0 }} - - name: {{ printf "file-%s" (filepath.Base .destination_path | strings.ReplaceAll "." 
"-" | strings.ReplaceAll "_" "-") }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "file-%s" $key }} secret: - secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }} + secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }}-files items: - - key: {{ printf "app-data-%s" (filepath.Base .destination_path) }} - path: {{ filepath.Base .destination_path }} + - key: {{ printf "app-file-%s" $key }} + path: {{ filepath.Base .destination_path | quote }} {{- end }} {{- end }} {{- end }} diff --git a/k8s/deployment/templates/dns-endpoint.yaml.tpl b/k8s/deployment/templates/dns-endpoint.yaml.tpl index e68e1903..0e8ccf45 100644 --- a/k8s/deployment/templates/dns-endpoint.yaml.tpl +++ b/k8s/deployment/templates/dns-endpoint.yaml.tpl @@ -1,7 +1,7 @@ apiVersion: externaldns.k8s.io/v1alpha1 kind: DNSEndpoint metadata: - name: k-8-s-{{ .scope.slug }}-{{ .scope.id }}-dns + name: k8s-{{ .application.slug | strings.Trunc 20 | strings.TrimSuffix "-" }}-{{ .scope.slug | strings.Trunc 20 | strings.TrimSuffix "-" }}-{{ .scope.id }}-dns namespace: {{ .k8s_namespace }} labels: nullplatform: "true" @@ -13,10 +13,11 @@ metadata: application_id: "{{ .application.id }}" scope: {{ .scope.slug }} scope_id: "{{ .scope.id }}" + dns/zone-type: {{ .dns_zone_type | default "public" }} spec: endpoints: - dnsName: {{ .scope.domain }} recordTTL: 60 - recordType: A + recordType: {{ .record_type }} targets: - "{{ .gateway_ip }}" diff --git a/k8s/deployment/templates/initial-ingress.yaml.tpl b/k8s/deployment/templates/initial-ingress.yaml.tpl index d2d099ac..088a1eaf 100644 --- a/k8s/deployment/templates/initial-ingress.yaml.tpl +++ b/k8s/deployment/templates/initial-ingress.yaml.tpl @@ -62,7 +62,7 @@ spec: service: name: d-{{ .scope.id }}-{{ .deployment.id }} port: - number: 8080 + number: {{ .main_http_port }} {{- range .scope.domains }} - host: {{ .name }} http: @@ -73,7 +73,7 @@ spec: service: name: d-{{ $.scope.id }}-{{ $.deployment.id }} 
port: - number: 8080 + number: {{ $.main_http_port }} {{- end }} {{ if .scope.capabilities.additional_ports }} {{ range .scope.capabilities.additional_ports }} @@ -110,7 +110,7 @@ metadata: annotations: alb.ingress.kubernetes.io/group.name: {{ $.alb_name }} {{ if eq .type "HTTP" }} - alb.ingress.kubernetes.io/listen-ports: '[{"HTTP":80},{"HTTPS":443}]' + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":{{ .port }}}]' {{ else if eq .type "GRPC" }} alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":{{ .port }}}]' alb.ingress.kubernetes.io/backend-protocol-version: GRPC diff --git a/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl b/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl index 5f45ad58..d3beef1e 100644 --- a/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl +++ b/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl @@ -59,13 +59,13 @@ spec: - group: "" kind: Service name: d-{{ .scope.id }}-{{ .blue_deployment_id }} - port: 8080 + port: {{ .main_http_port }} weight: {{ sub 100 .deployment.strategy_data.desired_switched_traffic }} # Green deployment (new version) - group: "" kind: Service name: d-{{ .scope.id }}-{{ .deployment.id }} - port: 8080 + port: {{ .main_http_port }} weight: {{ .deployment.strategy_data.desired_switched_traffic }} matches: - path: diff --git a/k8s/deployment/templates/istio/initial-httproute.yaml.tpl b/k8s/deployment/templates/istio/initial-httproute.yaml.tpl index f300a5d3..245e414e 100644 --- a/k8s/deployment/templates/istio/initial-httproute.yaml.tpl +++ b/k8s/deployment/templates/istio/initial-httproute.yaml.tpl @@ -58,7 +58,7 @@ spec: - group: "" kind: Service name: d-{{ .scope.id }}-{{ .deployment.id }} - port: 8080 + port: {{ .main_http_port }} weight: 1 matches: - path: diff --git a/k8s/deployment/templates/istio/service.yaml.tpl b/k8s/deployment/templates/istio/service.yaml.tpl index 051579e4..5a055581 100644 --- a/k8s/deployment/templates/istio/service.yaml.tpl +++ 
b/k8s/deployment/templates/istio/service.yaml.tpl @@ -41,7 +41,7 @@ metadata: spec: ports: - protocol: TCP - port: 8080 + port: {{ .main_http_port }} targetPort: 80 selector: nullplatform: "true" diff --git a/k8s/deployment/templates/secret-files.yaml.tpl b/k8s/deployment/templates/secret-files.yaml.tpl new file mode 100644 index 00000000..883a3f66 --- /dev/null +++ b/k8s/deployment/templates/secret-files.yaml.tpl @@ -0,0 +1,49 @@ +{{- $hasFile := false -}} +{{- if .parameters.results -}} + {{- range .parameters.results -}} + {{- if and (eq .type "file") (gt (len .values) 0) -}} + {{- $hasFile = true -}} + {{- end -}} + {{- end -}} +{{- end -}} +{{- if $hasFile -}} +apiVersion: v1 +kind: Secret +immutable: true +metadata: + name: s-{{ .scope.id }}-d-{{ .deployment.id }}-files + namespace: {{ .k8s_namespace }} + labels: + nullplatform: "true" + account: {{ .account.slug }} + account_id: "{{ .account.id }}" + namespace: {{ .namespace.slug }} + namespace_id: "{{ .namespace.id }}" + application: {{ .application.slug }} + application_id: "{{ .application.id }}" + scope: {{ .scope.slug }} + scope_id: "{{ .scope.id }}" + deployment_id: "{{ .deployment.id }}" +{{- $global := index .k8s_modifiers "global" }} +{{- if $global }} + {{- $labels := index $global "labels" }} + {{- if $labels }} +{{ data.ToYAML $labels | indent 4 }} + {{- end }} +{{- end }} +{{- $secret := index .k8s_modifiers "secret" }} +{{- if $secret }} + {{- $labels := index $secret "labels" }} + {{- if $labels }} +{{ data.ToYAML $labels | indent 4 }} + {{- end }} +{{- end }} +data: +{{- range .parameters.results }} + {{- if and (eq .type "file") (gt (len .values) 0) }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + {{ printf "app-file-%s" $key }}: {{ index .values 0 "value" | regexp.Replace "^data:[^;]+;base64," "" }} + {{- end }} +{{- end }} +type: Opaque +{{- end -}} diff --git a/k8s/deployment/templates/secret.yaml.tpl 
b/k8s/deployment/templates/secret.yaml.tpl index baa9564d..59028c66 100644 --- a/k8s/deployment/templates/secret.yaml.tpl +++ b/k8s/deployment/templates/secret.yaml.tpl @@ -37,11 +37,6 @@ data: {{ .variable }}: {{ index .values 0 "value" | base64.Encode }} {{- end }} {{- end }} - {{- if and (eq .type "file") }} - {{- if gt (len .values) 0 }} - {{ printf "app-data-%s" (filepath.Base .destination_path) }}: {{ index .values 0 "value" | regexp.Replace "^data:[^;]+;base64," "" }} - {{- end }} - {{- end }} {{- end }} {{- end }} NP_ACCOUNT: {{ .account.slug | base64.Encode }} diff --git a/k8s/deployment/templates/service.yaml.tpl b/k8s/deployment/templates/service.yaml.tpl index a9299fb3..ab74d346 100644 --- a/k8s/deployment/templates/service.yaml.tpl +++ b/k8s/deployment/templates/service.yaml.tpl @@ -57,7 +57,7 @@ metadata: spec: ports: - protocol: TCP - port: 8080 + port: {{ .main_http_port }} targetPort: 80 selector: nullplatform: "true" @@ -101,14 +101,14 @@ metadata: scope: {{ $.scope.slug }} scope_id: "{{ $.scope.id }}" deployment_id: "{{ $.deployment.id }}" -{{- $global := index .k8s_modifiers "global" }} +{{- $global := index $.k8s_modifiers "global" }} {{- if $global }} {{- $labels := index $global "labels" }} {{- if $labels }} {{ data.ToYAML $labels | indent 4 }} {{- end }} {{- end }} -{{- $service := index .k8s_modifiers "service" }} +{{- $service := index $.k8s_modifiers "service" }} {{- if $service }} {{- $labels := index $service "labels" }} {{- if $labels }} @@ -124,14 +124,14 @@ metadata: alb.ingress.kubernetes.io/success-codes: 200-299 alb.ingress.kubernetes.io/unhealthy-threshold-count: '3' alb.ingress.kubernetes.io/backend-protocol: HTTP -{{- $global := index .k8s_modifiers "global" }} +{{- $global := index $.k8s_modifiers "global" }} {{- if $global }} {{- $annotations := index $global "annotations" }} {{- if $annotations }} {{ data.ToYAML $annotations | indent 4 }} {{- end }} {{- end }} -{{- $service := index .k8s_modifiers "service" }} +{{- $service 
:= index $.k8s_modifiers "service" }} {{- if $service }} {{- $annotations := index $service "annotations" }} {{- if $annotations }} @@ -142,7 +142,7 @@ spec: ports: - protocol: TCP port: {{ .port }} - targetPort: {{ .port }} + targetPort: {{ .traffic_manager_port }} selector: nullplatform: "true" account: {{ $.account.slug }} diff --git a/k8s/deployment/tests/build_context.bats b/k8s/deployment/tests/build_context.bats index ce8aa579..72d9d020 100644 --- a/k8s/deployment/tests/build_context.bats +++ b/k8s/deployment/tests/build_context.bats @@ -800,3 +800,241 @@ SCRIPT assert_equal "$grpc_exists" "true" assert_equal "$http_exists" "false" } + +# ============================================================================= +# main_http_port extraction tests (CLIEN-739) +# ============================================================================= + +@test "main_http_port: defaults to 8080 when capability missing" { + CONTEXT='{"scope":{"capabilities":{}}}' + result=$(echo "$CONTEXT" | jq -r '.scope.capabilities.main_http_port // 8080') + assert_equal "$result" "8080" +} + +@test "main_http_port: defaults to 8080 when capability is null" { + CONTEXT='{"scope":{"capabilities":{"main_http_port":null}}}' + result=$(echo "$CONTEXT" | jq -r '.scope.capabilities.main_http_port // 8080') + assert_equal "$result" "8080" +} + +@test "main_http_port: respects explicit value when set" { + CONTEXT='{"scope":{"capabilities":{"main_http_port":9090}}}' + result=$(echo "$CONTEXT" | jq -r '.scope.capabilities.main_http_port // 8080') + assert_equal "$result" "9090" +} + +@test "main_http_port: jq cast to number preserves integer type for templates" { + MAIN_HTTP_PORT="9090" + result=$(echo '{}' | jq --arg main_http_port "$MAIN_HTTP_PORT" '. 
+ {main_http_port: ($main_http_port | tonumber)} | .main_http_port') + assert_equal "$result" "9090" +} + +# ============================================================================= +# additional_ports enrichment: traffic_manager_port = port + 10000 +# These tests source the real deployment/build_context and assert on the +# resulting CONTEXT, so the entire pipeline (scope/build_context -> deployment +# enrichment) is exercised. +# ============================================================================= + +# Stages the full environment needed to source deployment/build_context: +# external commands (kubectl, aws) mocked, required env vars set, and CONTEXT +# pre-populated with a deployment that satisfies validate_status. The caller +# patches CONTEXT.scope.capabilities.additional_ports for the case under test. +setup_full_build_context() { + export SERVICE_PATH="$PROJECT_ROOT/k8s" + export SCRIPT="$PROJECT_ROOT/k8s/deployment/build_context" + export NP_OUTPUT_DIR="$(mktemp -d)" + export SERVICE_ACTION="start-initial" + # Skip the route53 / additional-balancer code paths that would call `aws`. 
+ export DNS_TYPE="external_dns" + + kubectl() { + case "$1 $2" in + "get namespace") return 0 ;; + "get service") return 1 ;; # no blue services -> empty map, harmless + *) return 0 ;; + esac + } + export -f kubectl + + export CONTEXT='{ + "scope": { + "id": "test-scope-123", + "nrn": "nrn:organization=100:account=200:namespace=300:application=400", + "domain": "test.nullapps.io", + "capabilities": { + "visibility": "public", + "scaling_type": "fixed", + "fixed_instances": 2, + "protocol": "http" + } + }, + "namespace": {"slug": "test-namespace"}, + "application": {"slug": "test-app"}, + "deployment": {"id": "deploy-123", "status": "creating"}, + "providers": { + "cloud-providers": {"account": {"region": "us-east-1"}}, + "container-orchestration": { + "cluster": {"namespace": "default-namespace"}, + "gateway": {"public_name": "gw-pub", "private_name": "gw-priv"}, + "balancer": {"public_name": "alb-pub", "private_name": "alb-priv"} + } + } + }' +} + +# Patches CONTEXT.scope.capabilities.additional_ports with the given JSON +# fragment (raw jq value, e.g. '[{"port":8081,"type":"HTTP"}]' or 'null'). 
+set_additional_ports() { + CONTEXT=$(echo "$CONTEXT" | jq --argjson v "$1" '.scope.capabilities.additional_ports = $v') +} + +@test "traffic_manager_port: derived as port + 10000 for every additional_ports entry" { + setup_full_build_context + set_additional_ports '[{"port":8081,"type":"HTTP"},{"port":9014,"type":"GRPC"}]' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports[0].traffic_manager_port')" "18081" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports[1].traffic_manager_port')" "19014" +} + +@test "traffic_manager_port: preserves original port and type fields" { + setup_full_build_context + set_additional_ports '[{"port":8081,"type":"HTTP"}]' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports[0].port')" "8081" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports[0].type')" "HTTP" +} + +@test "traffic_manager_port: emitted as JSON number (not string) for Go template consumption" { + setup_full_build_context + set_additional_ports '[{"port":8081,"type":"HTTP"}]' + + source "$SCRIPT" + + local jq_type + jq_type=$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports[0].traffic_manager_port | type') + assert_equal "$jq_type" "number" +} + +@test "traffic_manager_port: noop when additional_ports is absent" { + setup_full_build_context + CONTEXT=$(echo "$CONTEXT" | jq 'del(.scope.capabilities.additional_ports)') + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports')" "null" +} + +@test "traffic_manager_port: noop when additional_ports is null" { + setup_full_build_context + set_additional_ports 'null' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.additional_ports')" "null" +} + +@test "traffic_manager_port: noop when additional_ports is empty array" { + setup_full_build_context + set_additional_ports '[]' + + 
source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -c '.scope.capabilities.additional_ports')" "[]" +} + +# ============================================================================= +# Capability limits normalization +# These tests source the real deployment/build_context and assert on the +# resulting CONTEXT, exercising the full pipeline. Limits default to their +# corresponding request value when missing or explicitly null; explicit values +# pass through. +# ============================================================================= + +# Patches CONTEXT.scope.capabilities with the given JSON fragment (merged into +# the existing capabilities object). +set_capabilities() { + CONTEXT=$(echo "$CONTEXT" | jq --argjson v "$1" '.scope.capabilities = (.scope.capabilities + $v)') +} + +@test "capability limits: cpu limit defaults to cpu_millicores when absent" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"ram_memory":1024,"ram_memory_limit":2048}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" +} + +@test "capability limits: ram limit defaults to ram_memory when absent" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":1000,"ram_memory":1024}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "capability limits: both limits default to their requests when both absent" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"ram_memory":1024}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "capability limits: explicit null limits fall back to their requests" { + setup_full_build_context + set_capabilities 
'{"cpu_millicores":500,"cpu_millicores_limit":null,"ram_memory":1024,"ram_memory_limit":null}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "capability limits: explicit non-null limits pass through unchanged" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":2000,"ram_memory":1024,"ram_memory_limit":4096}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "2000" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "4096" +} + +@test "capability limits: cpu limit below request is clamped up to request" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":100,"ram_memory":1024,"ram_memory_limit":2048}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "2048" +} + +@test "capability limits: ram limit below request is clamped up to request" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":1000,"ram_memory":1024,"ram_memory_limit":64}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "1000" + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.ram_memory_limit')" "1024" +} + +@test "capability limits: both limits below their requests are clamped up" { + setup_full_build_context + set_capabilities '{"cpu_millicores":500,"cpu_millicores_limit":100,"ram_memory":1024,"ram_memory_limit":64}' + + source "$SCRIPT" + + assert_equal "$(echo "$CONTEXT" | jq -r '.scope.capabilities.cpu_millicores_limit')" "500" + assert_equal "$(echo "$CONTEXT" | jq -r 
'.scope.capabilities.ram_memory_limit')" "1024" +} diff --git a/k8s/deployment/tests/build_deployment.bats b/k8s/deployment/tests/build_deployment.bats index f010afce..41adfe2a 100644 --- a/k8s/deployment/tests/build_deployment.bats +++ b/k8s/deployment/tests/build_deployment.bats @@ -18,6 +18,7 @@ setup() { # Template paths export DEPLOYMENT_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/deployment.yaml.tpl" export SECRET_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/secret.yaml.tpl" + export SECRET_FILES_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/secret-files.yaml.tpl" export SCALING_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/scaling.yaml.tpl" export SERVICE_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/service.yaml.tpl" export PDB_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/pdb.yaml.tpl" @@ -143,6 +144,13 @@ teardown() { assert_file_exists "$OUTPUT_DIR/secret-scope-123-deploy-456.yaml" } +@test "build_deployment: creates secret-files file with correct name" { + run bash "$BATS_TEST_DIRNAME/../build_deployment" + + [ "$status" -eq 0 ] + assert_file_exists "$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" +} + @test "build_deployment: creates scaling file with correct name" { run bash "$BATS_TEST_DIRNAME/../build_deployment" @@ -170,3 +178,136 @@ teardown() { [ "$status" -eq 0 ] [ ! -f "$OUTPUT_DIR/context-scope-123.json" ] } + +# ============================================================================= +# Rendering Tests — real gomplate, assert on rendered output +# ============================================================================= +# These tests run the actual `gomplate` binary against the templates and +# verify the rendered Secret + Deployment YAML have the right shape. 
+# +# Regression guard for the file-type parameter bug: binary file content used +# to be stored under Secret key `app-data-{basename}` in the env-var Secret, +# which then leaked into the container env block via `envFrom`, which runc +# rejects with `invalid environment variable ... contains nul byte`. The fix +# splits the storage into two Secrets: +# - s-{scope_id}-d-{deployment_id} env-only, consumed via envFrom (safe) +# - s-{scope_id}-d-{deployment_id}-files binary-only, consumed only by the volume mount +# Plus a plain `env:` entry on the application container that carries the +# file's destination path under name `app-data-{key}`. + +# Minimal context that satisfies all five templates' required fields. +# Includes both an `environment` and a `file` parameter so we can assert on +# the file-specific keys without ignoring the rest of the Secret content. +_render_context() { + cat <<'JSON' +{ + "account": {"id": "acc1", "slug": "acct"}, + "namespace": {"id": "ns1", "slug": "nsps"}, + "application": {"id": "app1", "slug": "appslug"}, + "release": {"semver": "1.0.0"}, + "scope": { + "id": "scope-123", + "slug": "scopeslug", + "domain": "x.example.com", + "dimensions": {"env": "dev"}, + "capabilities": { + "cpu_millicores": 100, + "ram_memory": 128, + "additional_ports": [], + "scaling_type": "fixed", + "autoscaling": { + "min_replicas": 1, + "max_replicas": 3, + "target_cpu_utilization": 80, + "target_memory_enabled": false, + "target_memory_utilization": 80 + }, + "health_check": {"path": "/health", "timeout_seconds": 1, "period_seconds": 5, "initial_delay_seconds": 5} + } + }, + "deployment": {"id": "deploy-456"}, + "k8s_namespace": "ns-test", + "k8s_modifiers": {}, + "asset": {"url": "example.com/app:latest"}, + "main_http_port": 8080, + "traffic_image": "example.com/traffic:latest", + "container_cpu_in_millicores": 50, + "container_memory_in_memory": 64, + "pull_secrets": {"ENABLED": false, "SECRETS": []}, + "region": "us-east-1", + "component": "app", + "service_account_name": "", + "traffic_manager_config_map": "", +
"pdb_enabled": "false", + "pdb_max_unavailable": "25%", + "parameters": { + "results": [ + {"type": "environment", "variable": "MY_VAR", "values": [{"value": "hello"}]}, + {"type": "file", "name": "API P12 Cert!", "destination_path": "/app-data/[2026-05-27] cert.p12", "values": [{"value": "data:application/x-pkcs12;base64,QUFBQkJC"}]} + ] + } +} +JSON +} + +@test "build_deployment: file-type parameter splits binary into a separate Secret" { + unset -f gomplate # use the real gomplate binary, not the setup mock + + export CONTEXT="$(_render_context)" + + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + + local secret_file="$OUTPUT_DIR/secret-scope-123-deploy-456.yaml" + local secret_files_file="$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" + local deploy_file="$OUTPUT_DIR/deployment-scope-123-deploy-456.yaml" + + assert_file_exists "$secret_file" + assert_file_exists "$secret_files_file" + assert_file_exists "$deploy_file" + + # The env-var Secret MUST NOT contain anything that pulls in binary content + # via envFrom. Both app-data-* and app-file-* keys are forbidden here. + ! grep -E 'app-(data|file)-' "$secret_file" + + # Param name "API P12 Cert!" sanitizes to api-p12-cert (lowercase, runs of + # non-alphanumeric collapse to '-', leading/trailing '-' trimmed). The same + # token is reused as env name suffix, Secret data key, and volume name. + assert_contains "$(cat "$secret_files_file")" "name: s-scope-123-d-deploy-456-files" + assert_contains "$(cat "$secret_files_file")" "app-file-api-p12-cert: QUFBQkJC" + ! grep -E 'app-data-' "$secret_files_file" + + # The deployment exposes the destination path to the app via a plain `env:` + # entry on the application container (not via any Secret) — no NUL bytes, + # and the env var name is derived from the parameter's display name. 
+ assert_contains "$(cat "$deploy_file")" "- name: app-data-api-p12-cert" + # The path starts with `[`, which YAML parses as a flow sequence unless the + # value is quoted. mountPath, subPath, path, and the env value must all be + # quoted; otherwise the deployment agent fails with `did not find expected key`. + assert_contains "$(cat "$deploy_file")" 'value: "/app-data/[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'mountPath: "/app-data/[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'subPath: "[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'path: "[2026-05-27] cert.p12"' + + # The volume mount reads bytes from the files Secret, with key matching the + # one produced by secret-files.yaml.tpl. + assert_contains "$(cat "$deploy_file")" "secretName: s-scope-123-d-deploy-456-files" + assert_contains "$(cat "$deploy_file")" "key: app-file-api-p12-cert" +} + +@test "build_deployment: secret-files renders empty when no file params" { + unset -f gomplate + + # Same context as _render_context but with the file-type param removed. + export CONTEXT="$(_render_context | jq '.parameters.results |= map(select(.type != "file"))')" + + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + + # gomplate skips writing the output file when the template renders empty, + # which is the signal to apply_templates (which iterates the OUTPUT_DIR and + # skips zero-byte/missing files) to not create an empty files-Secret in the + # cluster. + local secret_files_file="$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" + [ ! -f "$secret_files_file" ] || [ ! 
-s "$secret_files_file" ] +} diff --git a/k8s/deployment/tests/ingress_template_shape.bats b/k8s/deployment/tests/ingress_template_shape.bats new file mode 100644 index 00000000..9f4ea7ed --- /dev/null +++ b/k8s/deployment/tests/ingress_template_shape.bats @@ -0,0 +1,77 @@ +#!/usr/bin/env bats +# ============================================================================= +# Structural tests for the ingress templates. +# Verifies the listen-ports annotation shape per port type without rendering +# templates. Catches regressions like accidentally restoring a hardcoded +# [{"HTTP":80},{"HTTPS":443}] for HTTP additional ports (which would re-shadow +# the main ingress on the same listener). +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + export INITIAL="$PROJECT_ROOT/k8s/deployment/templates/initial-ingress.yaml.tpl" + export BLUE_GREEN="$PROJECT_ROOT/k8s/deployment/templates/blue-green-ingress.yaml.tpl" +} + +# ----------------------------------------------------------------------------- +# Main ingress (the top-level ingress, NOT inside additional_ports loop) +# ----------------------------------------------------------------------------- + +@test "initial-ingress: main ingress listens on HTTP:80 + HTTPS:443" { + # First listen-ports occurrence in the file is the main ingress. + first_listen=$(grep -m 1 "listen-ports" "$INITIAL") + [[ "$first_listen" == *'[{"HTTP":80},{"HTTPS":443}]'* ]] +} + +@test "blue-green-ingress: main ingress listens on HTTP:80 + HTTPS:443 with ssl-redirect" { + first_listen=$(grep -m 1 "listen-ports" "$BLUE_GREEN") + [[ "$first_listen" == *'[{"HTTP":80},{"HTTPS":443}]'* ]] + # ssl-redirect is on the main ingress (only HTTP+HTTPS listeners use it). 
+ grep -q 'ssl-redirect: "443"' "$BLUE_GREEN" +} + +# ----------------------------------------------------------------------------- +# Additional ports — both HTTP and GRPC use HTTPS on their own port +# (CLIEN-739: HTTP additional ports moved from sharing listener 443 to +# opening their own HTTPS listener at .port, matching the GRPC pattern.) +# ----------------------------------------------------------------------------- + +@test "initial-ingress: HTTP additional port branch uses per-port HTTPS listener" { + # Inside the additional_ports loop, the HTTP branch must use [{"HTTPS":{{ .port }}}]. + # The string '[{"HTTPS":{{ .port }}}]' must appear in the file. The string + # '"HTTP":80' must NOT appear inside the additional_ports range — only on + # the main ingress (which is outside the range). + grep -F '[{"HTTPS":{{ .port }}}]' "$INITIAL" | head -1 >/dev/null + # Sanity: there should be exactly two occurrences of [{"HTTPS":{{ .port }}}] + # (one for HTTP branch, one for GRPC branch). + count=$(grep -cF '[{"HTTPS":{{ .port }}}]' "$INITIAL") + [ "$count" -eq 2 ] + # Sanity: there should be exactly one occurrence of [{"HTTP":80},{"HTTPS":443}] + # (the main ingress only — additional ports must not use it). + shared_count=$(grep -cF '[{"HTTP":80},{"HTTPS":443}]' "$INITIAL") + [ "$shared_count" -eq 1 ] +} + +@test "initial-ingress: GRPC additional port uses backend-protocol-version GRPC" { + grep -q 'backend-protocol-version: GRPC' "$INITIAL" +} + +@test "blue-green-ingress: HTTP additional port branch uses per-port HTTPS listener" { + count=$(grep -cF '[{"HTTPS":{{ .port }}}]' "$BLUE_GREEN") + [ "$count" -eq 2 ] + shared_count=$(grep -cF '[{"HTTP":80},{"HTTPS":443}]' "$BLUE_GREEN") + [ "$shared_count" -eq 1 ] +} + +@test "blue-green-ingress: ssl-redirect only present on main ingress (one occurrence)" { + # ssl-redirect: "443" only makes sense when the listener has both HTTP and HTTPS, + # which is the main ingress. 
Additional HTTP ports use HTTPS-only listeners, + # so they must not carry ssl-redirect. + count=$(grep -cF 'ssl-redirect: "443"' "$BLUE_GREEN") + [ "$count" -eq 1 ] +} + +@test "blue-green-ingress: GRPC additional port uses backend-protocol-version GRPC" { + grep -q 'backend-protocol-version: GRPC' "$BLUE_GREEN" +} diff --git a/k8s/deployment/tests/print_failed_deployment_hints.bats b/k8s/deployment/tests/print_failed_deployment_hints.bats index 14587515..aae55005 100644 --- a/k8s/deployment/tests/print_failed_deployment_hints.bats +++ b/k8s/deployment/tests/print_failed_deployment_hints.bats @@ -25,12 +25,24 @@ setup() { teardown() { unset CONTEXT + unset K8S_NAMESPACE DEPLOYMENT_ID ALL_EVENTS desired ready current + unset -f kubectl 2>/dev/null || true +} + +assert_not_contains() { + local haystack="$1" + local needle="$2" + if [[ "$haystack" == *"$needle"* ]]; then + echo "Expected output to NOT contain: '$needle'" + echo "Actual: '$haystack'" + return 1 + fi } # ============================================================================= -# Hints Display Test +# Generic Hints (no diagnostic context available) # ============================================================================= -@test "print_failed_deployment_hints: displays complete troubleshooting hints" { +@test "print_failed_deployment_hints: displays generic hints when no diagnostic context available" { run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" [ "$status" -eq 0 ] @@ -49,3 +61,430 @@ teardown() { assert_contains "$output" "my-app" assert_contains "$output" "production" } + +# ============================================================================= +# Pod-derived Diagnostics +# ============================================================================= +@test "print_failed_deployment_hints: identifies OOMKilled and skips generic hints" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo 
'{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"running":{}},"lastState":{"terminated":{"reason":"OOMKilled","exitCode":137,"message":"out of memory"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The container exceeded its memory limit (512Mi)" + assert_contains "$output" "📋 Detected: OOMKilled on container app (exit 137)" + assert_contains "$output" "📋 Details: out of memory" + assert_contains "$output" "💡 Suggested fix: Increase ram_memory for scope 'my-app'" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies ImagePullBackOff from waiting state without exit code" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"web","state":{"waiting":{"reason":"ImagePullBackOff","message":"manifest unknown"}},"lastState":{}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The container image could not be pulled."
+ assert_contains "$output" "📋 Detected: ImagePullBackOff on container web" + assert_not_contains "$output" "exit " + assert_contains "$output" "📋 Details: manifest unknown" + assert_contains "$output" "💡 Suggested fix: Verify the image name, tag, and registry credentials" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies CrashLoopBackOff and skips generic hints" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"worker","state":{"waiting":{"reason":"CrashLoopBackOff","message":"back-off 5m0s restarting failed container"}},"lastState":{"terminated":{"exitCode":1}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The container started and crashed repeatedly." + assert_contains "$output" "📋 Detected: CrashLoopBackOff on container worker" + assert_contains "$output" "💡 Suggested fix: Review application logs for startup errors" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies CreateContainerConfigError and points to secrets/configmaps" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"CreateContainerConfigError","message":"secret \"db-creds\" not found"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The container configuration is invalid."
+ assert_contains "$output" "💡 Suggested fix: Check for missing secrets or configmaps" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies RunContainerError as entrypoint failure" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"waiting":{"reason":"RunContainerError"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The container failed to run its entrypoint." + assert_contains "$output" "💡 Suggested fix: Verify the start command" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies ContainerCannotRun as missing binary" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"running":{}},"lastState":{"terminated":{"reason":"ContainerCannotRun","exitCode":127,"message":"exec: \"/app\": no such file"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: The startup binary is missing or not executable" + assert_contains "$output" "📋 Detected: ContainerCannotRun on container app (exit 127)" + assert_contains "$output" "💡 Suggested fix: Rebuild the image" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies FailedMount from ALL_EVENTS" { + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"FailedMount","message":"MountVolume.SetUp failed"}]}' + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋
Reason: A volume could not be mounted onto the pod." + assert_contains "$output" "💡 Suggested fix: Check that the referenced PVC, secret, or configmap exists" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies FailedCreatePodSandBox from ALL_EVENTS" { + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"FailedCreatePodSandBox","message":"failed to create pod sandbox"}]}' + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: Kubernetes could not create the pod sandbox." + assert_contains "$output" "💡 Suggested fix: Check node health, CNI configuration" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: identifies Unhealthy and references the configured health check path" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "did not pass its health check at /health" + assert_contains "$output" "💡 Suggested fix: Ensure the app listens on port 8080 and returns 2xx on /health" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: enriches Unhealthy with connection-refused detail and targeted fix" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:13:42Z","message":"Startup probe failed: Get \"http://10.0.0.1:8080/health\": dial tcp 10.0.0.1:8080: connect: connection refused"}]}' + + kubectl() { + case "$*" in + "get pods"*) + echo
'{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + # HUMAN_MESSAGE retains the base sentence and appends the translated probe failure + assert_contains "$output" "did not pass its health check at /health" + assert_contains "$output" "Detected: Startup probe" + assert_contains "$output" "not yet listening" + # SUGGESTED_FIX is targeted: tells the user the app is not binding the port + assert_contains "$output" "not listening on port 8080" + # Generic fallback fix must NOT appear + assert_not_contains "$output" "returns 2xx on /health within the readiness window" +} + +@test "print_failed_deployment_hints: enriches Unhealthy with HTTP statuscode detail and targeted fix" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:13:42Z","message":"Startup probe failed: HTTP probe failed with statuscode: 502"}]}' + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "Detected: Startup probe" + assert_contains "$output" "HTTP 502" + # SUGGESTED_FIX cites the status code and points to app logs + assert_contains "$output" "responded with HTTP 502" + assert_contains "$output" "inspect application logs" +} + +@test "print_failed_deployment_hints: enriches Unhealthy with timeout detail and targeted fix" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:13:42Z","message":"Startup probe failed: Get \"http://10.0.0.1:8080/health\": context deadline 
exceeded (Client.Timeout exceeded while awaiting headers)"}]}' + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "Detected: Startup probe" + assert_contains "$output" "timed out" + # SUGGESTED_FIX mentions timing knobs + assert_contains "$output" "initialDelaySeconds" +} + +@test "print_failed_deployment_hints: falls back to raw Unhealthy message when translation is impossible" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + # Message does not match any known probe pattern → translate_probe_message returns non-zero. + # The raw text must still be surfaced in the hint instead of being silently dropped. + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:13:42Z","message":"completely unknown probe failure format from a future K8s"}]}' + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + # Raw message appears verbatim in the reason line + assert_contains "$output" "completely unknown probe failure format from a future K8s" + # Base sentence is still there + assert_contains "$output" "did not pass its health check at /health" +} + +@test "print_failed_deployment_hints: Unhealthy picks the latest event when multiple are present" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + # Two Warnings: an older 502 and a newer connection-refused. The fix must reflect the newer one. 
+ export ALL_EVENTS='{"items":[ + {"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:10:00Z","message":"Startup probe failed: HTTP probe failed with statuscode: 502"}, + {"type":"Warning","reason":"Unhealthy","lastTimestamp":"2026-05-20T13:13:42Z","message":"Startup probe failed: Get \"http://10.0.0.1:8080/health\": dial tcp: connect: connection refused"} + ]}' + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + # Latest event wins → connection-refused remediation, not the older HTTP 502 one + assert_contains "$output" "not listening on port 8080" + assert_not_contains "$output" "responded with HTTP 502" +} + +# ============================================================================= +# CONTEXT fallback handling +# ============================================================================= +@test "print_failed_deployment_hints: OOMKilled without ram_memory does not leave dangling (Mi)" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + # CONTEXT present but no ram_memory capability — plausible if the scope did not define memory. + export CONTEXT='{"scope":{"name":"my-app","dimensions":"prod","capabilities":{"health_check":{"path":"/health"}}}}' + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","lastState":{"terminated":{"reason":"OOMKilled","exitCode":137}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "exceeded its memory limit" + # The (Mi) parenthetical must not appear empty when ram_memory is missing. 
+ assert_not_contains "$output" "(Mi)" +} + +@test "print_failed_deployment_hints: applies CONTEXT defaults gracefully when CONTEXT is unset" { + # Drop the bats-provided CONTEXT so we exercise the ${CONTEXT:-{}} fallback. + unset CONTEXT + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"api","state":{"waiting":{"reason":"Unhealthy"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + # health_check_path default "/" must apply when CONTEXT is unset. + assert_contains "$output" "health check at /." + assert_contains "$output" "returns 2xx on /" + # Guard against the previous escape bug: a literal backslash in the message + # would indicate jq received {\} instead of {} and silently failed. + assert_not_contains "$output" "{\\" +} + +# ============================================================================= +# Unknown Reason → falls through to generic checklist +# ============================================================================= +@test "print_failed_deployment_hints: unknown reason still prints generic hints alongside specific reason" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"waiting":{"reason":"WeirdNewError"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: Pods are failing with reason: WeirdNewError" + assert_contains "$output" "📋 Detected: WeirdNewError on container app" + # No suggested fix → fall through to generic checklist. 
+ assert_not_contains "$output" "💡 Suggested fix:" + assert_contains "$output" "⚠️ Application Startup Issue Detected" + assert_contains "$output" "🔧 How to fix:" +} + +# ============================================================================= +# Event-derived Diagnostics (no pods to inspect) +# ============================================================================= +@test "print_failed_deployment_hints: derives FailedScheduling from ALL_EVENTS when pods unavailable" { + export ALL_EVENTS='{"items":[{"type":"Warning","reason":"FailedScheduling"},{"type":"Warning","reason":"FailedScheduling"}]}' + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Reason: No node has enough resources" + assert_contains "$output" "📋 Detected: FailedScheduling" + assert_contains "$output" "💡 Suggested fix: Reduce requested resources" + assert_not_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "print_failed_deployment_hints: shows top warning event reasons summary" { + export ALL_EVENTS='{"items":[ + {"type":"Warning","reason":"BackOff"}, + {"type":"Warning","reason":"BackOff"}, + {"type":"Warning","reason":"BackOff"}, + {"type":"Warning","reason":"FailedMount"}, + {"type":"Warning","reason":"FailedMount"}, + {"type":"Warning","reason":"Unhealthy"}, + {"type":"Normal","reason":"Pulled"} + ]}' + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📋 Recent warnings:" + assert_contains "$output" "BackOff (×3)" + assert_contains "$output" "FailedMount (×2)" + assert_contains "$output" "Unhealthy (×1)" + # Normal events should not be summarized + assert_not_contains "$output" "Pulled (×" +} + +# ============================================================================= +# Replica progress reporting +# ============================================================================= +@test
"print_failed_deployment_hints: includes replica progress when desired/ready/current are set" { + export K8S_NAMESPACE="ns" DEPLOYMENT_ID="d1" + export desired=3 ready=1 current=2 + + kubectl() { + case "$*" in + "get pods"*) + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + + [ "$status" -eq 0 ] + assert_contains "$output" "📊 Progress at failure: 1/3 ready, 2/3 available" +} diff --git a/k8s/deployment/tests/translate_probe_message.bats b/k8s/deployment/tests/translate_probe_message.bats new file mode 100644 index 00000000..2ff9de51 --- /dev/null +++ b/k8s/deployment/tests/translate_probe_message.bats @@ -0,0 +1,142 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for deployment/translate_probe_message - K8s probe message parser +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + # Load helpers only (skip the diagnostic main inside the hints script) + PRINT_HINTS_LIB_ONLY=true source "$BATS_TEST_DIRNAME/../print_failed_deployment_hints" + unset PRINT_HINTS_LIB_ONLY +} + +# ----------------------------------------------------------------------------- +# Connection refused +# ----------------------------------------------------------------------------- +@test "translate_probe_message: startup probe connection refused with path" { + run translate_probe_message 'Startup probe failed: Get "http://10.15.28.102:8080/health": dial tcp 10.15.28.102:8080: connect: connection refused' + + [ "$status" -eq 0 ] + assert_contains "$output" "Startup probe" + assert_contains "$output" "not yet listening" + assert_contains "$output" "/health" +} + +@test "translate_probe_message: liveness probe connection refused" { + run translate_probe_message 'Liveness probe failed: Get "http://10.0.0.5:8080/ping": dial tcp: connect: connection refused' + + [ "$status" -eq 0 ] + assert_contains "$output" "Liveness probe" + assert_contains "$output" "not yet listening" + assert_contains "$output" "/ping" +} + +# ----------------------------------------------------------------------------- +# HTTP status codes +# ----------------------------------------------------------------------------- +@test "translate_probe_message: startup probe HTTP 502" { + run translate_probe_message 'Startup probe failed: HTTP probe failed with statuscode: 502' + + [ "$status" -eq 0 ] + assert_contains "$output" "Startup probe" + assert_contains "$output" "HTTP 502" +} + +@test "translate_probe_message: readiness probe HTTP 404" { + run translate_probe_message 'Readiness probe failed: HTTP probe failed with statuscode: 404' + + [ "$status" -eq 0 ] + assert_contains "$output" "Readiness probe" + assert_contains "$output" "HTTP 404" +} + +# ----------------------------------------------------------------------------- +# Timeout +# 
----------------------------------------------------------------------------- +@test "translate_probe_message: startup probe timeout" { + run translate_probe_message 'Startup probe failed: Get "http://10.0.0.5:8080/health": context deadline exceeded (Client.Timeout exceeded while awaiting headers)' + + [ "$status" -eq 0 ] + assert_contains "$output" "Startup probe" + assert_contains "$output" "timed out" + assert_contains "$output" "/health" +} + +# ----------------------------------------------------------------------------- +# Non-probe messages +# ----------------------------------------------------------------------------- +@test "translate_probe_message: returns non-zero for non-probe messages" { + run translate_probe_message 'Failed to pull image "nginx:latest"' + + [ "$status" -ne 0 ] + [ -z "$output" ] +} + +@test "translate_probe_message: returns non-zero for empty input" { + run translate_probe_message '' + + [ "$status" -ne 0 ] +} + +# ----------------------------------------------------------------------------- +# Fallback for unknown probe failure shapes +# ----------------------------------------------------------------------------- +@test "translate_probe_message: generic fallback when probe failure mode is unrecognized" { + run translate_probe_message 'Startup probe failed: some weird new error format' + + [ "$status" -eq 0 ] + assert_contains "$output" "Startup probe" +} + +# ----------------------------------------------------------------------------- +# parse_probe_message — structured output for consolidation +# ----------------------------------------------------------------------------- +@test "parse_probe_message: emits pipe-separated kind, path, mode for connection refused" { + run parse_probe_message 'Startup probe failed: Get "http://10.0.0.1:8080/health": dial tcp: connect: connection refused' + + [ "$status" -eq 0 ] + [ "$output" = "Startup|/health|not yet listening" ] +} + +@test "parse_probe_message: emits 'responded HTTP ' mode with 
empty path field preserved" { + run parse_probe_message 'Startup probe failed: HTTP probe failed with statuscode: 502' + + [ "$status" -eq 0 ] + # Empty path between two pipes must be preserved so callers can read 3 fields. + # Mode reads as a verb so it composes inline with other modes in one sentence. + [ "$output" = "Startup||responded HTTP 502 (expected 2xx)" ] +} + +@test "parse_probe_message: returns non-zero for non-probe input" { + run parse_probe_message 'Failed to pull image' + [ "$status" -ne 0 ] +} + +# ----------------------------------------------------------------------------- +# short_pod_name — strip K8S_DEPLOYMENT_NAME prefix +# ----------------------------------------------------------------------------- +@test "short_pod_name: strips deployment prefix and marks truncation with '...'" { + K8S_DEPLOYMENT_NAME="d-326230662-1916903584" + run short_pod_name "d-326230662-1916903584-8578df9b4c-hhshq" + + [ "$status" -eq 0 ] + # Leading '...' tells the operator the name was shortened + [ "$output" = "...8578df9b4c-hhshq" ] +} + +@test "short_pod_name: returns full name when prefix env is unset" { + unset K8S_DEPLOYMENT_NAME + run short_pod_name "some-pod-name-abc" + + [ "$status" -eq 0 ] + [ "$output" = "some-pod-name-abc" ] +} + +@test "short_pod_name: returns full name when pod does not match the prefix" { + K8S_DEPLOYMENT_NAME="d-1-2" + run short_pod_name "unrelated-pod-xyz" + + [ "$status" -eq 0 ] + [ "$output" = "unrelated-pod-xyz" ] +} diff --git a/k8s/deployment/tests/validate_alb_target_group_capacity.bats b/k8s/deployment/tests/validate_alb_target_group_capacity.bats index 08d1f28c..3ecd2e89 100644 --- a/k8s/deployment/tests/validate_alb_target_group_capacity.bats +++ b/k8s/deployment/tests/validate_alb_target_group_capacity.bats @@ -15,6 +15,7 @@ setup() { export ALB_NAME="k8s-nullplatform-internet-facing" export REGION="us-east-1" export ALB_MAX_TARGET_GROUPS="98" + export ALB_MAX_LISTENERS="48" export DNS_TYPE="route53" # Base CONTEXT @@ 
-22,7 +23,7 @@ setup() { "providers": {} }' - # Mock aws - default: ALB with 40 target groups + # Mock aws - default: ALB with 40 target groups and 10 listeners aws() { case "$*" in *"describe-load-balancers"*) @@ -33,6 +34,10 @@ setup() { echo "40" return 0 ;; + *"describe-listeners"*) + echo "10" + return 0 + ;; esac } export -f aws @@ -258,6 +263,10 @@ teardown() { echo "0" return 0 ;; + *"describe-listeners"*) + echo "10" + return 0 + ;; esac } export -f aws @@ -280,6 +289,10 @@ teardown() { echo "97" return 0 ;; + *"describe-listeners"*) + echo "10" + return 0 + ;; esac } export -f aws @@ -382,3 +395,220 @@ teardown() { assert_equal "$status" "0" assert_contains "$output" "🔍 Validating ALB target group capacity for 'k8s-nullplatform-internet-facing'..." } + +# ============================================================================= +# Listener capacity (CLIEN-739) +# ============================================================================= +@test "validate_alb_target_group_capacity: success message includes listener capacity" { + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "0" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 10 listeners (max: 48)" + assert_contains "$output" "✅ ALB listener capacity validated: 10/48" +} + +@test "validate_alb_target_group_capacity: fails when listener count is at capacity" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "48" + return 0 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached listener capacity: 48/48" + assert_contains "$output" "💡 Possible causes:" + assert_contains "$output" "Too many scopes with additional_ports are attached to this ALB" + 
assert_contains "$output" "🔧 How to fix:" + assert_contains "$output" "Reduce additional_ports across scopes sharing this ALB" + assert_contains "$output" "Increase ALB_MAX_LISTENERS in values.yaml or scope-configurations provider (AWS limit is 50)" + assert_contains "$output" "Request an AWS service quota increase for listeners per ALB" +} + +@test "validate_alb_target_group_capacity: fails when listener count is over capacity" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "50" + return 0 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached listener capacity: 50/48" +} + +@test "validate_alb_target_group_capacity: passes at exactly one below listener capacity" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "47" + return 0 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "0" + assert_contains "$output" "✅ ALB listener capacity validated: 47/48" +} + +@test "validate_alb_target_group_capacity: handles zero listeners" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "0" + return 0 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "0" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 0 listeners (max: 48)" + assert_contains "$output" 
"✅ ALB listener capacity validated: 0/48" +} + +@test "validate_alb_target_group_capacity: uses default ALB_MAX_LISTENERS of 48" { + unset ALB_MAX_LISTENERS + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "0" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 10 listeners (max: 48)" +} + +@test "validate_alb_target_group_capacity: ALB_MAX_LISTENERS from env var" { + export ALB_MAX_LISTENERS="5" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached listener capacity: 10/5" +} + +@test "validate_alb_target_group_capacity: ALB_MAX_LISTENERS from scope-configurations provider" { + export CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_listeners":"5"}}}}' + export ALB_MAX_LISTENERS="48" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached listener capacity: 10/5" +} + +@test "validate_alb_target_group_capacity: ALB_MAX_LISTENERS from container-orchestration provider" { + export CONTEXT='{"providers":{"container-orchestration":{"balancer":{"alb_max_listeners":"5"}}}}' + export ALB_MAX_LISTENERS="48" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached listener capacity: 10/5" +} + +@test "validate_alb_target_group_capacity: fails when describe-listeners fails" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "Access Denied" >&2 + return 1 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ Failed to describe listeners for ALB 'k8s-nullplatform-internet-facing'" 
+ assert_contains "$output" "Check IAM permissions for elbv2:DescribeListeners" +} + +@test "validate_alb_target_group_capacity: fails when listener count is non-numeric" { + aws() { + case "$*" in + *"describe-load-balancers"*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *"describe-target-groups"*) + echo "40" + return 0 + ;; + *"describe-listeners"*) + echo "WARNING: unexpected" + return 0 + ;; + esac + } + export -f aws + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ Unexpected non-numeric listener count from ALB" + assert_contains "$output" "📋 Received value: WARNING: unexpected" +} + +@test "validate_alb_target_group_capacity: fails when ALB_MAX_LISTENERS is non-numeric" { + export ALB_MAX_LISTENERS="abc" + + run bash -c 'source "$SCRIPT"' + + assert_equal "$status" "1" + assert_contains "$output" "❌ ALB_MAX_LISTENERS must be a numeric value, got: 'abc'" +} diff --git a/k8s/deployment/tests/verify_ingress_reconciliation.bats b/k8s/deployment/tests/verify_ingress_reconciliation.bats index 1e216f96..7371dda6 100644 --- a/k8s/deployment/tests/verify_ingress_reconciliation.bats +++ b/k8s/deployment/tests/verify_ingress_reconciliation.bats @@ -244,7 +244,7 @@ teardown() { assert_contains "$output" "📋 ALB validation enabled: k8s-test-alb for domain app.example.com" assert_contains "$output" "📝 Checking domain: app.example.com" assert_contains "$output" "✅ Found rule for domain: app.example.com" - assert_contains "$output" "❌ Weights mismatch on listener port 443: expected=50/50 actual=20/80" + assert_contains "$output" "❌ Weights mismatch on listener port 443: expected=50 actual=20/80" } @test "verify_ingress_reconciliation: skips weight check on additional port listener when blue has no service" { @@ -291,6 +291,82 @@ teardown() { assert_contains "$output" "✅ ALB configuration validated successfully" } +@test "verify_ingress_reconciliation: passes when 
multiple rules on same listener share expected weights (CLIEN-739)" { + # Scenario: scope has main + additional HTTP port ingresses sharing the ALB listener. + # Both rules match the same host-header and each carries blue/green target groups with + # the same blue-green split (90/10). The pre-dedupe extractor returned "10/10/90/90" + # and falsely failed against expected "10/90". Dedupe makes the comparison correct. + local ctx='{"scope":{"slug":"my-app","domain":"app.example.com","current_active_deployment":"deploy-old"},"alb_name":"k8s-test-alb","deployment":{"strategy":"blue_green","strategy_data":{"desired_switched_traffic":10}}}' + + run bash -c " + kubectl() { + echo '{\"metadata\": {\"resourceVersion\": \"12345\"}}' + return 0 + } + aws() { + case \"\$2\" in + describe-load-balancers) + echo 'arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/test-alb/abc123' + ;; + describe-listeners) + echo '{\"Listeners\":[{\"ListenerArn\":\"arn:aws:listener/443\",\"Port\":443}]}' + ;; + describe-rules) + echo '{\"Rules\":[{\"Conditions\":[{\"Field\":\"host-header\",\"Values\":[\"app.example.com\"]}],\"Actions\":[{\"Type\":\"forward\",\"ForwardConfig\":{\"TargetGroups\":[{\"Weight\":90},{\"Weight\":10}]}}]},{\"Conditions\":[{\"Field\":\"host-header\",\"Values\":[\"app.example.com\"]}],\"Actions\":[{\"Type\":\"forward\",\"ForwardConfig\":{\"TargetGroups\":[{\"Weight\":90},{\"Weight\":10}]}}]}]}' + ;; + esac + return 0 + } + export -f kubectl aws + export K8S_NAMESPACE='$K8S_NAMESPACE' SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export MAX_WAIT_SECONDS='1' CHECK_INTERVAL='1' + export ALB_RECONCILIATION_ENABLED='true' VERIFY_WEIGHTS='true' REGION='$REGION' + export CONTEXT='$ctx' + source '$BATS_TEST_DIRNAME/../verify_ingress_reconciliation' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "✅ Weights match on listener port 443" + assert_contains "$output" "✅ ALB configuration validated successfully" +} + +@test 
"verify_ingress_reconciliation: detects mismatch when one rule diverges from expected (CLIEN-739)" { + # Scenario: main rule has correct 90/10 split, additional port rule has wrong 50/50 split. + # After dedupe the unique values become 10/50/90, which does not match expected 10/90. + # Confirms that dedupe still surfaces real misconfigurations across multiple rules. + local ctx='{"scope":{"slug":"my-app","domain":"app.example.com","current_active_deployment":"deploy-old"},"alb_name":"k8s-test-alb","deployment":{"strategy":"blue_green","strategy_data":{"desired_switched_traffic":10}}}' + + run bash -c " + kubectl() { + echo '{\"metadata\": {\"resourceVersion\": \"12345\"}}' + return 0 + } + aws() { + case \"\$2\" in + describe-load-balancers) + echo 'arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/test-alb/abc123' + ;; + describe-listeners) + echo '{\"Listeners\":[{\"ListenerArn\":\"arn:aws:listener/443\",\"Port\":443}]}' + ;; + describe-rules) + echo '{\"Rules\":[{\"Conditions\":[{\"Field\":\"host-header\",\"Values\":[\"app.example.com\"]}],\"Actions\":[{\"Type\":\"forward\",\"ForwardConfig\":{\"TargetGroups\":[{\"Weight\":90},{\"Weight\":10}]}}]},{\"Conditions\":[{\"Field\":\"host-header\",\"Values\":[\"app.example.com\"]}],\"Actions\":[{\"Type\":\"forward\",\"ForwardConfig\":{\"TargetGroups\":[{\"Weight\":50},{\"Weight\":50}]}}]}]}' + ;; + esac + return 0 + } + export -f kubectl aws + export K8S_NAMESPACE='$K8S_NAMESPACE' SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export MAX_WAIT_SECONDS='1' CHECK_INTERVAL='1' + export ALB_RECONCILIATION_ENABLED='true' VERIFY_WEIGHTS='true' REGION='$REGION' + export CONTEXT='$ctx' + source '$BATS_TEST_DIRNAME/../verify_ingress_reconciliation' + " + + [ "$status" -eq 1 ] + assert_contains "$output" "❌ Weights mismatch on listener port 443: expected=10/90 actual=10/50/90" +} + @test "verify_ingress_reconciliation: detects domain not found in ALB rules" { run bash -c " kubectl() { diff --git 
a/k8s/deployment/tests/verify_networking_reconciliation.bats b/k8s/deployment/tests/verify_networking_reconciliation.bats index 7972e07e..424a0e10 100644 --- a/k8s/deployment/tests/verify_networking_reconciliation.bats +++ b/k8s/deployment/tests/verify_networking_reconciliation.bats @@ -44,6 +44,32 @@ teardown() { assert_contains "$output" "⚠️ Skipping ALB verification (ALB access needed for blue-green traffic validation)" } +@test "verify_networking_reconciliation: verifies HTTPRoute for external_dns without managing DNS" { + export DNS_TYPE="external_dns" + export SCOPE_ID="123" + export K8S_NAMESPACE="nullplatform" + export INGRESS_VISIBILITY="public" + export MAX_WAIT_SECONDS="10" + export CHECK_INTERVAL="10" + export CONTEXT='{"scope":{"slug":"my-app","id":"123","domain":"app.example.com"}}' + + run bash -c " + kubectl() { + echo '{\"status\":{\"parents\":[{\"conditions\":[{\"type\":\"Accepted\",\"status\":\"True\",\"reason\":\"Accepted\"},{\"type\":\"ResolvedRefs\",\"status\":\"True\",\"reason\":\"ResolvedRefs\"}]}]}}' + return 0 + } + export -f kubectl + sleep() { return 0; } + export -f sleep + source '$BATS_TEST_DIRNAME/../verify_networking_reconciliation' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "🔍 Verifying networking reconciliation for DNS type: external_dns" + assert_contains "$output" "🔍 Verifying HTTPRoute reconciliation..."
+ assert_contains "$output" "✅ HTTPRoute successfully reconciled" +} + @test "verify_networking_reconciliation: skips for unsupported DNS types" { export DNS_TYPE="unknown" diff --git a/k8s/deployment/tests/wait_deployment_active.bats b/k8s/deployment/tests/wait_deployment_active.bats index 5983ec19..c1061ac2 100644 --- a/k8s/deployment/tests/wait_deployment_active.bats +++ b/k8s/deployment/tests/wait_deployment_active.bats @@ -114,6 +114,42 @@ teardown() { assert_contains "$output" "📋 Timeout: 5s (max 0 iterations)" assert_contains "$output" "❌ Timeout waiting for deployment" assert_contains "$output" "📋 Maximum iterations (0) reached" + # Timeout path must source print_failed_deployment_hints; with no pod info + # and no events, it falls through to the generic checklist. + assert_contains "$output" "⚠️ Application Startup Issue Detected" +} + +@test "wait_deployment_active: surfaces specific failure reason on timeout when pod info is available" { + export TIMEOUT=5 + + kubectl() { + case "$*" in + "get deployment d-scope-123-deploy-456 -n test-namespace -o json") + echo '{"spec":{"replicas":3},"status":{"availableReplicas":0,"updatedReplicas":0,"readyReplicas":0}}' + ;; + "get pods -n test-namespace -l deployment_id=deploy-456 -o json") + echo '{"items":[{"status":{"containerStatuses":[{"name":"app","state":{"running":{}},"lastState":{"terminated":{"reason":"OOMKilled","exitCode":137,"message":"out of memory"}}}]}}]}' + ;; + "get pods"*) + echo "" + ;; + "get events"*) + echo '{"items":[]}' + ;; + esac + } + export -f kubectl + + export CONTEXT='{"scope":{"name":"my-app","dimensions":"prod","capabilities":{"health_check":{"path":"/health"},"ram_memory":512}}}' + + run bash "$BATS_TEST_DIRNAME/../wait_deployment_active" + + [ "$status" -eq 1 ] + assert_contains "$output" "❌ Timeout waiting for deployment" + # The hint script must read pod state and surface the user-friendly reason + assert_contains "$output" "📋 Reason: The container exceeded its memory limit"
+ assert_contains "$output" "📋 Detected: OOMKilled on container app (exit 137)" + assert_contains "$output" "💡 Suggested fix: Increase ram_memory for scope 'my-app'" } # ============================================================================= @@ -159,6 +195,8 @@ teardown() { [ "$status" -eq 1 ] assert_contains "$output" "❌ Deployment is no longer running (status: failed)" + # Non-running status path must also source print_failed_deployment_hints + assert_contains "$output" "âš ī¸ Application Startup Issue Detected" } # ============================================================================= @@ -221,6 +259,7 @@ teardown() { [ "$status" -eq 1 ] assert_contains "$output" "Deployment status - Available: 3/5, Updated: 4/5, Ready: 3/5" + assert_contains "$output" "âŗ Still waiting — Ready: 3/5, Available: 3/5 (attempt 1/1, 10s elapsed)" assert_contains "$output" "❌ Timeout waiting for deployment" } @@ -345,3 +384,322 @@ teardown() { [ "$status" -eq 0 ] assert_equal "$output" "6" } + +# ============================================================================= +# Heartbeat Tests +# ============================================================================= +@test "wait_deployment_active: logs heartbeat every 10% of timeout with progress info" { + # TIMEOUT=100 -> MAX_ITERATIONS=10 -> HEARTBEAT_INTERVAL=1 (every iteration) + run bash -c " + sleep() { :; } + export -f sleep + + kubectl() { + case \"\$*\" in + \"get deployment\"*\"-o json\"*) + echo '{\"spec\":{\"replicas\":2},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods\"*) echo '' ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=100 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash 
'$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + # First heartbeat always at iteration 1 + assert_contains "$output" "âŗ Still waiting — Ready: 0/2, Available: 0/2 (attempt 1/10, 10s elapsed)" + # Mid-progress + assert_contains "$output" "(attempt 5/10, 50s elapsed)" + # Last iteration before timeout + assert_contains "$output" "(attempt 10/10, 100s elapsed)" +} + +@test "wait_deployment_active: heartbeat interval clamps to >=1 for short timeouts" { + # TIMEOUT=30 -> MAX_ITERATIONS=3 -> HEARTBEAT_INTERVAL would be 0, must clamp to 1 + run bash -c " + sleep() { :; } + export -f sleep + + kubectl() { + case \"\$*\" in + \"get deployment\"*\"-o json\"*) + echo '{\"spec\":{\"replicas\":1},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods\"*) echo '' ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=30 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash '$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + # All three iterations should emit a heartbeat (interval clamped to 1) + assert_contains "$output" "(attempt 1/3, 10s elapsed)" + assert_contains "$output" "(attempt 2/3, 20s elapsed)" + assert_contains "$output" "(attempt 3/3, 30s elapsed)" +} + +@test "wait_deployment_active: heartbeat is suppressed when deployment is ready on iteration 1" { + # Default mocks: deployment is ready immediately, so heartbeat should NOT fire. + run bash "$BATS_TEST_DIRNAME/../wait_deployment_active" + + [ "$status" -eq 0 ] + assert_contains "$output" "✅ All pods in deployment 'd-scope-123-deploy-456' are available and ready!" 
+ # No heartbeat emitted because the ready-check breaks before it + if [[ "$output" == *"Still waiting"* ]]; then + echo "Expected output to NOT contain 'Still waiting' on success path" + echo "Actual: $output" + return 1 + fi +} + +# ============================================================================= +# Unhealthy Translation Tests +# ============================================================================= +@test "wait_deployment_active: translates Unhealthy connection-refused into human line during polling" { + # Use a far-future timestamp so the event is not filtered out by the now() initialization. + run bash -c " + sleep() { :; } + export -f sleep + + kubectl() { + case \"\$*\" in + \"get deployment\"*\"-o json\"*) + echo '{\"spec\":{\"replicas\":1},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods -n test-namespace -l deployment_id=deploy-456 -o jsonpath\"*) + echo 'd-scope-123-deploy-456-abc' + ;; + \"get events\"*\"Pod\"*) + echo '{\"items\":[{\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc\"},\"reason\":\"Unhealthy\",\"message\":\"Startup probe failed: Get \\\"http://10.0.0.1:8080/health\\\": dial tcp 10.0.0.1:8080: connect: connection refused\"}]}' + ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=10 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash '$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + # Translated form must appear + assert_contains "$output" "Startup probe" + assert_contains "$output" "not yet listening" + assert_contains "$output" "/health" + # Raw connection-refused text must NOT leak through + if [[ "$output" == 
*"connection refused"* ]]; then + echo "Expected output to NOT contain raw 'connection refused' (should be translated)" + echo "Actual: $output" + return 1 + fi +} + +@test "wait_deployment_active: consolidates multiple Unhealthy events for same pod into a single line" { + # The kubelet often emits two Unhealthy events per probe round (connection refused + # + HTTP 502 from a sidecar). The polling loop must merge them into one log line + # with both failure modes combined, using the short pod name. + run bash -c " + sleep() { :; } + export -f sleep + + kubectl() { + case \"\$*\" in + \"get deployment\"*\"-o json\"*) + echo '{\"spec\":{\"replicas\":1},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods -n test-namespace -l deployment_id=deploy-456 -o jsonpath\"*) + echo 'd-scope-123-deploy-456-abc-hhshq' + ;; + \"get events\"*\"Pod\"*) + echo '{\"items\":[ + {\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc-hhshq\"},\"reason\":\"Unhealthy\",\"message\":\"Startup probe failed: Get \\\"http://10.0.0.1:8080/health\\\": dial tcp: connect: connection refused\"}, + {\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc-hhshq\"},\"reason\":\"Unhealthy\",\"message\":\"Startup probe failed: HTTP probe failed with statuscode: 502\"} + ]}' + ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=10 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash '$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + # One consolidated line with both modes joined by ', ' for natural reading + assert_contains "$output" 
"Startup probe failing on /health — not yet listening, responded HTTP 502 (expected 2xx)" + # Pod name must be the short form with '...' prefix marking truncation + assert_contains "$output" "Pod/...abc-hhshq" + # The long prefix must NOT appear in any logged event line + if [[ "$output" == *"Pod/d-scope-123-deploy-456-abc-hhshq"* ]]; then + echo "Expected output to use short pod name, not the full prefix" + echo "Actual: $output" + return 1 + fi + # And we must see only ONE consolidated line, not one per mode. + local lines + lines=$(printf '%s\n' "$output" | grep -c "Startup probe failing" || true) + if [ "$lines" -ne 1 ]; then + echo "Expected exactly 1 consolidated 'Startup probe failing' line, got $lines" + echo "Actual: $output" + return 1 + fi +} + +@test "wait_deployment_active: falls back to raw messages when parse_probe_message cannot translate" { + # Two Unhealthy events whose messages do NOT match any known probe pattern. + # Consolidation must fail and the raw text must be preserved for the operator. 
+ run bash -c " + sleep() { :; } + export -f sleep + + kubectl() { + case \"\$*\" in + \"get deployment\"*\"-o json\"*) + echo '{\"spec\":{\"replicas\":1},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods -n test-namespace -l deployment_id=deploy-456 -o jsonpath\"*) + echo 'd-scope-123-deploy-456-abc-hhshq' + ;; + \"get events\"*\"Pod\"*) + echo '{\"items\":[ + {\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc-hhshq\"},\"reason\":\"Unhealthy\",\"message\":\"some brand-new K8s probe format we cannot parse 1\"}, + {\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc-hhshq\"},\"reason\":\"Unhealthy\",\"message\":\"another unknown probe format 2\"} + ]}' + ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=10 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash '$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + # Both original messages must appear verbatim + assert_contains "$output" "some brand-new K8s probe format we cannot parse 1" + assert_contains "$output" "another unknown probe format 2" + # The consolidated header must NOT appear because parsing failed + if [[ "$output" == *"probe failing"* ]]; then + echo "Expected fallback path to NOT emit the 'probe failing' header" + echo "Actual: $output" + return 1 + fi +} + +@test "wait_deployment_active: translates Unhealthy HTTP statuscode into human line during polling" { + run bash -c " + sleep() { :; } + export -f sleep + + kubectl() { + case \"\$*\" in + \"get deployment\"*\"-o json\"*) + echo 
'{\"spec\":{\"replicas\":1},\"status\":{\"availableReplicas\":0,\"updatedReplicas\":0,\"readyReplicas\":0}}' + ;; + \"get pods -n test-namespace -l deployment_id=deploy-456 -o jsonpath\"*) + echo 'd-scope-123-deploy-456-abc' + ;; + \"get events\"*\"Pod\"*) + echo '{\"items\":[{\"lastTimestamp\":\"9999-12-31T23:59:59Z\",\"type\":\"Warning\",\"involvedObject\":{\"kind\":\"Pod\",\"name\":\"d-scope-123-deploy-456-abc\"},\"reason\":\"Unhealthy\",\"message\":\"Startup probe failed: HTTP probe failed with statuscode: 502\"}]}' + ;; + \"get events\"*) echo '{\"items\":[]}' ;; + esac + } + export -f kubectl + + np() { echo 'running'; } + export -f np + + export SERVICE_PATH='$SERVICE_PATH' K8S_NAMESPACE='$K8S_NAMESPACE' + export SCOPE_ID='$SCOPE_ID' DEPLOYMENT_ID='$DEPLOYMENT_ID' + export TIMEOUT=10 NP_API_KEY='$NP_API_KEY' SKIP_DEPLOYMENT_STATUS_CHECK='false' + bash '$BATS_TEST_DIRNAME/../wait_deployment_active' + " + + [ "$status" -eq 1 ] + assert_contains "$output" "Startup probe" + assert_contains "$output" "HTTP 502" +} + +# ============================================================================= +# Latest Timestamp Initialization +# ============================================================================= +@test "wait_deployment_active: skips K8s events older than script start time" { + # An event from 2020 must be filtered out because LATEST_TIMESTAMP is initialized + # to now() — prevents stale events from previous workflow retries leaking through. 
+ kubectl() { + case "$*" in + "get deployment"*"-o json"*) + echo '{ + "spec": {"replicas": 3}, + "status": { + "availableReplicas": 3, + "updatedReplicas": 3, + "readyReplicas": 3 + } + }' + ;; + "get pods"*) + echo "" + ;; + "get events"*"Deployment"*) + # A very old event that should be suppressed + echo '{"items":[{"effectiveTimestamp":"2020-01-01T00:00:00Z","type":"Warning","involvedObject":{"kind":"Pod","name":"d-scope-123-deploy-456-abc"},"reason":"Unhealthy","message":"old stale warning"}]}' + ;; + "get events"*) + echo '{"items":[]}' + ;; + esac + } + export -f kubectl + + run bash "$BATS_TEST_DIRNAME/../wait_deployment_active" + + [ "$status" -eq 0 ] + # The 2020 event must not appear in output + if [[ "$output" == *"old stale warning"* ]]; then + echo "Expected output to NOT contain stale 2020 warning" + echo "Actual: $output" + return 1 + fi + if [[ "$output" == *"2020-01-01T00:00:00Z"* ]]; then + echo "Expected output to NOT contain stale 2020 timestamp" + echo "Actual: $output" + return 1 + fi +} diff --git a/k8s/deployment/validate_alb_target_group_capacity b/k8s/deployment/validate_alb_target_group_capacity index 9b3fc8de..71d01d9e 100755 --- a/k8s/deployment/validate_alb_target_group_capacity +++ b/k8s/deployment/validate_alb_target_group_capacity @@ -112,3 +112,75 @@ if [[ "$TARGET_GROUP_COUNT" -ge "$ALB_MAX_TARGET_GROUPS" ]]; then fi log info "✅ ALB target group capacity validated: $TARGET_GROUP_COUNT/$ALB_MAX_TARGET_GROUPS" + +# Listener capacity validation (CLIEN-739): each additional_port HTTP/GRPC +# opens its own ALB listener, so this constraint can hit before the target +# group quota in scopes with many additional ports. +ALB_MAX_LISTENERS=$(get_config_value \ + --env ALB_MAX_LISTENERS \ + --provider '.providers["scope-configurations"].networking.alb_max_listeners' \ + --provider '.providers["container-orchestration"].balancer.alb_max_listeners' \ + --default "48" +) + +if ! 
[[ "$ALB_MAX_LISTENERS" =~ ^[0-9]+$ ]]; then + log error "❌ ALB_MAX_LISTENERS must be a numeric value, got: '$ALB_MAX_LISTENERS'" + log error "" + log error "🔧 How to fix:" + log error " â€ĸ Set a numeric value in values.yaml or scope-configurations provider" + log error "" + exit 1 +fi + +log debug "📋 ALB: $ALB_NAME | Max listeners: $ALB_MAX_LISTENERS" + +LISTENER_COUNT=$(aws elbv2 describe-listeners \ + --load-balancer-arn "$ALB_ARN" \ + --region "$REGION" \ + --query 'length(Listeners)' \ + --output text \ + --no-paginate 2>&1) || { + log error "❌ Failed to describe listeners for ALB '$ALB_NAME'" + log error "" + log error "💡 Possible causes:" + log error " The agent may lack permissions to describe listeners" + log error "" + log error "🔧 How to fix:" + log error " â€ĸ Check IAM permissions for elbv2:DescribeListeners" + log error "" + exit 1 +} + +if ! [[ "$LISTENER_COUNT" =~ ^[0-9]+$ ]]; then + log error "❌ Unexpected non-numeric listener count from ALB" + log error "📋 ALB ARN: $ALB_ARN" + log error "📋 Received value: $LISTENER_COUNT" + log error "" + log error "💡 Possible causes:" + log error " The AWS CLI returned an unexpected response format" + log error "" + log error "🔧 How to fix:" + log error " â€ĸ Verify AWS CLI version and credentials are correct" + log error " â€ĸ Run manually: aws elbv2 describe-listeners --load-balancer-arn $ALB_ARN --region $REGION --query 'length(Listeners)'" + log error "" + exit 1 +fi + +log info "📋 ALB '$ALB_NAME' has $LISTENER_COUNT listeners (max: $ALB_MAX_LISTENERS)" + +if [[ "$LISTENER_COUNT" -ge "$ALB_MAX_LISTENERS" ]]; then + log error "❌ ALB '$ALB_NAME' has reached listener capacity: $LISTENER_COUNT/$ALB_MAX_LISTENERS" + log error "" + log error "💡 Possible causes:" + log error " Too many scopes with additional_ports are attached to this ALB. Each HTTP/GRPC additional port opens its own listener." 
+ log error "" + log error "🔧 How to fix:" + log error " â€ĸ Reduce additional_ports across scopes sharing this ALB" + log error " â€ĸ Increase ALB_MAX_LISTENERS in values.yaml or scope-configurations provider (AWS limit is 50)" + log error " â€ĸ Request an AWS service quota increase for listeners per ALB" + log error " â€ĸ Consider using a separate ALB for additional scopes" + log error "" + exit 1 +fi + +log info "✅ ALB listener capacity validated: $LISTENER_COUNT/$ALB_MAX_LISTENERS" diff --git a/k8s/deployment/verify_ingress_reconciliation b/k8s/deployment/verify_ingress_reconciliation index ee9f3221..12814400 100644 --- a/k8s/deployment/verify_ingress_reconciliation +++ b/k8s/deployment/verify_ingress_reconciliation @@ -156,8 +156,14 @@ validate_alb_config() { GREEN_WEIGHT=$SWITCH_TRAFFIC BLUE_DEPLOYMENT_ID=$(echo "$CONTEXT" | jq -r '.scope.current_active_deployment // empty') + # Dedupe: when a scope has multiple ingresses on the same ALB listener + # (main + additional HTTP ports), the host-header select returns multiple + # rules and the weight extraction concatenates pairs (e.g. 10/10/90/90). + # We compare by the set of unique weights — false negatives if pairs are + # all consistent with expected; mismatches still surface (extra unique + # values appear as soon as any rule's weights diverge). if [ -n "$BLUE_DEPLOYMENT_ID" ]; then - EXPECTED_WEIGHTS=$(printf "%s\n%s" "$BLUE_WEIGHT" "$GREEN_WEIGHT" | sort -n) + EXPECTED_WEIGHTS=$(printf "%s\n%s" "$BLUE_WEIGHT" "$GREEN_WEIGHT" | sort -un) else EXPECTED_WEIGHTS="$GREEN_WEIGHT" fi @@ -167,7 +173,7 @@ validate_alb_config() { select(.Type == "forward") | .ForwardConfig.TargetGroups[]? 
| "\(.Weight // 1)" - ' 2>/dev/null | sort -n) + ' 2>/dev/null | sort -un) if [ -n "$EXPECTED_WEIGHTS" ] && [ -n "$ACTUAL_WEIGHTS" ]; then if [ "$EXPECTED_WEIGHTS" == "$ACTUAL_WEIGHTS" ]; then diff --git a/k8s/deployment/verify_networking_reconciliation b/k8s/deployment/verify_networking_reconciliation index 214c8530..506e57f5 100644 --- a/k8s/deployment/verify_networking_reconciliation +++ b/k8s/deployment/verify_networking_reconciliation @@ -7,8 +7,10 @@ case "$DNS_TYPE" in route53) source "$SERVICE_PATH/deployment/verify_ingress_reconciliation" ;; + external_dns) + source "$SERVICE_PATH/deployment/verify_http_route_reconciliation" + ;; *) log warn "âš ī¸ Ingress reconciliation not available for DNS type: $DNS_TYPE, skipping" -# source "$SERVICE_PATH/deployment/verify_http_route_reconciliation" ;; esac diff --git a/k8s/deployment/wait_deployment_active b/k8s/deployment/wait_deployment_active index c242b03f..478e4215 100755 --- a/k8s/deployment/wait_deployment_active +++ b/k8s/deployment/wait_deployment_active @@ -1,12 +1,63 @@ #!/bin/bash +# Load probe helpers without firing the diagnostic main of the hints script. +PRINT_HINTS_LIB_ONLY=true source "$SERVICE_PATH/deployment/print_failed_deployment_hints" +unset PRINT_HINTS_LIB_ONLY + +# Try to print one consolidated line for a pod's grouped Unhealthy events. +# Returns non-zero if any message cannot be parsed or if no probe kind was +# detected — callers fall back to log_unhealthy_raw to preserve the original +# text instead of silently dropping events. 
+log_unhealthy_group() { + local ts="$1" pod_name="$2" messages_concat="$3" + local kind="" path="" modes="" + local msg parsed k p m short path_suffix + + while IFS= read -r msg; do + [ -z "$msg" ] && continue + parsed=$(parse_probe_message "$msg" 2>/dev/null) || return 1 + [ -z "$parsed" ] && return 1 + IFS='|' read -r k p m <<< "$parsed" + [ -n "$k" ] && kind="$k" + [ -n "$p" ] && path="$p" + if [[ "$modes" != *"$m"* ]]; then + if [ -z "$modes" ]; then + modes="$m" + else + modes="$modes, $m" + fi + fi + done < <(printf '%s\n' "$messages_concat" | tr '\001' '\n') + + [ -z "$kind" ] && return 1 + + short=$(short_pod_name "$pod_name" 2>/dev/null) || short="$pod_name" + path_suffix="" + [ -n "$path" ] && path_suffix=" on $path" + log warn "$ts [Warning] Pod/$short ${kind} probe failing${path_suffix} — ${modes}" + return 0 +} + +# Fallback: emit one raw warning line per original message in the group. +log_unhealthy_raw() { + local ts="$1" pod_name="$2" messages_concat="$3" + local msg + while IFS= read -r msg; do + [ -z "$msg" ] && continue + log warn "$ts [Warning] Pod/$pod_name: Unhealthy - $msg" + done < <(printf '%s\n' "$messages_concat" | tr '\001' '\n') +} + MAX_ITERATIONS=$(( TIMEOUT / 10 )) K8S_DEPLOYMENT_NAME="d-$SCOPE_ID-$DEPLOYMENT_ID" iteration=0 -LATEST_TIMESTAMP="" +LATEST_TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ") SKIP_DEPLOYMENT_STATUS_CHECK="${SKIP_DEPLOYMENT_STATUS_CHECK:=false}" +HEARTBEAT_INTERVAL=$(( MAX_ITERATIONS / 10 )) +[ "$HEARTBEAT_INTERVAL" -lt 1 ] && HEARTBEAT_INTERVAL=1 + log debug "🔍 Waiting for deployment '$K8S_DEPLOYMENT_NAME' to become active..." log debug "📋 Namespace: $K8S_NAMESPACE" log debug "📋 Timeout: ${TIMEOUT}s (max $MAX_ITERATIONS iterations)" @@ -25,7 +76,7 @@ while true; do fi log debug "📡 Checking deployment status (attempt $iteration/$MAX_ITERATIONS)..." 
- D_STATUS=$(np deployment read --id $DEPLOYMENT_ID --api-key $NP_API_KEY --query .status 2>&1) || { + D_STATUS=$(np deployment read --id "$DEPLOYMENT_ID" --api-key "$NP_API_KEY" --query .status 2>&1) || { log error " ❌ Failed to read deployment status" log error "📋 NP CLI error: $D_STATUS" exit 1 @@ -39,6 +90,7 @@ while true; do if [ "$SKIP_DEPLOYMENT_STATUS_CHECK" != true ]; then if [[ $D_STATUS != "running" && $D_STATUS != "waiting_for_instances" ]]; then log error " ❌ Deployment is no longer running (status: $D_STATUS)" + source "$SERVICE_PATH/deployment/print_failed_deployment_hints" exit 1 fi fi @@ -60,6 +112,11 @@ while true; do break fi + if [ "$iteration" -eq 1 ] || [ $(( iteration % HEARTBEAT_INTERVAL )) -eq 0 ]; then + elapsed_s=$(( iteration * 10 )) + log info "âŗ Still waiting — Ready: $ready/$desired, Available: $current/$desired (attempt $iteration/$MAX_ITERATIONS, ${elapsed_s}s elapsed)" + fi + POD_SELECTOR="deployment_id=${DEPLOYMENT_ID}" POD_NAMES=$(kubectl get pods -n $K8S_NAMESPACE -l $POD_SELECTOR -o jsonpath='{.items[*].metadata.name}') # Get events for the deployment first @@ -89,26 +146,50 @@ while true; do # Find the newest timestamp in all events NEWEST_TIMESTAMP=$(echo "$PROCESSED_EVENTS" | jq -r '.items | map(.effectiveTimestamp) | max // empty') - # Process events with jq, showing only events newer than what we've seen - # Output format: TYPEmessage — so we can route Warning events to log warn - NEW_EVENTS=$(echo "$PROCESSED_EVENTS" | jq -r --arg timestamp "$LATEST_TIMESTAMP" ' + # Non-Unhealthy events: emit one line each (current behavior). 
+ OTHER_EVENTS=$(echo "$PROCESSED_EVENTS" | jq -r --arg timestamp "$LATEST_TIMESTAMP" ' .items | sort_by(.effectiveTimestamp) | .[] | select($timestamp == "" or (.effectiveTimestamp > $timestamp)) | - "\(.type)\t\(.effectiveTimestamp) [\(.type)] \(.involvedObject.kind)/\(.involvedObject.name): \(.reason) - \(.message)" + select(.reason != "Unhealthy") | + "\(.type)\t\(.effectiveTimestamp)\t\(.involvedObject.kind)\t\(.involvedObject.name)\t\(.reason)\t\((.message // "") | gsub("[\t\n]"; " "))" ') - # If we have new events, show them and update the timestamp - if [ ! -z "$NEW_EVENTS" ]; then - while IFS=$'\t' read -r event_type event_line; do + if [ -n "$OTHER_EVENTS" ]; then + while IFS=$'\t' read -r event_type ts kind name reason message; do + short=$(short_pod_name "$name") + event_line="$ts [$event_type] $kind/$short: $reason - $message" if [ "$event_type" = "Warning" ]; then log warn "$event_line" else log debug "$event_line" fi - done <<< "$NEW_EVENTS" - # Store the newest timestamp for next iteration + done <<< "$OTHER_EVENTS" + fi + + # Unhealthy events: group by pod and consolidate every failure mode for that + # pod into a single line. Messages within a group are joined by U+0001 (SOH), + # a byte that cannot appear in K8s event text. 
+ UNHEALTHY_GROUPS=$(echo "$PROCESSED_EVENTS" | jq -r --arg timestamp "$LATEST_TIMESTAMP" ' + [.items[] + | select($timestamp == "" or (.effectiveTimestamp > $timestamp)) + | select(.reason == "Unhealthy")] + | group_by(.involvedObject.name) + | .[] + | "\((max_by(.effectiveTimestamp)).effectiveTimestamp)\t\(.[0].involvedObject.name)\t\([.[].message] | map(gsub("[\t\n\u0001]"; " ")) | join("\u0001"))" + ') + + if [ -n "$UNHEALTHY_GROUPS" ]; then + while IFS=$'\t' read -r ts pod_name messages_concat; do + [ -z "$pod_name" ] && continue + log_unhealthy_group "$ts" "$pod_name" "$messages_concat" \ + || log_unhealthy_raw "$ts" "$pod_name" "$messages_concat" + done <<< "$UNHEALTHY_GROUPS" + fi + + # Advance cursor if any new events were processed in this iteration. + if [ -n "$OTHER_EVENTS" ] || [ -n "$UNHEALTHY_GROUPS" ]; then LATEST_TIMESTAMP="$NEWEST_TIMESTAMP" log debug "Updated timestamp to: $LATEST_TIMESTAMP" fi diff --git a/k8s/deployment/workflows/diagnose.yaml b/k8s/deployment/workflows/diagnose.yaml index 66223726..45d837c3 100644 --- a/k8s/deployment/workflows/diagnose.yaml +++ b/k8s/deployment/workflows/diagnose.yaml @@ -34,4 +34,5 @@ steps: folders: - "$SERVICE_PATH/diagnose/service" - "$SERVICE_PATH/diagnose/scope" - - "$SERVICE_PATH/diagnose/networking" \ No newline at end of file + - "$SERVICE_PATH/diagnose/networking" + - "$SERVICE_PATH/diagnose/logs" \ No newline at end of file diff --git a/k8s/diagnose/build_context b/k8s/diagnose/build_context index 8ec7e8dc..1459cca4 100755 --- a/k8s/diagnose/build_context +++ b/k8s/diagnose/build_context @@ -29,6 +29,16 @@ PODS_FILE="$DATA_DIR/pods.json" kubectl get pods -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o json 2>/dev/null > "$PODS_FILE" || echo '{"items":[]}' > "$PODS_FILE" export PODS_FILE +# Deployments +DEPLOYMENTS_FILE="$DATA_DIR/deployments.json" +kubectl get deployment -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o json 2>/dev/null > "$DEPLOYMENTS_FILE" || echo '{"items":[]}' > "$DEPLOYMENTS_FILE" +export 
DEPLOYMENTS_FILE + +# ReplicaSets +REPLICASETS_FILE="$DATA_DIR/replicasets.json" +kubectl get rs -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o json 2>/dev/null > "$REPLICASETS_FILE" || echo '{"items":[]}' > "$REPLICASETS_FILE" +export REPLICASETS_FILE + # Services SERVICES_FILE="$DATA_DIR/services.json" kubectl get services -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o json 2>/dev/null > "$SERVICES_FILE" || echo '{"items":[]}' > "$SERVICES_FILE" @@ -82,4 +92,79 @@ if [[ -n "$ALB_POD_NAMES" ]]; then for POD_NAME in $ALB_POD_NAMES; do kubectl logs "$POD_NAME" -n "$ALB_CONTROLLER_NAMESPACE" --tail=200 2>/dev/null > "$ALB_CONTROLLER_LOGS_DIR/${POD_NAME}.log" || echo "" > "$ALB_CONTROLLER_LOGS_DIR/${POD_NAME}.log" done -fi \ No newline at end of file +fi + +# Identify problematic pods and capture their logs + describe +# A pod is "problematic" if any of: +# - phase is not Running and not Succeeded +# - it is being deleted (deletionTimestamp present) +# - Ready condition is not True +# - any container (init or regular) has crashed, restarted, terminated, or is in a known error waiting state +PROBLEMATIC_PODS_FILE="$DATA_DIR/problematic_pods.txt" +POD_LOGS_DIR="$DATA_DIR/pod_logs" +POD_DESCRIBE_DIR="$DATA_DIR/pod_describe" +mkdir -p "$POD_LOGS_DIR" "$POD_DESCRIBE_DIR" + +POD_LOG_TAIL_LINES="${POD_LOG_TAIL_LINES:-500}" + +PROBLEMATIC_POD_NAMES=$(jq -r ' + def is_error_waiting(reason): + reason | IN("CrashLoopBackOff","ImagePullBackOff","ErrImagePull","CreateContainerError","RunContainerError","CreateContainerConfigError"); + + def container_unhealthy(c): + (c.restartCount // 0) > 0 + or (c.state.terminated // null) != null + or (c.lastState.terminated // null) != null + or (c.state.waiting // null) != null and is_error_waiting(c.state.waiting.reason // ""); + + .items[] + | select( + (.status.phase != "Running" and .status.phase != "Succeeded") + or (.metadata.deletionTimestamp // null) != null + or ((.status.conditions // []) | any(.type == "Ready" and .status != "True")) + or 
((.status.containerStatuses // []) | any(container_unhealthy(.))) + or ((.status.initContainerStatuses // []) | any(container_unhealthy(.))) + ) + | .metadata.name +' "$PODS_FILE" 2>/dev/null) + +echo "$PROBLEMATIC_POD_NAMES" > "$PROBLEMATIC_PODS_FILE" +export PROBLEMATIC_PODS_FILE +export POD_LOGS_DIR +export POD_DESCRIBE_DIR +export POD_LOG_TAIL_LINES + +if [[ -n "$PROBLEMATIC_POD_NAMES" ]]; then + for POD_NAME in $PROBLEMATIC_POD_NAMES; do + # Describe (full output, includes spec + status + events correlated) + kubectl describe pod "$POD_NAME" -n "$NAMESPACE" > "$POD_DESCRIBE_DIR/${POD_NAME}.txt" 2>/dev/null || echo "" > "$POD_DESCRIBE_DIR/${POD_NAME}.txt" + + # Capture logs for every container (init + regular), current and previous + ALL_CONTAINERS=$(jq -r --arg name "$POD_NAME" ' + .items[] + | select(.metadata.name == $name) + | ((.spec.initContainers // []) + (.spec.containers // [])) + | .[].name + ' "$PODS_FILE" 2>/dev/null) + + for CONTAINER_NAME in $ALL_CONTAINERS; do + CURRENT_LOG="$POD_LOGS_DIR/${POD_NAME}.${CONTAINER_NAME}.log" + PREVIOUS_LOG="$POD_LOGS_DIR/${POD_NAME}.${CONTAINER_NAME}.previous.log" + + # Current container logs (always kept, even if empty: "container produced no output yet" is meaningful) + kubectl logs "$POD_NAME" -n "$NAMESPACE" -c "$CONTAINER_NAME" --tail="$POD_LOG_TAIL_LINES" \ + > "$CURRENT_LOG" 2>/dev/null || echo "" > "$CURRENT_LOG" + + # Previous container logs (kubectl exits 1 when there is no previous instance — expected, ignore) + kubectl logs "$POD_NAME" -n "$NAMESPACE" -c "$CONTAINER_NAME" --tail="$POD_LOG_TAIL_LINES" --previous \ + > "$PREVIOUS_LOG" 2>/dev/null || true + if [[ ! -s "$PREVIOUS_LOG" ]]; then + rm -f "$PREVIOUS_LOG" + fi + done + done +fi + +# Always end with success: build_context is sourced, and a trailing non-zero status +# from any conditional above would propagate to the caller. 
+: \ No newline at end of file diff --git a/k8s/diagnose/logs/application_log_evidence b/k8s/diagnose/logs/application_log_evidence new file mode 100644 index 00000000..11744f83 --- /dev/null +++ b/k8s/diagnose/logs/application_log_evidence @@ -0,0 +1,114 @@ +#!/bin/bash +# Check: Application Log Evidence +# +# Purpose: surface the logs of the user-owned "application" container from +# each problematic pod so they are visible in the diagnose UI's check.logs[] +# view (the tail of the check's stdout: 20 lines by default, 200 for this check). +# +# Scope is intentionally narrow: +# - Only the container literally named "application" (the user code; see +# k8s/deployment/templates/deployment.yaml.tpl). Sidecars (e.g. "http" +# nginx) and init containers are out of scope here — the scope/ checks +# already report their findings without us duplicating logs across +# containers and probe rounds. +# - Logs are echoed to stdout only; the evidence payload carries only +# counters and the list of pods (in evidence.affected). Re-emitting the +# log text inside evidence.details would duplicate what already lives in +# the check's logs[] tail. +# +# Reads from the build_context snapshot (data/pod_logs/), never calls kubectl. +# Severity is always "info"; this check publishes context, it does not detect +# failure modes. + +APPLICATION_CONTAINER_NAME="application" + +if [[ ! 
-f "$PROBLEMATIC_PODS_FILE" ]]; then + print_warning "No problematic pods snapshot available, log evidence skipped" + SKIP_EVIDENCE=$(evidence_json \ + "Snapshot unavailable, log collection skipped" \ + "info" \ + "[]" \ + "$(jq -nc '{pods_with_logs: 0, problematic_pod_count: 0}')" \ + "[]") + update_check_result --status "skipped" --evidence "$SKIP_EVIDENCE" + return 0 +fi + +PROBLEMATIC_PODS=$(grep -v '^[[:space:]]*$' "$PROBLEMATIC_PODS_FILE" 2>/dev/null) + +if [[ -z "$PROBLEMATIC_PODS" ]]; then + print_success "No problematic pods detected — no application logs to collect" + SUMMARY="No problematic pods detected, no application logs to collect" + DETAILS=$(jq -nc '{pods_with_logs: 0, problematic_pod_count: 0}') + EVIDENCE=$(evidence_json "$SUMMARY" "info" "[]" "$DETAILS" "[]") + update_check_result --status "success" --evidence "$EVIDENCE" --log-tail-lines 200 + return 0 +fi + +AFFECTED_PODS="" +PODS_WITH_LOGS=0 +TOTAL_PROBLEMATIC=$(echo "$PROBLEMATIC_PODS" | wc -w | tr -d ' ') + +for POD_NAME in $PROBLEMATIC_PODS; do + HAS_APP_CONTAINER=$(jq -r --arg name "$POD_NAME" --arg cn "$APPLICATION_CONTAINER_NAME" ' + .items[] + | select(.metadata.name == $name) + | (.spec.containers // []) + | map(.name) + | index($cn) != null + ' "$PODS_FILE" 2>/dev/null) + + if [[ "$HAS_APP_CONTAINER" != "true" ]]; then + print_warning "Pod $POD_NAME has no '$APPLICATION_CONTAINER_NAME' container — skipped" + continue + fi + + CURRENT_LOGS=$(read_log_tail "$POD_NAME" "$APPLICATION_CONTAINER_NAME" "current") + PREVIOUS_LOGS=$(read_log_tail "$POD_NAME" "$APPLICATION_CONTAINER_NAME" "previous") + + # Merge previous + current in chronological order, then keep only the last + # N lines (defaults to 50). One flat array; the AI does not need to know + # which container instance produced which line. 
+ TAIL_LINES="${EVIDENCE_LOG_TAIL_LINES:-50}" + MERGED_LOGS=$(jq -nc \ + --argjson prev "$PREVIOUS_LOGS" \ + --argjson curr "$CURRENT_LOGS" \ + --argjson n "$TAIL_LINES" \ + '($prev + $curr) | .[-$n:]') + + if [[ "$(echo "$MERGED_LOGS" | jq 'length')" -eq 0 ]]; then + print_warning "Pod $POD_NAME application container produced no logs" + continue + fi + + mark_affected AFFECTED_PODS "$POD_NAME" + PODS_WITH_LOGS=$((PODS_WITH_LOGS + 1)) + + # Echo the log tail to stdout so it surfaces in the UI's check.logs view. + # update_check_result is called below with --log-tail-lines 200 so the cap + # accommodates the application log payload (default cap is 20). This is + # the only place the log text lives — evidence.details stores only + # counters, so there is no duplication between evidence and logs[]. + print_info "─── application log tail from $POD_NAME ───" + echo "$MERGED_LOGS" | jq -r '.[] | " | \(.)"' +done + +AFFECTED_PODS_JSON=$(set_to_json_array AFFECTED_PODS) + +if [[ $PODS_WITH_LOGS -eq 0 ]]; then + SUMMARY="No application logs available across $TOTAL_PROBLEMATIC problematic pod(s) — image may never have started" + DETAILS=$(jq -nc \ + --argjson problematic "$TOTAL_PROBLEMATIC" \ + '{pods_with_logs: 0, problematic_pod_count: $problematic}') + EVIDENCE=$(evidence_json "$SUMMARY" "info" "[]" "$DETAILS" "[]") + update_check_result --status "success" --evidence "$EVIDENCE" --log-tail-lines 200 +else + SUMMARY="Collected application logs from $PODS_WITH_LOGS of $TOTAL_PROBLEMATIC problematic pod(s)" + DETAILS=$(jq -nc \ + --argjson with_logs "$PODS_WITH_LOGS" \ + --argjson problematic "$TOTAL_PROBLEMATIC" \ + '{pods_with_logs: $with_logs, problematic_pod_count: $problematic}') + EVIDENCE=$(evidence_json "$SUMMARY" "info" "$AFFECTED_PODS_JSON" "$DETAILS" "[]") + update_check_result --status "success" --evidence "$EVIDENCE" --log-tail-lines 200 + print_success "$SUMMARY" +fi diff --git a/k8s/diagnose/logs/workflow.yml b/k8s/diagnose/logs/workflow.yml new file mode 
100644 index 00000000..3ea26aee --- /dev/null +++ b/k8s/diagnose/logs/workflow.yml @@ -0,0 +1,6 @@ +steps: + - name: Application Log Evidence + description: Collects pod logs from the diagnose snapshot for AI post-mortem analysis + category: Application Logs + type: script + file: "$SERVICE_PATH/diagnose/logs/application_log_evidence" diff --git a/k8s/diagnose/networking/alb_capacity_check b/k8s/diagnose/networking/alb_capacity_check index 445971f4..2431b28f 100644 --- a/k8s/diagnose/networking/alb_capacity_check +++ b/k8s/diagnose/networking/alb_capacity_check @@ -2,35 +2,39 @@ # Check: ALB Capacity Check # Checks for common ALB issues (IP exhaustion, certificate problems) -# Validate ingresses exist require_ingresses || return 0 -# Read ingresses from pre-collected data INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +INGRESS_FACTS=() +AFFECTED_INGRESSES="" +HAS_IP_EXHAUSTION=0 +IP_EXHAUSTION_LOGS="[]" + -# Get ALB controller pods from pre-collected data ALB_CONTROLLER_PODS=$(jq -r '.items[].metadata.name' "$ALB_CONTROLLER_PODS_FILE" 2>/dev/null | tr '\n' ' ') if [[ -n "$ALB_CONTROLLER_PODS" ]]; then for POD in $ALB_CONTROLLER_PODS; do - # Look for IP exhaustion errors in pre-collected controller logs LOG_FILE="$ALB_CONTROLLER_LOGS_DIR/${POD}.log" if [[ -f "$LOG_FILE" ]] && [[ -r "$LOG_FILE" ]]; then - # Use tail and awk to handle massive log lines efficiently IP_ERRORS=$(tail -n 500 "$LOG_FILE" 2>/dev/null | \ awk 'length <= 10000' 2>/dev/null | \ grep -iE "no available ip|insufficient ip|ip address.*(exhausted|unavailable)" 2>/dev/null || true) if [[ -n "$IP_ERRORS" ]]; then - HAS_ISSUES=1 + HAS_IP_EXHAUSTION=1 print_error " ALB subnet IP exhaustion detected, Recent logs:" if ! 
echo "$IP_ERRORS" | tail -n 3 2>/dev/null | cut -c1-200 2>/dev/null | sed 's/^/ /' 2>/dev/null; then print_warning " [Log details could not be displayed]" fi print_action "Check subnet CIDR ranges and consider expanding or using different subnets" print_info " Annotation: alb.ingress.kubernetes.io/subnets=" + + TRUNC=$(echo "$IP_ERRORS" | tail -n 3 | cut -c1-200 | jq -R . | jq -s .) + IP_EXHAUSTION_LOGS=$(echo "$IP_EXHAUSTION_LOGS" | jq --arg pod "$POD" --argjson lines "$TRUNC" \ + '. + [{controller_pod: $pod, lines: $lines}]') break fi elif [[ -e "$LOG_FILE" ]] && [[ ! -r "$LOG_FILE" ]]; then @@ -38,22 +42,32 @@ if [[ -n "$ALB_CONTROLLER_PODS" ]]; then fi done - if [[ -z "$IP_ERRORS" ]]; then + if [[ $HAS_IP_EXHAUSTION -eq 0 ]]; then print_success " No IP exhaustion issues detected" fi fi -# Consolidated loop: check all ingress-related issues in one pass +[[ $HAS_IP_EXHAUSTION -eq 1 ]] && { + ISSUE=$(jq -nc --argjson logs "$IP_EXHAUSTION_LOGS" \ + '{issue: "subnet_ip_exhaustion", evidence_logs: $logs}') + add_fact ISSUE_FACTS "$ISSUE" + # mark all ingresses as affected (cluster-wide issue) + for INGRESS_NAME in $INGRESSES; do mark_affected AFFECTED_INGRESSES "$INGRESS_NAME"; done +} + for INGRESS_NAME in $INGRESSES; do - # Get ingress info from pre-collected data (single read per ingress) INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null) - print_info "Checking ingress: $INGRESS_NAME" - # ===== TLS/Certificate Configuration Checks ===== + INGRESS_HAS_ISSUE=0 CERT_ARN=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/certificate-arn"] // empty') TLS_HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.tls[]?.hosts[]?' 
2>/dev/null) INGRESS_HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.rules[]?.host' 2>/dev/null) + SCHEME=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/scheme"] // empty') + SUBNETS=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/subnets"] // empty') + + CERT_ERROR_LINES="[]" + HOST_TLS_MISMATCHES="[]" if [[ -n "$TLS_HOSTS" || -n "$CERT_ARN" ]]; then print_info " SSL/TLS configured" @@ -61,42 +75,51 @@ for INGRESS_NAME in $INGRESSES; do if [[ -n "$CERT_ARN" ]]; then print_info " Certificate ARN: $CERT_ARN" - # Check controller logs for certificate errors if [[ -n "$ALB_CONTROLLER_PODS" ]]; then for POD in $ALB_CONTROLLER_PODS; do LOG_FILE="$ALB_CONTROLLER_LOGS_DIR/${POD}.log" if [[ -f "$LOG_FILE" ]] && [[ -r "$LOG_FILE" ]]; then - # Use tail and awk to handle massive log lines efficiently CERT_ERRORS=$(tail -n 500 "$LOG_FILE" 2>/dev/null | \ awk 'length <= 10000' 2>/dev/null | \ grep -iF "$INGRESS_NAME" 2>/dev/null | \ grep -iE "certificate.*(not found|invalid|failed|error)" 2>/dev/null || true) if [[ -n "$CERT_ERRORS" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Certificate validation errors found:" if ! echo "$CERT_ERRORS" | tail -n 2 2>/dev/null | cut -c1-200 2>/dev/null | sed 's/^/ /' 2>/dev/null; then print_warning " [Certificate error details could not be displayed]" fi print_action "Verify certificate ARN exists in ACM and covers the required domains" + + TRUNC=$(echo "$CERT_ERRORS" | tail -n 2 | cut -c1-200 | jq -R . | jq -s .) + CERT_ERROR_LINES=$(echo "$CERT_ERROR_LINES" | jq --arg pod "$POD" --argjson lines "$TRUNC" \ + '. 
+ [{controller_pod: $pod, lines: $lines}]') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --argjson lines "$TRUNC" \ + '{ingress: $ing, issue: "certificate_validation_errors", log_lines: $lines}') + add_fact ISSUE_FACTS "$ISSUE" fi fi done fi fi - # Verify hosts match between rules and TLS if [[ -n "$TLS_HOSTS" && -n "$INGRESS_HOSTS" ]]; then for HOST in $INGRESS_HOSTS; do if ! echo "$TLS_HOSTS" | grep -qw "$HOST"; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Host '$HOST' in rules but not in TLS configuration" print_action "Add host to spec.tls or ensure certificate covers this domain" + HOST_TLS_MISMATCHES=$(echo "$HOST_TLS_MISMATCHES" | jq --arg h "$HOST" '. + [$h]') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg host "$HOST" \ + '{ingress: $ing, host: $host, issue: "host_in_rules_not_in_tls"}') + add_fact ISSUE_FACTS "$ISSUE" fi done fi - # Check for missing certificate when TLS is configured if [[ -n "$TLS_HOSTS" && -z "$CERT_ARN" ]]; then print_warning " TLS hosts configured but no ACM certificate ARN annotation" print_info " Add annotation: alb.ingress.kubernetes.io/certificate-arn=" @@ -105,8 +128,6 @@ for INGRESS_NAME in $INGRESSES; do print_info " No SSL/TLS configured (HTTP only)" fi - # ===== Events Checks (subnet, security group, target group) ===== - # Get events sorted by timestamp, most recent first EVENTS_JSON=$(jq --arg name "$INGRESS_NAME" --arg kind "Ingress" ' .items | map(select(.involvedObject.name == $name and .involvedObject.kind == $kind)) @@ -114,65 +135,114 @@ for INGRESS_NAME in $INGRESSES; do | reverse ' "$EVENTS_FILE" 2>/dev/null) + SUBNET_ERROR_LINES="[]" + SG_ERROR_LINES="[]" + TG_ERROR_LINES="[]" + EVENT_COUNT=$(echo "$EVENTS_JSON" | jq 'length' 2>/dev/null) if [[ "$EVENT_COUNT" -gt 0 ]]; then - # Get all error/warning events - ERROR_EVENTS=$(echo "$EVENTS_JSON" | jq -r ' - .[] - | select(.type == "Warning" or .type == "Error") - ' 2>/dev/null) - - if [[ -n "$ERROR_EVENTS" 
]]; then - # Check for subnet errors - SUBNET_ERRORS=$(echo "$ERROR_EVENTS" | jq -r 'select(.message | test("subnet|availability zone"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) - if [[ -n "$SUBNET_ERRORS" ]]; then - HAS_ISSUES=1 - print_error " Subnet configuration issues" - if ! echo "$SUBNET_ERRORS" | head -n 2 2>/dev/null | sed 's/^/ /' 2>/dev/null; then - print_warning " [Event details could not be displayed]" - fi - fi + ERROR_EVENTS=$(echo "$EVENTS_JSON" | jq -c '[.[] | select(.type == "Warning" or .type == "Error")]' 2>/dev/null) + + SUBNET_ERRORS_HUMAN=$(echo "$ERROR_EVENTS" | jq -r '.[] | select(.message | test("subnet|availability zone"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) + if [[ -n "$SUBNET_ERRORS_HUMAN" ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + print_error " Subnet configuration issues" + echo "$SUBNET_ERRORS_HUMAN" | head -n 2 | sed 's/^/ /' + SUBNET_ERROR_LINES=$(echo "$ERROR_EVENTS" | jq -c '[.[] | select(.message | test("subnet|availability zone"; "i")) | {timestamp: .lastTimestamp, type: .type, reason: .reason, message: .message}] | .[:2]') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --argjson events "$SUBNET_ERROR_LINES" \ + '{ingress: $ing, issue: "subnet_misconfiguration", events: $events}') + add_fact ISSUE_FACTS "$ISSUE" + fi - # Check for security group errors - SG_ERRORS=$(echo "$ERROR_EVENTS" | jq -r 'select(.message | test("security.?group"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) - if [[ -n "$SG_ERRORS" ]]; then - HAS_ISSUES=1 - print_error " Security group issues" - if ! 
echo "$SG_ERRORS" | head -n 2 2>/dev/null | sed 's/^/ /' 2>/dev/null; then - print_warning " [Event details could not be displayed]" - fi - fi + SG_ERRORS_HUMAN=$(echo "$ERROR_EVENTS" | jq -r '.[] | select(.message | test("security.?group"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) + if [[ -n "$SG_ERRORS_HUMAN" ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + print_error " Security group issues" + echo "$SG_ERRORS_HUMAN" | head -n 2 | sed 's/^/ /' + SG_ERROR_LINES=$(echo "$ERROR_EVENTS" | jq -c '[.[] | select(.message | test("security.?group"; "i")) | {timestamp: .lastTimestamp, type: .type, reason: .reason, message: .message}] | .[:2]') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --argjson events "$SG_ERROR_LINES" \ + '{ingress: $ing, issue: "security_group", events: $events}') + add_fact ISSUE_FACTS "$ISSUE" + fi - # Check for target group errors - TG_ERRORS=$(echo "$ERROR_EVENTS" | jq -r 'select(.message | test("target.?group"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) - if [[ -n "$TG_ERRORS" ]]; then - HAS_ISSUES=1 - print_error " Target group registration issues" - if ! 
echo "$TG_ERRORS" | head -n 2 2>/dev/null | sed 's/^/ /' 2>/dev/null; then - print_warning " [Event details could not be displayed]" - fi - fi + TG_ERRORS_HUMAN=$(echo "$ERROR_EVENTS" | jq -r '.[] | select(.message | test("target.?group"; "i")) | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)"' 2>/dev/null || true) + if [[ -n "$TG_ERRORS_HUMAN" ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + print_error " Target group registration issues" + echo "$TG_ERRORS_HUMAN" | head -n 2 | sed 's/^/ /' + TG_ERROR_LINES=$(echo "$ERROR_EVENTS" | jq -c '[.[] | select(.message | test("target.?group"; "i")) | {timestamp: .lastTimestamp, type: .type, reason: .reason, message: .message}] | .[:2]') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --argjson events "$TG_ERROR_LINES" \ + '{ingress: $ing, issue: "target_group", events: $events}') + add_fact ISSUE_FACTS "$ISSUE" fi fi - # ===== Annotation Checks (scheme, subnets) ===== - SCHEME=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/scheme"] // empty') if [[ -z "$SCHEME" ]]; then print_warning " No scheme annotation (defaulting to internal)" print_info " Add annotation: alb.ingress.kubernetes.io/scheme=internet-facing (or internal)" fi - - SUBNETS=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/subnets"] // empty') if [[ -z "$SUBNETS" ]]; then print_info " Using auto-discovered subnets" print_info " Consider explicit subnets: alb.ingress.kubernetes.io/subnets=" fi + + INGRESS_FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg cert "$CERT_ARN" --arg scheme "$SCHEME" --arg subnets "$SUBNETS" \ + --argjson tls_configured "$([[ -n "$TLS_HOSTS" || -n "$CERT_ARN" ]] && echo true || echo false)" \ + --argjson host_mismatches "$HOST_TLS_MISMATCHES" \ + --argjson cert_errors "$CERT_ERROR_LINES" \ + --argjson subnet_errors "$SUBNET_ERROR_LINES" \ + --argjson sg_errors "$SG_ERROR_LINES" \ + --argjson tg_errors "$TG_ERROR_LINES" \ + '{ + ingress: 
$ing, + tls_configured: $tls_configured, + certificate_arn: (if $cert == "" then null else $cert end), + scheme: (if $scheme == "" then null else $scheme end), + subnets_annotation: (if $subnets == "" then null else $subnets end), + host_tls_mismatches: $host_mismatches, + certificate_errors: $cert_errors, + subnet_errors: $subnet_errors, + security_group_errors: $sg_errors, + target_group_errors: $tg_errors + }') + add_fact INGRESS_FACTS "$INGRESS_FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "No critical ALB capacity or configuration issues detected" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "No critical ALB capacity or configuration issues detected" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array INGRESS_FACTS)" --argjson count "$INGRESS_COUNT" '{ingress_count: $count, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) affected by ALB issues" + [[ $HAS_IP_EXHAUSTION -eq 1 ]] && SUMMARY="$SUMMARY — subnet IP exhaustion detected" + + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array INGRESS_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$INGRESS_COUNT" \ + --argjson ip_logs "$IP_EXHAUSTION_LOGS" \ + --argjson ip_exhaustion "$HAS_IP_EXHAUSTION" \ + '{ + ingress_count: $count, + issue_count: ($issues | length), + ip_exhaustion_detected: ($ip_exhaustion == 1), + ip_exhaustion_logs: $ip_logs, + ingresses: $facts, + issues: $issues + }') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Check subnet capacity and certificate/security group 
configuration"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/networking/ingress_backend_service b/k8s/diagnose/networking/ingress_backend_service index e20a570a..10ad9957 100644 --- a/k8s/diagnose/networking/ingress_backend_service +++ b/k8s/diagnose/networking/ingress_backend_service @@ -2,99 +2,108 @@ # Check: Ingress Backend Service # Checks if ingress backend services exist and are reachable -# Validate ingresses exist require_ingresses || return 0 -# Get ingresses INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +INGRESS_FACTS=() +AFFECTED_INGRESSES="" + for INGRESS_NAME in $INGRESSES; do INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null) - print_info "Checking backends for ingress: $INGRESS_NAME" - # Get default backend if exists + INGRESS_BACKEND_FACTS=() + INGRESS_HAS_ISSUE=0 + DEFAULT_BACKEND=$(echo "$INGRESS_INFO" | jq -r '.spec.defaultBackend.service.name // empty') if [[ -n "$DEFAULT_BACKEND" ]]; then DEFAULT_PORT=$(echo "$INGRESS_INFO" | jq -r '.spec.defaultBackend.service.port.number // .spec.defaultBackend.service.port.name // empty') - - # Check if service exists in pre-collected data SERVICE_INFO=$(jq --arg name "$DEFAULT_BACKEND" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null) if [[ -n "$SERVICE_INFO" && "$SERVICE_INFO" != "null" ]]; then - # Check if service has endpoints from pre-collected data ENDPOINT_INFO=$(jq --arg name "$DEFAULT_BACKEND" '.items[] | select(.metadata.name == $name)' "$ENDPOINTS_FILE" 2>/dev/null) ENDPOINTS=$(echo "$ENDPOINT_INFO" | jq -r '.subsets[].addresses[].ip' 2>/dev/null | tr '\n' ' ') if [[ -n "$ENDPOINTS" ]]; then print_success " Default backend: $DEFAULT_BACKEND:$DEFAULT_PORT (has endpoints)" + BFACT=$(jq -nc --arg svc "$DEFAULT_BACKEND" --arg port "$DEFAULT_PORT" \ + '{kind: "default", service: $svc, 
port: $port, status: "ok"}') else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Default backend: $DEFAULT_BACKEND:$DEFAULT_PORT (no endpoints)" + BFACT=$(jq -nc --arg svc "$DEFAULT_BACKEND" --arg port "$DEFAULT_PORT" \ + '{kind: "default", service: $svc, port: $port, status: "no_endpoints"}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg svc "$DEFAULT_BACKEND" \ + '{ingress: $ing, backend: $svc, issue: "default_backend_no_endpoints"}') + add_fact ISSUE_FACTS "$ISSUE" fi else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Default backend: Service '$DEFAULT_BACKEND' not found" + BFACT=$(jq -nc --arg svc "$DEFAULT_BACKEND" \ + '{kind: "default", service: $svc, status: "service_not_found"}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg svc "$DEFAULT_BACKEND" \ + '{ingress: $ing, backend: $svc, issue: "default_backend_not_found"}') + add_fact ISSUE_FACTS "$ISSUE" fi + add_fact INGRESS_BACKEND_FACTS "$BFACT" fi - # Get all rule backends BACKENDS=$(echo "$INGRESS_INFO" | jq -r '.spec.rules[].http.paths[] | "\(.backend.service.name):\(.backend.service.port.number // .backend.service.port.name)"' 2>/dev/null) - if [[ -z "$BACKENDS" ]]; then + if [[ -z "$BACKENDS" && -z "$DEFAULT_BACKEND" ]]; then print_warning " No path rules defined" + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --argjson backends "$(facts_to_json_array INGRESS_BACKEND_FACTS)" \ + '{ingress: $ing, backends: $backends}') # INGRESS_BACKEND_FACTS is a bash array: serialize it (a bare "$INGRESS_BACKEND_FACTS" is only element 0, empty in this branch, which jq rejects) + add_fact INGRESS_FACTS "$FACT" continue fi - # Check each unique backend - # Use process substitution to avoid subshell and preserve HAS_ISSUES updates while IFS=':' read -r SERVICE_NAME SERVICE_PORT; do - # Check if service exists in pre-collected data + [[ -z "$SERVICE_NAME" ]] && continue + SERVICE_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null) if [[ -n "$SERVICE_INFO" && "$SERVICE_INFO" != "null" ]]; then - # Check if service has 
endpoints from pre-collected data ENDPOINT_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$ENDPOINTS_FILE" 2>/dev/null) - READY_ENDPOINTS=$(echo "$ENDPOINT_INFO" | jq -r '.subsets[]?.addresses[]? | "\(.targetRef.name // "unknown"):\(.ip)"' 2>/dev/null) - NOT_READY_ENDPOINTS=$(echo "$ENDPOINT_INFO" | jq -r '.subsets[]?.notReadyAddresses[]? | "\(.targetRef.name // "unknown"):\(.ip)"' 2>/dev/null) + READY_COUNT=$(echo "$ENDPOINT_INFO" | jq -r '[.subsets[]?.addresses[]?] | length' 2>/dev/null) + NOT_READY_COUNT=$(echo "$ENDPOINT_INFO" | jq -r '[.subsets[]?.notReadyAddresses[]?] | length' 2>/dev/null) + READY_COUNT=${READY_COUNT:-0} + NOT_READY_COUNT=${NOT_READY_COUNT:-0} - # Get port info PORT_NUMBER=$(echo "$ENDPOINT_INFO" | jq -r '.subsets[0]?.ports[0]?.port // empty' 2>/dev/null) - READY_COUNT=$(echo "$READY_ENDPOINTS" | grep -c '^' 2>/dev/null || echo 0) - NOT_READY_COUNT=$(echo "$NOT_READY_ENDPOINTS" | grep -c '^' 2>/dev/null || echo 0) - if [[ $READY_COUNT -gt 0 ]]; then print_success " Backend: $SERVICE_NAME:$SERVICE_PORT ($READY_COUNT ready endpoint(s))" - echo "$READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - [[ -n "$IP" ]] && print_success " - $POD_NAME -> $IP:$PORT_NUMBER" + echo "$ENDPOINT_INFO" | jq -r --arg p "$PORT_NUMBER" '.subsets[]?.addresses[]? | " - \(.targetRef.name // "unknown") -> \(.ip)" + (if $p == "" then "" else ":" + $p end)' | while IFS= read -r line; do + print_success "$line" done - if [[ $NOT_READY_COUNT -gt 0 ]]; then print_warning " Also has $NOT_READY_COUNT not ready endpoint(s)" - echo "$NOT_READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - [[ -n "$IP" ]] && print_warning " - $POD_NAME -> $IP:$PORT_NUMBER" + echo "$ENDPOINT_INFO" | jq -r --arg p "$PORT_NUMBER" '.subsets[]?.notReadyAddresses[]? 
| " - \(.targetRef.name // "unknown") -> \(.ip)" + (if $p == "" then "" else ":" + $p end)' | while IFS= read -r line; do + print_warning "$line" done fi + BFACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg port "$SERVICE_PORT" --argjson ready "$READY_COUNT" --argjson nr "$NOT_READY_COUNT" \ + '{kind: "rule", service: $svc, port: $port, ready_count: $ready, not_ready_count: $nr, status: "ok"}') else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Backend: $SERVICE_NAME:$SERVICE_PORT (no ready endpoints)" - # Get service selector to help debug SERVICE_SELECTOR=$(echo "$SERVICE_INFO" | jq -c '.spec.selector // {}' 2>/dev/null) - print_info " Service selector: $SERVICE_SELECTOR" if [[ $NOT_READY_COUNT -gt 0 ]]; then - print_warning " Found $NOT_READY_COUNT not ready endpoint(s):" - echo "$NOT_READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - [[ -n "$IP" ]] && print_warning " - $POD_NAME -> $IP:$PORT_NUMBER (not ready)" - done + print_warning " Found $NOT_READY_COUNT not ready endpoint(s)" print_action "Check pod readiness - pods exist but are not ready to serve traffic" + SUB_ISSUE="endpoints_not_ready" else - # Check if there are any pods matching the selector if [[ "$SERVICE_SELECTOR" != "{}" && "$SERVICE_SELECTOR" != "null" ]]; then MATCHING_PODS=$(jq -r --argjson selectors "$SERVICE_SELECTOR" ' .items[] | @@ -108,38 +117,77 @@ for INGRESS_NAME in $INGRESSES; do ' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') if [[ -n "$MATCHING_PODS" ]]; then - print_warning " Found pods matching selector but no endpoints: $MATCHING_PODS" - print_action "Pods exist but endpoints not created - check pod readiness probes and status" + print_warning " Pods match selector but no endpoints — check readiness probes" + SUB_ISSUE="pods_match_no_endpoints" else print_warning " No pods found matching service selector" - print_action "Create pods with labels matching the service selector: $SERVICE_SELECTOR" + SUB_ISSUE="no_matching_pods" 
fi else print_warning " Service has no selector defined" - print_action "Add selector to service or check if this is a headless/ExternalName service" + SUB_ISSUE="no_selector" fi fi + + BFACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg port "$SERVICE_PORT" --argjson nr "$NOT_READY_COUNT" \ + --argjson selector "$SERVICE_SELECTOR" --arg issue "$SUB_ISSUE" \ + '{kind: "rule", service: $svc, port: $port, ready_count: 0, not_ready_count: $nr, selector: $selector, status: $issue}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg svc "$SERVICE_NAME" --arg issue "$SUB_ISSUE" \ + '{ingress: $ing, backend: $svc, issue: $issue}') + add_fact ISSUE_FACTS "$ISSUE" fi - # Verify port exists in service from pre-collected data SERVICE_PORTS=$(echo "$SERVICE_INFO" | jq -r '.spec.ports[].port' 2>/dev/null | tr '\n' ' ') - if ! echo "$SERVICE_PORTS" | grep -qw "$SERVICE_PORT"; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Backend: Port $SERVICE_PORT not found in service $SERVICE_NAME" print_warning " Available ports: $SERVICE_PORTS" + BFACT=$(echo "$BFACT" | jq -c --arg ports "$SERVICE_PORTS" \ + '. 
+ {port_status: "port_not_in_service", available_ports: ($ports | split(" ") | map(select(length > 0)))}') # -c keeps this fact compact like the facts built with jq -nc + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg svc "$SERVICE_NAME" --arg port "$SERVICE_PORT" \ + '{ingress: $ing, backend: $svc, port: $port, issue: "port_not_in_service"}') + add_fact ISSUE_FACTS "$ISSUE" fi else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Backend: Service '$SERVICE_NAME' not found in namespace" + BFACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg port "$SERVICE_PORT" \ + '{kind: "rule", service: $svc, port: $port, status: "service_not_found"}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg svc "$SERVICE_NAME" \ + '{ingress: $ing, backend: $svc, issue: "service_not_found"}') + add_fact ISSUE_FACTS "$ISSUE" fi + + add_fact INGRESS_BACKEND_FACTS "$BFACT" done < <(echo "$BACKENDS" | sort -u) + + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --argjson backends "$(facts_to_json_array INGRESS_BACKEND_FACTS)" \ + '{ingress: $ing, backends: $backends}') # serialize the whole backend-facts array, not just its first element + add_fact INGRESS_FACTS "$FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "All backend services healthy for $INGRESS_COUNT ingress(es)" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All backend services healthy for $INGRESS_COUNT ingress(es)" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array INGRESS_FACTS)" --argjson count "$INGRESS_COUNT" '{ingress_count: $count, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + 
SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) have backend issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array 
INGRESS_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$INGRESS_COUNT" \ + '{ingress_count: $count, issue_count: ($issues | length), ingresses: $facts, issues: $issues}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Verify backend services exist and have ready endpoints, and that ports match"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/networking/ingress_class_validation b/k8s/diagnose/networking/ingress_class_validation index d796fd91..36ab7274 100644 --- a/k8s/diagnose/networking/ingress_class_validation +++ b/k8s/diagnose/networking/ingress_class_validation @@ -2,63 +2,87 @@ # Check: Ingress Class Validation # Validates ingress class is correctly configured -# Validate ingresses exist require_ingresses || return 0 -# Read ingresses from pre-collected data INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +CLASS_FACTS=() +AFFECTED_INGRESSES="" + -# Get available ingress classes from pre-collected data AVAILABLE_CLASSES=$(jq -r '.items[].metadata.name' "$INGRESSCLASSES_FILE" 2>/dev/null | tr '\n' ' ') +AVAILABLE_CLASSES_JSON=$(jq -c '[.items[].metadata.name]' "$INGRESSCLASSES_FILE" 2>/dev/null || echo '[]') # fall back to [] when the snapshot is missing or unreadable so later --argjson calls always receive valid JSON DEFAULT_CLASS=$(jq -r '.items[] | select(.metadata.annotations."ingressclass.kubernetes.io/is-default-class" == "true") | .metadata.name' "$INGRESSCLASSES_FILE" 2>/dev/null) for INGRESS_NAME in $INGRESSES; do - # Get ingress info from pre-collected data INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null) - # Check spec.ingressClassName (new way) INGRESS_CLASS=$(echo "$INGRESS_INFO" | jq -r '.spec.ingressClassName // empty') + USED_DEPRECATED=false - # Check annotation (old way) if [[ -z "$INGRESS_CLASS" ]]; then INGRESS_CLASS=$(echo "$INGRESS_INFO" | jq -r 
'.metadata.annotations["kubernetes.io/ingress.class"] // empty') - if [[ -n "$INGRESS_CLASS" ]]; then print_info "Ingress $INGRESS_NAME: Using deprecated annotation (kubernetes.io/ingress.class)" + USED_DEPRECATED=true fi fi if [[ -z "$INGRESS_CLASS" ]]; then if [[ -n "$DEFAULT_CLASS" ]]; then print_success "Ingress $INGRESS_NAME: Using default IngressClass ($DEFAULT_CLASS)" + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg cls "$DEFAULT_CLASS" \ + '{ingress: $ing, ingress_class: $cls, source: "default"}') else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" print_error "Ingress $INGRESS_NAME: No IngressClass specified and no default found" print_action "Specify ingressClassName or set a default IngressClass" + FACT=$(jq -nc --arg ing "$INGRESS_NAME" \ + '{ingress: $ing, issue: "no_ingress_class_no_default"}') fi else - # Verify the class exists if echo "$AVAILABLE_CLASSES" | grep -qw "$INGRESS_CLASS"; then print_success "Ingress $INGRESS_NAME: IngressClass '$INGRESS_CLASS' is valid" + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg cls "$INGRESS_CLASS" --argjson dep "$USED_DEPRECATED" \ + '{ingress: $ing, ingress_class: $cls, used_deprecated_annotation: $dep, source: "explicit"}') else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" print_error "Ingress $INGRESS_NAME: IngressClass '$INGRESS_CLASS' not found" - if [[ -n "$AVAILABLE_CLASSES" ]]; then print_warning " Available classes: $AVAILABLE_CLASSES" else print_warning " No IngressClasses found in cluster" fi + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg cls "$INGRESS_CLASS" \ + --argjson available "$AVAILABLE_CLASSES_JSON" --argjson dep "$USED_DEPRECATED" \ + '{ingress: $ing, ingress_class: $cls, issue: "ingress_class_not_found", available_classes: $available, used_deprecated_annotation: $dep}') fi fi + add_fact CLASS_FACTS "$FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') 
+ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "All $INGRESS_COUNT ingress(es) have valid IngressClass configuration" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All $INGRESS_COUNT ingress(es) have valid IngressClass configuration" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array CLASS_FACTS)" --argjson count "$INGRESS_COUNT" --arg default "$DEFAULT_CLASS" \ + '{ingress_count: $count, default_class: $default, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) have IngressClass issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array CLASS_FACTS)" \ + --argjson available "$AVAILABLE_CLASSES_JSON" \ + --arg default "$DEFAULT_CLASS" \ + --argjson count "$INGRESS_COUNT" \ + '{ingress_count: $count, available_classes: $available, default_class: $default, ingresses: $facts}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Specify ingressClassName or set a default IngressClass"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/networking/ingress_controller_sync b/k8s/diagnose/networking/ingress_controller_sync index 1f0a40d6..ab29065d 100644 --- a/k8s/diagnose/networking/ingress_controller_sync +++ b/k8s/diagnose/networking/ingress_controller_sync @@ -2,15 +2,15 @@ # Check: Ingress Controller Sync # Verifies ALB ingress controller has synchronized successfully -# Validate ingresses exist require_ingresses || return 0 -# Read ingresses from pre-collected data INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +INGRESS_FACTS=() +AFFECTED_INGRESSES="" + -# Get ALB 
controller pods from pre-collected data ALB_CONTROLLER_PODS=$(jq -r '.items[].metadata.name' "$ALB_CONTROLLER_PODS_FILE" 2>/dev/null | tr '\n' ' ') if [[ -z "$ALB_CONTROLLER_PODS" ]]; then @@ -23,7 +23,6 @@ fi for INGRESS_NAME in $INGRESSES; do print_info "Checking sync status for ingress: $INGRESS_NAME" - # Get ingress events from pre-collected data - sorted by timestamp, most recent first INGRESS_EVENTS_JSON=$(jq --arg name "$INGRESS_NAME" --arg kind "Ingress" ' .items | map(select(.involvedObject.name == $name and .involvedObject.kind == $kind)) @@ -32,47 +31,62 @@ for INGRESS_NAME in $INGRESSES; do ' "$EVENTS_FILE" 2>/dev/null) EVENT_COUNT=$(echo "$INGRESS_EVENTS_JSON" | jq 'length' 2>/dev/null) + EVENT_SUMMARY="null" + INGRESS_HAS_ISSUE=0 + DETECTED_PROBLEMS="[]" + LATEST_ERROR_EVENTS="[]" if [[ "$EVENT_COUNT" -gt 0 ]]; then - # Get the most recent event NEWEST_EVENT=$(echo "$INGRESS_EVENTS_JSON" | jq -r 'first') EVENT_TYPE=$(echo "$NEWEST_EVENT" | jq -r '.type') EVENT_REASON=$(echo "$NEWEST_EVENT" | jq -r '.reason') EVENT_MESSAGE=$(echo "$NEWEST_EVENT" | jq -r '.message') EVENT_TIMESTAMP=$(echo "$NEWEST_EVENT" | jq -r '.lastTimestamp') - # Check for successful reconciliation first + EVENT_SUMMARY=$(jq -nc --arg t "$EVENT_TYPE" --arg r "$EVENT_REASON" --arg m "$EVENT_MESSAGE" --arg ts "$EVENT_TIMESTAMP" \ + '{type: $t, reason: $r, message: $m, last_timestamp: $ts}') + if [[ "$EVENT_REASON" == "SuccessfullyReconciled" ]]; then print_success " ✓ Successfully reconciled at $EVENT_TIMESTAMP" elif [[ "$EVENT_TYPE" == "Normal" ]] && echo "$EVENT_REASON" | grep -qiE "ensured|synced"; then print_success " ✓ Last event: $EVENT_REASON at $EVENT_TIMESTAMP" else - # Look for error/warning events in recent history ERROR_EVENTS=$(echo "$INGRESS_EVENTS_JSON" | jq -r ' .[] | select(.type == "Warning" or .type == "Error") | "\(.lastTimestamp) [\(.type)] \(.reason): \(.message)" ' | head -n 5) + LATEST_ERROR_EVENTS=$(echo "$INGRESS_EVENTS_JSON" | jq -c ' + [.[] + | select(.type 
== "Warning" or .type == "Error") + | {timestamp: .lastTimestamp, type: .type, reason: .reason, message: .message}] + | .[:5] + ') if [[ -n "$ERROR_EVENTS" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Found error/warning events:" echo "$ERROR_EVENTS" | sed 's/^/ /' - # Check for specific ALB errors in all error events ALL_ERROR_MESSAGES=$(echo "$INGRESS_EVENTS_JSON" | jq -r '.[] | select(.type == "Warning" or .type == "Error") | .message' 2>/dev/null) if echo "$ALL_ERROR_MESSAGES" | grep -qi "failed to reconcile"; then print_error " Issue: Failed to reconcile ingress" + DETECTED_PROBLEMS=$(echo "$DETECTED_PROBLEMS" | jq '. + ["failed_to_reconcile"]') fi - if echo "$ALL_ERROR_MESSAGES" | grep -qi "no available ip\|insufficient.*address"; then print_error " Issue: No available IPs in subnet (see alb_capacity_check)" + DETECTED_PROBLEMS=$(echo "$DETECTED_PROBLEMS" | jq '. + ["subnet_ip_exhaustion"]') fi - if echo "$ALL_ERROR_MESSAGES" | grep -qi "certificate\|tls.*secret"; then print_error " Issue: Certificate problem detected (see ingress_tls_configuration)" + DETECTED_PROBLEMS=$(echo "$DETECTED_PROBLEMS" | jq '. 
+ ["certificate_issue"]') fi + + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --argjson problems "$DETECTED_PROBLEMS" --argjson events "$LATEST_ERROR_EVENTS" \ + '{ingress: $ing, issue: "sync_errors", detected_problems: $problems, recent_events: $events}') + add_fact ISSUE_FACTS "$ISSUE" else print_info " Last event: $EVENT_REASON at $EVENT_TIMESTAMP" fi @@ -81,32 +95,37 @@ for INGRESS_NAME in $INGRESSES; do print_warning " No events found for this ingress" fi - # Check ALB controller logs if pods are found + CONTROLLER_LOG_ERRORS="[]" if [[ -n "$ALB_CONTROLLER_PODS" ]]; then for POD in $ALB_CONTROLLER_PODS; do - # Get recent logs related to this ingress from pre-collected logs LOG_FILE="$ALB_CONTROLLER_LOGS_DIR/${POD}.log" if [[ -f "$LOG_FILE" ]] && [[ -r "$LOG_FILE" ]]; then - # Use tail to limit log size and grep with line-buffered to avoid memory issues - # Skip lines longer than 10000 chars to avoid processing massive JSON lines CONTROLLER_LOGS=$(tail -n 500 "$LOG_FILE" 2>/dev/null | \ awk 'length <= 10000' 2>/dev/null | \ grep -iF "$INGRESS_NAME" 2>/dev/null || true) if [[ -n "$CONTROLLER_LOGS" ]]; then - # Look for errors in controller logs (excluding "successfully built model" info logs) ERROR_LOGS=$(echo "$CONTROLLER_LOGS" | \ grep -ivE "successfully built model|successfully reconciled" 2>/dev/null | \ grep -iE "level.*error|level.*warn|failed|warning" 2>/dev/null | \ head -n 5 || true) if [[ -n "$ERROR_LOGS" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " Found errors in ALB controller logs:" - # Safely print error logs with proper error handling and truncation if ! echo "$ERROR_LOGS" | head -n 3 2>/dev/null | cut -c1-200 2>/dev/null | sed 's/^/ /' 2>/dev/null; then print_warning " [Error logs could not be displayed due to formatting issues]" fi + + # Add up to 3 truncated error log lines to facts + TRUNC_LOGS=$(echo "$ERROR_LOGS" | head -n 3 | cut -c1-200 | jq -R . | jq -s .) 
+ CONTROLLER_LOG_ERRORS=$(echo "$CONTROLLER_LOG_ERRORS" | jq --arg pod "$POD" --argjson lines "$TRUNC_LOGS" \ + '. + [{pod: $pod, lines: $lines}]') + + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg pod "$POD" --argjson lines "$TRUNC_LOGS" \ + '{ingress: $ing, issue: "controller_log_errors", controller_pod: $pod, log_lines: $lines}') + add_fact ISSUE_FACTS "$ISSUE" else print_success " No errors in ALB controller logs for this ingress" fi @@ -119,22 +138,46 @@ for INGRESS_NAME in $INGRESSES; do done fi - # Check ingress status/address from pre-collected data INGRESS_ADDRESS=$(jq -r --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name) | .status.loadBalancer.ingress[0].hostname // empty' "$INGRESSES_FILE" 2>/dev/null) if [[ -z "$INGRESS_ADDRESS" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " ALB address not assigned yet (sync may be in progress or failing)" print_action "Check ingress controller logs and verify backend services are healthy" + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" '{ingress: $ing, issue: "alb_address_not_assigned"}') + add_fact ISSUE_FACTS "$ISSUE" else print_success " ALB address assigned: $INGRESS_ADDRESS" fi + + INGRESS_FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg addr "$INGRESS_ADDRESS" \ + --argjson event "$EVENT_SUMMARY" --argjson detected "$DETECTED_PROBLEMS" \ + --argjson controller_errors "$CONTROLLER_LOG_ERRORS" --argjson recent "$LATEST_ERROR_EVENTS" \ + '{ingress: $ing, address: (if $addr == "" then null else $addr end), latest_event: $event, detected_problems: $detected, controller_log_errors: $controller_errors, recent_error_events: $recent}') + add_fact INGRESS_FACTS "$INGRESS_FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "All $INGRESS_COUNT 
ingress(es) synchronized successfully with controller" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All $INGRESS_COUNT ingress(es) synchronized successfully" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array INGRESS_FACTS)" --argjson count "$INGRESS_COUNT" '{ingress_count: $count, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) have controller sync issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array INGRESS_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$INGRESS_COUNT" \ + '{ingress_count: $count, issue_count: ($issues | length), ingresses: $facts, issues: $issues}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Check ingress controller logs and verify backend services are healthy"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/networking/ingress_existence b/k8s/diagnose/networking/ingress_existence index 48654bcc..a14871ec 100644 --- a/k8s/diagnose/networking/ingress_existence +++ b/k8s/diagnose/networking/ingress_existence @@ -2,24 +2,39 @@ # Check: Ingress Existence # Verifies that ingress resources exist in the namespace -# Read ingresses from pre-collected data INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') if [[ -z "$INGRESSES" ]]; then print_error "No ingresses found with labels $SCOPE_LABEL_SELECTOR in namespace $NAMESPACE" print_action "Create ingress resource to expose services externally" - update_check_result --status "failed" --evidence "{}" + + EVIDENCE=$(evidence_json \ + "No ingresses found in namespace $NAMESPACE" \ + "critical" \ + "[]" \ + "$(jq -nc --arg ls "$SCOPE_LABEL_SELECTOR" 
--arg ns "$NAMESPACE" '{label_selector: $ls, namespace: $ns}')" \ + '["Create ingress resource to expose services externally"]') + update_check_result --status "failed" --evidence "$EVIDENCE" return 1 fi -INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') print_success "Found $INGRESS_COUNT ingress(es): $INGRESSES" +# Build hosts info per ingress +INGRESS_DETAILS=$(jq -c '[.items[] | {name: .metadata.name, hosts: [.spec.rules[]?.host // empty]}]' "$INGRESSES_FILE" 2>/dev/null) + # Show basic ingress info for INGRESS_NAME in $INGRESSES; do - # Get hosts from pre-collected data HOSTS=$(jq -r --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name) | .spec.rules[].host' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') print_info " $INGRESS_NAME hosts: $HOSTS" done -update_check_result --status "success" --evidence "{}" \ No newline at end of file +EVIDENCE=$(evidence_json \ + "Found $INGRESS_COUNT ingress(es) in namespace $NAMESPACE" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$INGRESS_COUNT" --argjson ingresses "$INGRESS_DETAILS" --arg ns "$NAMESPACE" \ + '{ingress_count: $count, ingresses: $ingresses, namespace: $ns}')" \ + "[]") +update_check_result --status "success" --evidence "$EVIDENCE" diff --git a/k8s/diagnose/networking/ingress_host_rules b/k8s/diagnose/networking/ingress_host_rules index 25fd223f..09e6b89b 100644 --- a/k8s/diagnose/networking/ingress_host_rules +++ b/k8s/diagnose/networking/ingress_host_rules @@ -2,20 +2,20 @@ # Check: Ingress Host Rules # Verifies host and path rules are properly configured -# Validate ingresses exist require_ingresses || return 0 -# Get ingresses INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +INGRESS_FACTS=() +AFFECTED_INGRESSES="" + for INGRESS_NAME in $INGRESSES; do INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null) 
print_info "Checking host rules for ingress: $INGRESS_NAME" - # Get ingress address/status INGRESS_ADDRESS=$(echo "$INGRESS_INFO" | jq -r '.status.loadBalancer.ingress[0].ip // .status.loadBalancer.ingress[0].hostname // empty') if [[ -z "$INGRESS_ADDRESS" ]]; then @@ -24,84 +24,121 @@ for INGRESS_NAME in $INGRESSES; do print_info " Ingress address: $INGRESS_ADDRESS" fi - # Check if there are any rules RULE_COUNT=$(echo "$INGRESS_INFO" | jq '.spec.rules | length' 2>/dev/null) + DEFAULT_BACKEND=$(echo "$INGRESS_INFO" | jq -r '.spec.defaultBackend.service.name // empty') - if [[ "$RULE_COUNT" -eq 0 ]]; then - # Check for default backend - DEFAULT_BACKEND=$(echo "$INGRESS_INFO" | jq -r '.spec.defaultBackend.service.name // empty') + INGRESS_RULE_FACTS=() + INGRESS_HAS_ISSUE=0 + if [[ "$RULE_COUNT" -eq 0 ]]; then if [[ -n "$DEFAULT_BACKEND" ]]; then print_success " Catch-all rule using default backend: $DEFAULT_BACKEND" else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " No rules and no default backend configured" print_action "Add at least one rule or configure default backend" - fi - continue - fi - - # Check each rule - RULES=$(echo "$INGRESS_INFO" | jq -c '.spec.rules[]' 2>/dev/null) - # Use process substitution to avoid subshell and preserve HAS_ISSUES updates - while read -r RULE; do - HOST=$(echo "$RULE" | jq -r '.host // "*"') - - # Check if host is defined - if [[ "$HOST" == "*" ]]; then - print_warning " Host: * (catch-all, consider specifying a hostname)" - else - print_success " Host: $HOST" + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" '{ingress: $ing, issue: "no_rules_no_default_backend"}') + add_fact ISSUE_FACTS "$ISSUE" fi + else + RULES=$(echo "$INGRESS_INFO" | jq -c '.spec.rules[]' 2>/dev/null) - # Check paths - PATHS=$(echo "$RULE" | jq -c '.http.paths[]' 2>/dev/null) + while read -r RULE; do + HOST=$(echo "$RULE" | jq -r '.host // "*"') - if [[ -z "$PATHS" ]]; then - HAS_ISSUES=1 - print_error " No 
paths defined for host $HOST" - print_action "Define at least one path for this host" - continue - fi + if [[ "$HOST" == "*" ]]; then + print_warning " Host: * (catch-all, consider specifying a hostname)" + else + print_success " Host: $HOST" + fi - # Use process substitution to avoid subshell and preserve HAS_ISSUES updates - while read -r PATH_RULE; do - PATH_VALUE=$(echo "$PATH_RULE" | jq -r '.path // "/"') - PATH_TYPE=$(echo "$PATH_RULE" | jq -r '.pathType // "Prefix"') - BACKEND_SERVICE=$(echo "$PATH_RULE" | jq -r '.backend.service.name') - BACKEND_PORT=$(echo "$PATH_RULE" | jq -r '.backend.service.port.number // .backend.service.port.name') + PATHS=$(echo "$RULE" | jq -c '.http.paths[]' 2>/dev/null) + HOST_PATH_FACTS=() - print_info " Path: $PATH_VALUE ($PATH_TYPE) -> $BACKEND_SERVICE:$BACKEND_PORT" + if [[ -z "$PATHS" ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + print_error " No paths defined for host $HOST" + print_action "Define at least one path for this host" - # Validate pathType - if [[ "$PATH_TYPE" != "Exact" && "$PATH_TYPE" != "Prefix" && "$PATH_TYPE" != "ImplementationSpecific" ]]; then - HAS_ISSUES=1 - print_error " Invalid pathType: $PATH_TYPE (must be Exact, Prefix, or ImplementationSpecific)" - print_action "Use valid pathType value" - fi + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg host "$HOST" \ + '{ingress: $ing, host: $host, issue: "no_paths_for_host"}') + add_fact ISSUE_FACTS "$ISSUE" - # Warn about path conventions - if [[ "$PATH_TYPE" == "Prefix" && "$PATH_VALUE" != "/" && ! 
"$PATH" =~ ^/.*[^/]$ ]]; then - print_warning " Path ends with '/' - this may cause routing issues with Prefix type" + RULE_FACT=$(jq -nc --arg host "$HOST" '{host: $host, paths: []}') + add_fact INGRESS_RULE_FACTS "$RULE_FACT" + continue fi - done < <(echo "$PATHS") - done < <(echo "$RULES") - # Check for conflicting rules - HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.rules[].host' 2>/dev/null | sort) - DUPLICATE_HOSTS=$(echo "$HOSTS" | uniq -d) - - if [[ -n "$DUPLICATE_HOSTS" ]]; then - print_warning " Duplicate host rules found: $DUPLICATE_HOSTS" - print_info " Multiple path rules for the same host are OK, but verify they don't conflict" + while read -r PATH_RULE; do + PATH_VALUE=$(echo "$PATH_RULE" | jq -r '.path // "/"') + PATH_TYPE=$(echo "$PATH_RULE" | jq -r '.pathType // "Prefix"') + BACKEND_SERVICE=$(echo "$PATH_RULE" | jq -r '.backend.service.name') + BACKEND_PORT=$(echo "$PATH_RULE" | jq -r '.backend.service.port.number // .backend.service.port.name') + + print_info " Path: $PATH_VALUE ($PATH_TYPE) -> $BACKEND_SERVICE:$BACKEND_PORT" + + PATH_VALID=true + if [[ "$PATH_TYPE" != "Exact" && "$PATH_TYPE" != "Prefix" && "$PATH_TYPE" != "ImplementationSpecific" ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + PATH_VALID=false + print_error " Invalid pathType: $PATH_TYPE (must be Exact, Prefix, or ImplementationSpecific)" + print_action "Use valid pathType value" + + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg host "$HOST" --arg path "$PATH_VALUE" --arg pt "$PATH_TYPE" \ + '{ingress: $ing, host: $host, path: $path, path_type: $pt, issue: "invalid_path_type"}') + add_fact ISSUE_FACTS "$ISSUE" + fi + + PATH_FACT=$(jq -nc --arg path "$PATH_VALUE" --arg pt "$PATH_TYPE" \ + --arg svc "$BACKEND_SERVICE" --arg port "$BACKEND_PORT" --argjson valid "$PATH_VALID" \ + '{path: $path, path_type: $pt, backend_service: $svc, backend_port: $port, valid: $valid}') + add_fact HOST_PATH_FACTS "$PATH_FACT" + done < <(echo "$PATHS") + + 
RULE_FACT=$(jq -nc --arg host "$HOST" --argjson paths "$HOST_PATH_FACTS" \ + '{host: $host, paths: $paths}') + add_fact INGRESS_RULE_FACTS "$RULE_FACT" + done < <(echo "$RULES") + + HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.rules[].host' 2>/dev/null | sort) + DUPLICATE_HOSTS=$(echo "$HOSTS" | uniq -d) + + if [[ -n "$DUPLICATE_HOSTS" ]]; then + print_warning " Duplicate host rules found: $DUPLICATE_HOSTS" + print_info " Multiple path rules for the same host are OK, but verify they don't conflict" + fi fi + + INGRESS_FACT=$(jq -nc --arg ing "$INGRESS_NAME" --arg addr "$INGRESS_ADDRESS" --arg backend "$DEFAULT_BACKEND" \ + --argjson rules "$INGRESS_RULE_FACTS" \ + '{ingress: $ing, address: $addr, default_backend: (if $backend == "" then null else $backend end), rules: $rules}') + add_fact INGRESS_FACTS "$INGRESS_FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "Host and path rules valid for all $INGRESS_COUNT ingress(es)" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "Host and path rules valid for all $INGRESS_COUNT ingress(es)" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array INGRESS_FACTS)" --argjson count "$INGRESS_COUNT" '{ingress_count: $count, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) have host/path rule issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array INGRESS_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$INGRESS_COUNT" \ + '{ingress_count: $count, issue_count: ($issues | length), ingresses: $facts, issues: $issues}') + 
EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Add at least one rule or configure default backend; use valid pathType values"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/networking/ingress_tls_configuration b/k8s/diagnose/networking/ingress_tls_configuration index 1062a5d0..1ffdc893 100644 --- a/k8s/diagnose/networking/ingress_tls_configuration +++ b/k8s/diagnose/networking/ingress_tls_configuration @@ -2,92 +2,126 @@ # Check: Ingress TLS Configuration # Validates TLS/SSL certificate configuration -# Validate ingresses exist require_ingresses || return 0 -# Get ingresses INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +TLS_FACTS=() +AFFECTED_INGRESSES="" + for INGRESS_NAME in $INGRESSES; do INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null) - # Check if TLS is configured TLS_HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.tls[]?.hosts[]?' 
2>/dev/null) if [[ -z "$TLS_HOSTS" ]]; then print_info "Ingress $INGRESS_NAME: No TLS configuration (HTTP only)" + FACT=$(jq -nc --arg ing "$INGRESS_NAME" '{ingress: $ing, tls_configured: false}') + add_fact TLS_FACTS "$FACT" continue fi print_info "Checking TLS configuration for ingress: $INGRESS_NAME" - - # Get TLS secrets TLS_SECRETS=$(echo "$INGRESS_INFO" | jq -r '.spec.tls[] | "\(.secretName):\(.hosts | join(","))"' 2>/dev/null) - # Use process substitution to avoid subshell and preserve HAS_ISSUES updates + SECRETS_INFO="[]" + INGRESS_HAS_ISSUE=0 + while IFS=':' read -r SECRET_NAME HOSTS; do - # Check if secret exists in pre-collected data SECRET_INFO=$(jq --arg name "$SECRET_NAME" '.items[] | select(.metadata.name == $name)' "$SECRETS_FILE" 2>/dev/null) if [[ -n "$SECRET_INFO" && "$SECRET_INFO" != "null" ]]; then SECRET_TYPE=$(echo "$SECRET_INFO" | jq -r '.type') if [[ "$SECRET_TYPE" == "kubernetes.io/tls" ]]; then - # Check if secret has required keys (metadata only, no actual data) - SECRET_KEYS=$(echo "$SECRET_INFO" | jq -r '.metadata.annotations | keys[]' 2>/dev/null) - - HAS_CERT=$(echo "$SECRET_KEYS" | grep -q "tls.crt" && echo "yes" || echo "no") - HAS_KEY=$(echo "$SECRET_KEYS" | grep -q "tls.key" && echo "yes" || echo "no") - - if [[ "$HAS_CERT" == "yes" && "$HAS_KEY" == "yes" ]]; then - print_success " TLS Secret: $SECRET_NAME (valid for hosts: $HOSTS)" - - # Optional: Check certificate expiration (requires openssl) - if command -v openssl &>/dev/null; then - CERT_DATA=$(kubectl get secret "$SECRET_NAME" -n "$NAMESPACE" -o jsonpath='{.data.tls\.crt}' 2>/dev/null | base64 -d 2>/dev/null) - - if [[ -n "$CERT_DATA" ]]; then - EXPIRY=$(echo "$CERT_DATA" | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2) - - if [[ -n "$EXPIRY" ]]; then - print_info " Certificate expires: $EXPIRY" - - # Check if certificate is expired or expiring soon (30 days) - EXPIRY_EPOCH=$(date -d "$EXPIRY" +%s 2>/dev/null || date -j -f "%b %d %T %Y %Z" "$EXPIRY" +%s 
2>/dev/null) - CURRENT_EPOCH=$(date +%s) - DAYS_UNTIL_EXPIRY=$(( ($EXPIRY_EPOCH - $CURRENT_EPOCH) / 86400 )) - - if [[ $DAYS_UNTIL_EXPIRY -lt 0 ]]; then - HAS_ISSUES=1 - print_error " Certificate has EXPIRED" - elif [[ $DAYS_UNTIL_EXPIRY -lt 30 ]]; then - print_warning " Certificate expires in $DAYS_UNTIL_EXPIRY days" - fi + # Note: build_context strips .data for security, so we cannot inspect cert/key keys here. + # The Secret type kubernetes.io/tls inherently requires tls.crt and tls.key — k8s validates + # this at creation time. So we trust the type and validate the rest live if needed. + print_success " TLS Secret: $SECRET_NAME (valid for hosts: $HOSTS)" + + EXPIRY_INFO="null" + DAYS_UNTIL_EXPIRY="" + + if command -v openssl &>/dev/null; then + CERT_DATA=$(kubectl get secret "$SECRET_NAME" -n "$NAMESPACE" -o jsonpath='{.data.tls\.crt}' 2>/dev/null | base64 -d 2>/dev/null) + if [[ -n "$CERT_DATA" ]]; then + EXPIRY=$(echo "$CERT_DATA" | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2) + if [[ -n "$EXPIRY" ]]; then + print_info " Certificate expires: $EXPIRY" + EXPIRY_EPOCH=$(date -d "$EXPIRY" +%s 2>/dev/null || date -j -f "%b %d %T %Y %Z" "$EXPIRY" +%s 2>/dev/null) + CURRENT_EPOCH=$(date +%s) + DAYS_UNTIL_EXPIRY=$(( (EXPIRY_EPOCH - CURRENT_EPOCH) / 86400 )) + EXPIRY_INFO=$(jq -nc --arg expiry "$EXPIRY" --argjson days "$DAYS_UNTIL_EXPIRY" \ + '{expires: $expiry, days_remaining: $days}') + + if [[ $DAYS_UNTIL_EXPIRY -lt 0 ]]; then + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 + print_error " Certificate has EXPIRED" + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg secret "$SECRET_NAME" \ + '{ingress: $ing, secret: $secret, issue: "certificate_expired"}') + add_fact ISSUE_FACTS "$ISSUE" + elif [[ $DAYS_UNTIL_EXPIRY -lt 30 ]]; then + print_warning " Certificate expires in $DAYS_UNTIL_EXPIRY days" fi fi fi - else - HAS_ISSUES=1 - print_error " TLS Secret: $SECRET_NAME missing required keys (needs tls.crt and tls.key)" fi + + 
SECRET_FACT=$(jq -nc --arg secret "$SECRET_NAME" --arg hosts "$HOSTS" --arg type "$SECRET_TYPE" \ + --argjson expiry "$EXPIRY_INFO" \ + '{secret: $secret, hosts: ($hosts | split(",")), type: $type, valid: true, certificate: $expiry}') else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " TLS Secret: $SECRET_NAME has wrong type '$SECRET_TYPE' (expected kubernetes.io/tls)" + SECRET_FACT=$(jq -nc --arg secret "$SECRET_NAME" --arg type "$SECRET_TYPE" \ + '{secret: $secret, type: $type, valid: false, issue: "wrong_secret_type"}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg secret "$SECRET_NAME" --arg type "$SECRET_TYPE" \ + '{ingress: $ing, secret: $secret, secret_type: $type, issue: "wrong_secret_type"}') + add_fact ISSUE_FACTS "$ISSUE" fi else - HAS_ISSUES=1 + mark_affected AFFECTED_INGRESSES "$INGRESS_NAME" + INGRESS_HAS_ISSUE=1 print_error " TLS Secret: '$SECRET_NAME' not found in namespace" print_action "Create TLS secret or update ingress configuration" + SECRET_FACT=$(jq -nc --arg secret "$SECRET_NAME" \ + '{secret: $secret, valid: false, issue: "secret_not_found"}') + ISSUE=$(jq -nc --arg ing "$INGRESS_NAME" --arg secret "$SECRET_NAME" \ + '{ingress: $ing, secret: $secret, issue: "secret_not_found"}') + add_fact ISSUE_FACTS "$ISSUE" fi + add_fact SECRETS_INFO "$SECRET_FACT" done < <(echo "$TLS_SECRETS") + + FACT=$(jq -nc --arg ing "$INGRESS_NAME" --argjson secrets "$SECRETS_INFO" \ + '{ingress: $ing, tls_configured: true, secrets: $secrets}') + add_fact TLS_FACTS "$FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - INGRESS_COUNT=$(echo "$INGRESSES" | wc -w) +INGRESS_COUNT=$(echo "$INGRESSES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_INGRESSES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "TLS configuration valid for all $INGRESS_COUNT ingress(es)" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "TLS configuration valid for all 
$INGRESS_COUNT ingress(es)" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array TLS_FACTS)" --argjson count "$INGRESS_COUNT" '{ingress_count: $count, ingresses: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $INGRESS_COUNT ingress(es) have TLS issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array TLS_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$INGRESS_COUNT" \ + '{ingress_count: $count, issue_count: ($issues | length), ingresses: $facts, issues: $issues}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_INGRESSES)" "$DETAILS" \ + '["Create or fix TLS secrets and ensure they are of type kubernetes.io/tls"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/scope/container_crash_detection b/k8s/diagnose/scope/container_crash_detection index 8a8121c3..04618b7a 100644 --- a/k8s/diagnose/scope/container_crash_detection +++ b/k8s/diagnose/scope/container_crash_detection @@ -8,70 +8,140 @@ require_pods || return 0 # Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_CRASHES=0 +CRASH_LOOP_FACTS=() +TERMINATED_FACTS=() +HIGH_RESTART_FACTS=() +AFFECTED_PODS="" + +HAS_ACTION_CHECK_LOGS=0 +HAS_ACTION_CHECK_TERMINATION=0 +HAS_ACTION_CHECK_INTERMITTENT=0 + +NUM_OOM=0 +NUM_APP_ERROR=0 for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) - # Check for containers in crash states + # ----- CrashLoopBackOff ----- CRASH_LOOP=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? 
| select(.state.waiting.reason == "CrashLoopBackOff") | .name') if [[ -n "$CRASH_LOOP" ]]; then - HAS_CRASHES=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: CrashLoopBackOff in container(s): $CRASH_LOOP" for CONTAINER in $CRASH_LOOP; do RESTART_COUNT=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .restartCount") EXIT_CODE=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .lastState.terminated.exitCode // \"N/A\"") TERMINATION_REASON=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .lastState.terminated.reason // \"Unknown\"") + EXIT_MEANING=$(exit_code_meaning "$EXIT_CODE") print_warning " Container: $CONTAINER | Restarts: $RESTART_COUNT | Exit Code: $EXIT_CODE | Reason: $TERMINATION_REASON" - - case "$EXIT_CODE" in - 137) print_warning " Exit 137 = OOMKilled (out of memory)" ;; - 143) print_warning " Exit 143 = SIGTERM (graceful termination)" ;; - 1) print_warning " Exit 1 = Application error" ;; - 139) print_warning " Exit 139 = SIGSEGV (segmentation fault)" ;; - esac + [[ "$EXIT_MEANING" != "Unknown" ]] && print_warning " Exit $EXIT_CODE = $EXIT_MEANING" + + [[ "$EXIT_CODE" == "137" ]] && NUM_OOM=$((NUM_OOM + 1)) + [[ "$EXIT_CODE" == "1" ]] && NUM_APP_ERROR=$((NUM_APP_ERROR + 1)) + + # CrashLoopBackOff: the useful logs are from the previous container + # instance. The current instance is in waiting state with no logs yet. 
+ PREVIOUS_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER" "previous") + CURRENT_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER" "current") + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg container "$CONTAINER" \ + --argjson restarts "${RESTART_COUNT:-0}" \ + --arg exit_code "$EXIT_CODE" \ + --arg exit_meaning "$EXIT_MEANING" \ + --arg reason "$TERMINATION_REASON" \ + --argjson previous_logs "$PREVIOUS_LOGS" \ + --argjson current_logs "$CURRENT_LOGS" \ + '{ + pod: $pod, + container: $container, + restart_count: $restarts, + exit_code: $exit_code, + exit_code_meaning: $exit_meaning, + termination_reason: $reason, + previous_logs: $previous_logs, + current_logs: $current_logs + }') + add_fact CRASH_LOOP_FACTS "$FACT" done print_info "Last logs from $POD_NAME:" - kubectl logs "$POD_NAME" -n "$NAMESPACE" --tail=10 2>&1 | sed 's/^/ /' + # Logs are now pre-collected by build_context for problematic pods. + # Fall back to a live kubectl call only if the snapshot is missing. + PRINTED_LOGS=0 + for CONTAINER in $CRASH_LOOP; do + LOG_FILE="$POD_LOGS_DIR/${POD_NAME}.${CONTAINER}.log" + if [[ -s "$LOG_FILE" ]]; then + tail -n 10 "$LOG_FILE" | sed "s|^| [$CONTAINER] |" + PRINTED_LOGS=1 + fi + done + if [[ $PRINTED_LOGS -eq 0 ]]; then + kubectl logs "$POD_NAME" -n "$NAMESPACE" --tail=10 2>&1 | sed 's/^/ /' + fi print_action "Check container logs and fix application startup issues" + HAS_ACTION_CHECK_LOGS=1 fi - # Check for containers that terminated but haven't restarted yet + # ----- Terminated (state.terminated) ----- TERMINATED_CONTAINERS=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? 
| select(.state.terminated) | .name') if [[ -n "$TERMINATED_CONTAINERS" ]]; then - HAS_CRASHES=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: Terminated container(s): $TERMINATED_CONTAINERS" for CONTAINER in $TERMINATED_CONTAINERS; do EXIT_CODE=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .state.terminated.exitCode // \"N/A\"") TERMINATION_REASON=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .state.terminated.reason // \"Unknown\"") RESTART_COUNT=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .restartCount") + EXIT_MEANING=$(exit_code_meaning "$EXIT_CODE") print_warning " Container: $CONTAINER | Exit Code: $EXIT_CODE | Reason: $TERMINATION_REASON | Restarts: $RESTART_COUNT" - - case "$EXIT_CODE" in - 137) print_warning " Exit 137 = OOMKilled (out of memory)" ;; - 143) print_warning " Exit 143 = SIGTERM (graceful termination)" ;; - 1) print_warning " Exit 1 = Application error" ;; - 139) print_warning " Exit 139 = SIGSEGV (segmentation fault)" ;; - 0) print_info " Exit 0 = Clean exit (container finished successfully)" ;; - esac + [[ "$EXIT_MEANING" != "Unknown" ]] && print_warning " Exit $EXIT_CODE = $EXIT_MEANING" + + [[ "$EXIT_CODE" == "137" ]] && NUM_OOM=$((NUM_OOM + 1)) + [[ "$EXIT_CODE" == "1" ]] && NUM_APP_ERROR=$((NUM_APP_ERROR + 1)) + + # state.terminated (not waiting): current logs are from the still- + # terminated instance; previous matters if the container restarted before. 
+ CURRENT_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER" "current") + PREVIOUS_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER" "previous") + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg container "$CONTAINER" \ + --argjson restarts "${RESTART_COUNT:-0}" \ + --arg exit_code "$EXIT_CODE" \ + --arg exit_meaning "$EXIT_MEANING" \ + --arg reason "$TERMINATION_REASON" \ + --argjson current_logs "$CURRENT_LOGS" \ + --argjson previous_logs "$PREVIOUS_LOGS" \ + '{ + pod: $pod, + container: $container, + restart_count: $restarts, + exit_code: $exit_code, + exit_code_meaning: $exit_meaning, + termination_reason: $reason, + current_logs: $current_logs, + previous_logs: $previous_logs + }') + add_fact TERMINATED_FACTS "$FACT" done print_action "Check why container terminated and review logs" + HAS_ACTION_CHECK_TERMINATION=1 fi - # Check for containers with high restart counts (even if currently running) + # ----- High restart count (currently running but unstable) ----- HIGH_RESTART_CONTAINERS=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? | select(.restartCount >= 3) | "\(.name):\(.restartCount)"') if [[ -n "$HIGH_RESTART_CONTAINERS" ]]; then - HAS_CRASHES=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_warning "Pod $POD_NAME: Container(s) with high restart count:" while IFS= read -r CONTAINER_INFO; do @@ -80,18 +150,98 @@ for POD_NAME in $PODS; do LAST_EXIT_CODE=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER_NAME\") | .lastState.terminated.exitCode // \"N/A\"") LAST_REASON=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER_NAME\") | .lastState.terminated.reason // \"Unknown\"") + LAST_EXIT_MEANING=$(exit_code_meaning "$LAST_EXIT_CODE") print_warning " Container: $CONTAINER_NAME | Restarts: $RESTART_COUNT | Last Exit: $LAST_EXIT_CODE | Reason: $LAST_REASON" + + # High restart count: container running OK now but crashed multiple + # times before. Previous logs are from the most recent crash. 
+ PREVIOUS_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER_NAME" "previous") + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg container "$CONTAINER_NAME" \ + --argjson restarts "${RESTART_COUNT:-0}" \ + --arg last_exit "$LAST_EXIT_CODE" \ + --arg last_meaning "$LAST_EXIT_MEANING" \ + --arg last_reason "$LAST_REASON" \ + --argjson previous_logs "$PREVIOUS_LOGS" \ + '{ + pod: $pod, + container: $container, + restart_count: $restarts, + last_exit_code: $last_exit, + last_exit_code_meaning: $last_meaning, + last_termination_reason: $last_reason, + previous_logs: $previous_logs + }') + add_fact HIGH_RESTART_FACTS "$FACT" done <<< "$HIGH_RESTART_CONTAINERS" print_action "Container has restarted multiple times - check for intermittent issues" + HAS_ACTION_CHECK_INTERMITTENT=1 fi done -if [[ $HAS_CRASHES -eq 0 ]]; then - POD_COUNT=$(echo "$PODS" | wc -w) +POD_COUNT=$(jq '.items | length' "$PODS_FILE") +NUM_CRASH_LOOP=${#CRASH_LOOP_FACTS[@]} +NUM_TERMINATED=${#TERMINATED_FACTS[@]} +NUM_HIGH_RESTART=${#HIGH_RESTART_FACTS[@]} +TOTAL_FINDINGS=$((NUM_CRASH_LOOP + NUM_TERMINATED + NUM_HIGH_RESTART)) +AFFECTED_COUNT=$(echo $AFFECTED_PODS | wc -w | tr -d ' ') + +CRASH_LOOP_FACTS_JSON=$(facts_to_json_array CRASH_LOOP_FACTS) +TERMINATED_FACTS_JSON=$(facts_to_json_array TERMINATED_FACTS) +HIGH_RESTART_FACTS_JSON=$(facts_to_json_array HIGH_RESTART_FACTS) +AFFECTED_PODS_JSON=$(set_to_json_array AFFECTED_PODS) + +if [[ $TOTAL_FINDINGS -eq 0 ]]; then print_success "All $POD_COUNT pod(s) running without crashes or errors" - update_check_result --status "success" --evidence "{}" + + EVIDENCE=$(evidence_json \ + "All $POD_COUNT pod(s) running without crashes or errors" \ + "info" \ + "[]" \ + "$(jq -nc --argjson total "$POD_COUNT" '{pods_checked: $total}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" + SUMMARY_PARTS=() + [[ $NUM_OOM -gt 0 ]] && SUMMARY_PARTS+=("$NUM_OOM OOMKilled") + [[ 
$NUM_APP_ERROR -gt 0 ]] && SUMMARY_PARTS+=("$NUM_APP_ERROR app error") + [[ $NUM_CRASH_LOOP -gt 0 ]] && SUMMARY_PARTS+=("$NUM_CRASH_LOOP CrashLoopBackOff") + [[ $NUM_TERMINATED -gt 0 ]] && SUMMARY_PARTS+=("$NUM_TERMINATED terminated") + [[ $NUM_HIGH_RESTART -gt 0 ]] && SUMMARY_PARTS+=("$NUM_HIGH_RESTART high-restart") + SUMMARY_DETAIL=$(IFS=", "; echo "${SUMMARY_PARTS[*]}") + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) crashing — $SUMMARY_DETAIL" + + ACTIONS_ARR=() + [[ $HAS_ACTION_CHECK_LOGS -eq 1 ]] && ACTIONS_ARR+=("Check container logs and fix application startup issues") + [[ $HAS_ACTION_CHECK_TERMINATION -eq 1 ]] && ACTIONS_ARR+=("Check why container terminated and review logs") + [[ $HAS_ACTION_CHECK_INTERMITTENT -eq 1 ]] && ACTIONS_ARR+=("Container has restarted multiple times - check for intermittent issues") + ACTIONS_JSON=$(printf '%s\n' "${ACTIONS_ARR[@]}" | lines_to_json_array) + + DETAILS=$(jq -nc \ + --argjson crash_loop "$CRASH_LOOP_FACTS_JSON" \ + --argjson terminated "$TERMINATED_FACTS_JSON" \ + --argjson high_restart "$HIGH_RESTART_FACTS_JSON" \ + --argjson pod_count "$POD_COUNT" \ + --argjson oom_count "$NUM_OOM" \ + --argjson app_error_count "$NUM_APP_ERROR" \ + '{ + pod_count: $pod_count, + counts: { + crash_loop_back_off: ($crash_loop | length), + terminated: ($terminated | length), + high_restart: ($high_restart | length), + oom_killed: $oom_count, + application_error: $app_error_count + }, + crash_loop_back_off: $crash_loop, + terminated: $terminated, + high_restart: $high_restart + }') + + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$AFFECTED_PODS_JSON" "$DETAILS" "$ACTIONS_JSON") + update_check_result --status "failed" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/scope/container_port_health b/k8s/diagnose/scope/container_port_health index 78152ee9..37cdc344 100755 --- a/k8s/diagnose/scope/container_port_health +++ b/k8s/diagnose/scope/container_port_health @@ -2,52 +2,54 @@ # Check: Container Port Health # Validates that 
containers are actually listening on their declared ports -# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_PORT_ISSUES=0 CONTAINERS_TESTED=0 CONTAINERS_SKIPPED=0 +ISSUE_FACTS=() +POD_FACTS=() +AFFECTED_PODS="" + for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) - # Check if pod is running POD_PHASE=$(echo "$POD_INFO" | jq -r '.status.phase') if [[ "$POD_PHASE" != "Running" ]]; then print_warning "Pod $POD_NAME: Not running (phase: $POD_PHASE), skipping port checks" + FACT=$(jq -nc --arg p "$POD_NAME" --arg phase "$POD_PHASE" '{pod: $p, status: "skipped", reason: "not_running", phase: $phase}') + add_fact POD_FACTS "$FACT" continue fi - # Get pod IP POD_IP=$(echo "$POD_INFO" | jq -r '.status.podIP') if [[ -z "$POD_IP" || "$POD_IP" == "null" ]]; then print_warning "Pod $POD_NAME: No IP assigned, skipping port checks" + FACT=$(jq -nc --arg p "$POD_NAME" '{pod: $p, status: "skipped", reason: "no_ip"}') + add_fact POD_FACTS "$FACT" continue fi print_info "Checking pod $POD_NAME:" - # Get all containers with their ports CONTAINERS=$(echo "$POD_INFO" | jq -r '.spec.containers[] | @base64') + POD_CONTAINER_FACTS=() for CONTAINER_B64 in $CONTAINERS; do CONTAINER_DATA=$(echo "$CONTAINER_B64" | base64 -d) CONTAINER_NAME=$(echo "$CONTAINER_DATA" | jq -r '.name') - # Check container status before testing ports CONTAINER_STATUS=$(echo "$POD_INFO" | jq -r --arg name "$CONTAINER_NAME" '.status.containerStatuses[]? 
| select(.name == $name)') if [[ -z "$CONTAINER_STATUS" ]]; then print_warning " Container '$CONTAINER_NAME': Status not found, skipping" + CFACT=$(jq -nc --arg c "$CONTAINER_NAME" '{container: $c, status: "skipped", reason: "no_status"}') + add_fact POD_CONTAINER_FACTS "$CFACT" continue fi - # Check if container is ready CONTAINER_READY=$(echo "$CONTAINER_STATUS" | jq -r '.ready') CONTAINER_STATE=$(echo "$CONTAINER_STATUS" | jq -r ' if .state.running then "running" @@ -57,28 +59,26 @@ for POD_NAME in $PODS; do end ') - # Get declared ports for this container CONTAINER_PORTS=$(echo "$CONTAINER_DATA" | jq -r '.ports[]? | .containerPort' | tr '\n' ' ') if [[ -z "$CONTAINER_PORTS" ]]; then print_info " Container '$CONTAINER_NAME': No ports declared" + CFACT=$(jq -nc --arg c "$CONTAINER_NAME" '{container: $c, status: "no_ports_declared"}') + add_fact POD_CONTAINER_FACTS "$CFACT" continue fi print_info " Container '$CONTAINER_NAME':" - # If container is not running, explain why we can't test ports if [[ "$CONTAINER_STATE" != "running" ]]; then if [[ "$CONTAINER_STATE" == "waiting" ]]; then WAITING_REASON=$(echo "$CONTAINER_STATUS" | jq -r '.state.waiting.reason // "Unknown"') WAITING_MESSAGE=$(echo "$CONTAINER_STATUS" | jq -r '.state.waiting.message // ""') - # Check if it's a normal startup state or a problem case "$WAITING_REASON" in ContainerCreating|PodInitializing|Pulling) CONTAINERS_SKIPPED=$((CONTAINERS_SKIPPED + 1)) print_info " Container is starting ($WAITING_REASON) - skipping port checks" - continue ;; CrashLoopBackOff|ImagePullBackOff|ErrImagePull) CONTAINERS_SKIPPED=$((CONTAINERS_SKIPPED + 1)) @@ -87,37 +87,46 @@ for POD_NAME in $PODS; do print_warning " Message: $WAITING_MESSAGE" fi print_action "Fix container startup issues (check container_crash_detection results)" - continue ;; *) CONTAINERS_SKIPPED=$((CONTAINERS_SKIPPED + 1)) print_warning " Container waiting: $WAITING_REASON - skipping port checks" - continue ;; esac + + CFACT=$(jq -nc --arg c 
"$CONTAINER_NAME" --arg reason "$WAITING_REASON" \ + '{container: $c, status: "skipped", state: "waiting", reason: $reason}') + add_fact POD_CONTAINER_FACTS "$CFACT" + continue elif [[ "$CONTAINER_STATE" == "terminated" ]]; then EXIT_CODE=$(echo "$CONTAINER_STATUS" | jq -r '.state.terminated.exitCode // "N/A"') TERMINATION_REASON=$(echo "$CONTAINER_STATUS" | jq -r '.state.terminated.reason // "Unknown"') CONTAINERS_SKIPPED=$((CONTAINERS_SKIPPED + 1)) print_warning " Cannot test ports - container terminated (Exit: $EXIT_CODE, Reason: $TERMINATION_REASON)" print_action "Fix container termination (check container_crash_detection results)" + + CFACT=$(jq -nc --arg c "$CONTAINER_NAME" --arg ec "$EXIT_CODE" --arg reason "$TERMINATION_REASON" \ + '{container: $c, status: "skipped", state: "terminated", exit_code: $ec, termination_reason: $reason}') + add_fact POD_CONTAINER_FACTS "$CFACT" continue else print_warning " Container in unknown state - skipping port checks" + CFACT=$(jq -nc --arg c "$CONTAINER_NAME" '{container: $c, status: "skipped", state: "unknown"}') + add_fact POD_CONTAINER_FACTS "$CFACT" continue fi fi - # Container is running - check if it's ready if [[ "$CONTAINER_READY" != "true" ]]; then print_warning " Container is running but not ready - port connectivity may fail" fi - # Test connectivity to each declared port from agent CONTAINERS_TESTED=$((CONTAINERS_TESTED + 1)) + PORT_RESULTS=() + CONTAINER_HAS_PORT_ISSUE=0 + for PORT in $CONTAINER_PORTS; do - # Try nc first, then timeout + /dev/tcp, then curl if command -v nc >/dev/null 2>&1; then timeout 2 nc -z -w 1 "$POD_IP" "$PORT" >/dev/null 2>&1 CONNECTIVITY_EXIT_CODE=$? @@ -129,33 +138,83 @@ for POD_NAME in $PODS; do CONNECTIVITY_EXIT_CODE=$? else print_warning " Port $PORT: Cannot test (nc/timeout/curl not available in agent)" + PORT_RESULTS=$(echo "$PORT_RESULTS" | jq --argjson port "$PORT" '. 
+ [{port: $port, status: "untestable"}]') continue fi if [[ $CONNECTIVITY_EXIT_CODE -eq 0 ]]; then print_success " Port $PORT: ✓ Listening" + PORT_RESULTS=$(echo "$PORT_RESULTS" | jq --argjson port "$PORT" '. + [{port: $port, status: "listening"}]') else - HAS_PORT_ISSUES=1 + CONTAINER_HAS_PORT_ISSUE=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error " Port $PORT: ✗ Declared but not listening or unreachable" print_action "Check application configuration and ensure it listens on port $PORT" + PORT_RESULTS=$(echo "$PORT_RESULTS" | jq --argjson port "$PORT" '. + [{port: $port, status: "not_listening"}]') + + # The container is running but not listening on its declared + # port — current logs likely show why (binding error, config + # mismatch, app stuck during startup). + CURRENT_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER_NAME" "current") + + ISSUE=$(jq -nc --arg pod "$POD_NAME" --arg c "$CONTAINER_NAME" --argjson port "$PORT" \ + --argjson logs "$CURRENT_LOGS" \ + '{pod: $pod, container: $c, port: $port, issue: "port_not_listening", container_logs: $logs}') + add_fact ISSUE_FACTS "$ISSUE" fi done + + CFACT=$(jq -nc --arg c "$CONTAINER_NAME" --argjson ports "$(facts_to_json_array PORT_RESULTS)" --arg ready "$CONTAINER_READY" \ + --argjson tested true \ + '{container: $c, status: "tested", container_ready: ($ready == "true"), ports: $ports}') + add_fact POD_CONTAINER_FACTS "$CFACT" done + + POD_FACT=$(jq -nc --arg p "$POD_NAME" --arg ip "$POD_IP" --argjson containers "$(facts_to_json_array POD_CONTAINER_FACTS)" \ + '{pod: $p, pod_ip: $ip, containers: $containers}') + add_fact POD_FACTS "$POD_FACT" done echo "" + +DETAILS=$(jq -nc \ + --argjson tested "$CONTAINERS_TESTED" \ + --argjson skipped "$CONTAINERS_SKIPPED" \ + --argjson facts "$(facts_to_json_array POD_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + '{ + containers_tested: $tested, + containers_skipped: $skipped, + issue_count: ($issues | length), + pods: $facts, + issues: $issues + 
}') + if [[ $CONTAINERS_TESTED -eq 0 ]]; then - # No containers were tested - all were skipped print_info "All containers skipped - no port checks could be performed" - update_check_result --status "skipped" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" -elif [[ $HAS_PORT_ISSUES -eq 0 ]]; then - # Some/all containers were tested and all passed + EVIDENCE=$(evidence_json \ + "All containers skipped — no port checks performed" \ + "info" \ + "[]" \ + "$DETAILS" \ + "[]") + update_check_result --status "skipped" --evidence "$EVIDENCE" +elif [[ $(echo "$ISSUE_FACTS" | jq 'length') -eq 0 ]]; then print_success "Port connectivity verified on $CONTAINERS_TESTED container(s)" - update_check_result --status "success" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + EVIDENCE=$(evidence_json \ + "Port connectivity verified on $CONTAINERS_TESTED container(s)" \ + "info" \ + "[]" \ + "$DETAILS" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - # Some containers were tested and had issues if [[ $CONTAINERS_SKIPPED -gt 0 ]]; then print_warning "Port issues found ($CONTAINERS_TESTED tested, $CONTAINERS_SKIPPED skipped)" fi - update_check_result --status "failed" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="$AFFECTED_COUNT pod(s) with port connectivity issues" + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Check application configuration and ensure it listens on declared ports"]') + update_check_result --status "failed" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/scope/health_probe_endpoints b/k8s/diagnose/scope/health_probe_endpoints index a7bfd2c6..b8a32671 100755 --- a/k8s/diagnose/scope/health_probe_endpoints +++ b/k8s/diagnose/scope/health_probe_endpoints @@ -2,10 +2,8 @@ # Check: Health Probe Endpoints # Validates that 
liveness and readiness probe endpoints are configured and responding correctly -# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') HAS_PROBE_ISSUES=0 @@ -13,27 +11,122 @@ HAS_PROBE_WARNINGS=0 CONTAINERS_TESTED=0 CONTAINERS_SKIPPED=0 +PROBE_FACTS=() +ISSUE_FACTS=() +POD_FACTS=() +AFFECTED_PODS="" + + +# Probe a single HTTP endpoint and emit a JSON fact + classification +# Args: pod, container, probe_type (Readiness/Liveness/Startup), path, port, scheme +test_http_probe() { + local pod="$1" container="$2" probe_type="$3" path="$4" port="$5" scheme="$6" + local url="${scheme,,}://$POD_IP:$port$path" + local response="" exit_code=0 + + if command -v curl >/dev/null 2>&1; then + if [[ "${scheme^^}" == "HTTPS" ]]; then + response=$(curl -k -s -o /dev/null -w '%{http_code}' --max-time 2 "$url" 2>&1) + else + response=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 "$url" 2>&1) + fi + exit_code=$? + elif command -v wget >/dev/null 2>&1; then + if [[ "${scheme^^}" == "HTTPS" ]]; then + response=$(wget --no-check-certificate -O /dev/null --timeout=2 "$url" 2>&1) + else + response=$(wget -O /dev/null --timeout=2 "$url" 2>&1) + fi + exit_code=$? + if [[ $exit_code -eq 0 ]]; then + response="200" + else + local err + err=$(echo "$response" | grep -iE "failed:|connection refused|timed? 
?out|cannot connect|unable to|network|unreachable" | head -1) + if [[ -n "$err" ]]; then + response=$(echo "$err" | cut -c1-80) + else + response="wget failed with exit code $exit_code" + fi + fi + else + print_warning " $probe_type Probe on $scheme://$port$path: Cannot test (curl/wget not available in agent)" + PROBE_FACT=$(jq -nc --arg pod "$pod" --arg c "$container" --arg pt "$probe_type" \ + --arg path "$path" --arg port "$port" --arg scheme "$scheme" \ + '{pod: $pod, container: $c, probe_type: $pt, kind: "httpGet", path: $path, port: $port, scheme: $scheme, status: "untestable"}') + add_fact PROBE_FACTS "$PROBE_FACT" + return 2 + fi + + local result_status="" classification="" + if [[ $exit_code -eq 0 && "$response" =~ ^[2-3][0-9][0-9]$ ]]; then + print_success " $probe_type Probe on $scheme://$port$path: ✓ HTTP $response" + result_status="ok" + classification="success" + elif [[ "$response" =~ ^4[0-9][0-9]$ ]]; then + HAS_PROBE_ISSUES=1 + print_error " $probe_type Probe on $scheme://$port$path: ✗ HTTP $response - Health check endpoint not found" + result_status="endpoint_not_found" + classification="critical_4xx" + elif [[ "$response" =~ ^5[0-9][0-9]$ ]]; then + HAS_PROBE_WARNINGS=1 + print_warning " $probe_type Probe on $scheme://$port$path: ⚠ HTTP $response - Application error" + result_status="application_error" + classification="warning_5xx" + else + HAS_PROBE_WARNINGS=1 + print_warning " $probe_type Probe on $scheme://$port$path: ⚠ Connection failed (response: $response, exit code: $exit_code)" + result_status="connection_failed" + classification="warning_connection" + fi + + # On failure, attach the container's current logs to the issue so the AI + # summarizer can correlate "probe says X" with "app logs say Y" without + # any extra calls. 
+ local container_logs="[]" + if [[ "$classification" != "success" ]]; then + container_logs=$(read_log_tail "$pod" "$container" "current") + fi + + PROBE_FACT=$(jq -nc --arg pod "$pod" --arg c "$container" --arg pt "$probe_type" \ + --arg path "$path" --arg port "$port" --arg scheme "$scheme" \ + --arg http_response "$response" --arg status "$result_status" \ + '{pod: $pod, container: $c, probe_type: $pt, kind: "httpGet", path: $path, port: $port, scheme: $scheme, http_response: $http_response, status: $status}') + add_fact PROBE_FACTS "$PROBE_FACT" + + if [[ "$classification" != "success" ]]; then + ISSUE=$(jq -nc --arg pod "$pod" --arg c "$container" --arg pt "$probe_type" \ + --arg path "$path" --arg port "$port" --arg http_response "$response" \ + --arg classification "$classification" \ + --argjson logs "$container_logs" \ + '{pod: $pod, container: $c, probe_type: $pt, path: $path, port: $port, http_response: $http_response, issue: $classification, container_logs: $logs}') + add_fact ISSUE_FACTS "$ISSUE" + return 1 + fi + return 0 +} + for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) - # Check if pod is running POD_PHASE=$(echo "$POD_INFO" | jq -r '.status.phase') if [[ "$POD_PHASE" != "Running" ]]; then print_warning "Pod $POD_NAME: Not running (phase: $POD_PHASE), skipping probe checks" + FACT=$(jq -nc --arg p "$POD_NAME" --arg phase "$POD_PHASE" '{pod: $p, status: "skipped", reason: "not_running", phase: $phase}') + add_fact POD_FACTS "$FACT" continue fi - # Get pod IP POD_IP=$(echo "$POD_INFO" | jq -r '.status.podIP') if [[ -z "$POD_IP" || "$POD_IP" == "null" ]]; then print_warning "Pod $POD_NAME: No IP assigned, skipping probe checks" + FACT=$(jq -nc --arg p "$POD_NAME" '{pod: $p, status: "skipped", reason: "no_ip"}') + add_fact POD_FACTS "$FACT" continue fi print_info "Checking pod $POD_NAME:" - # Get all containers 
CONTAINERS=$(echo "$POD_INFO" | jq -r '.spec.containers[] | @base64') for CONTAINER_B64 in $CONTAINERS; do @@ -42,7 +135,6 @@ for POD_NAME in $PODS; do print_info " Container '$CONTAINER_NAME':" - # Check container status before testing probes CONTAINER_STATUS=$(echo "$POD_INFO" | jq -r --arg name "$CONTAINER_NAME" '.status.containerStatuses[]? | select(.name == $name)') if [[ -z "$CONTAINER_STATUS" ]]; then @@ -50,7 +142,6 @@ for POD_NAME in $PODS; do continue fi - # Check if container is ready CONTAINER_READY=$(echo "$CONTAINER_STATUS" | jq -r '.ready') CONTAINER_STATE=$(echo "$CONTAINER_STATUS" | jq -r ' if .state.running then "running" @@ -60,13 +151,11 @@ for POD_NAME in $PODS; do end ') - # If container is not running, explain why we can't test probes if [[ "$CONTAINER_STATE" != "running" ]]; then if [[ "$CONTAINER_STATE" == "waiting" ]]; then WAITING_REASON=$(echo "$CONTAINER_STATUS" | jq -r '.state.waiting.reason // "Unknown"') WAITING_MESSAGE=$(echo "$CONTAINER_STATUS" | jq -r '.state.waiting.message // ""') - # Check if it's a normal startup state or a problem case "$WAITING_REASON" in ContainerCreating|PodInitializing|Pulling) CONTAINERS_SKIPPED=$((CONTAINERS_SKIPPED + 1)) @@ -101,12 +190,10 @@ for POD_NAME in $PODS; do fi fi - # Container is running - check if it's ready if [[ "$CONTAINER_READY" != "true" ]]; then print_info " Container is running but not ready - probe checks may show why" fi - # Check if container has any probes configured HAS_READINESS=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe // empty') HAS_LIVENESS=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe // empty') HAS_STARTUP=$(echo "$CONTAINER_DATA" | jq -r '.startupProbe // empty') @@ -116,286 +203,97 @@ for POD_NAME in $PODS; do continue fi - # Container has probes and is testable CONTAINERS_TESTED=$((CONTAINERS_TESTED + 1)) - # Track issues for this container to avoid repetitive action messages - CONTAINER_HAS_CONNECTION_ISSUES=0 - CONTAINER_HAS_4XX_ISSUES=0 - 
CONTAINER_HAS_5XX_ISSUES=0 - FAILED_PROBES_LIST="" - - # Check Readiness Probe - if [[ -n "$HAS_READINESS" ]]; then - PROBE_TYPE=$(echo "$CONTAINER_DATA" | jq -r 'if .readinessProbe.httpGet then "httpGet" elif .readinessProbe.tcpSocket then "tcpSocket" elif .readinessProbe.exec then "exec" else "unknown" end') + # Test each probe type that's configured + for PROBE_KIND in readinessProbe livenessProbe startupProbe; do + local_has_probe="" + case "$PROBE_KIND" in + readinessProbe) local_has_probe="$HAS_READINESS"; PROBE_LABEL="Readiness" ;; + livenessProbe) local_has_probe="$HAS_LIVENESS"; PROBE_LABEL="Liveness" ;; + startupProbe) local_has_probe="$HAS_STARTUP"; PROBE_LABEL="Startup" ;; + esac - if [[ "$PROBE_TYPE" == "httpGet" ]]; then - PROBE_PATH=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe.httpGet.path') - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe.httpGet.port') - PROBE_SCHEME=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe.httpGet.scheme // "HTTP"') - PROBE_URL="${PROBE_SCHEME,,}://$POD_IP:$PROBE_PORT$PROBE_PATH" - - # Try curl first from agent, then wget - if command -v curl >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(curl -k -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - elif command -v wget >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(wget --no-check-certificate -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(wget -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - # Parse wget output to extract HTTP status or error - if [[ $PROBE_EXIT_CODE -eq 0 ]]; then - PROBE_RESPONSE="200" - else - # Extract error from wget output - try multiple patterns - ERROR_MSG=$(echo "$PROBE_RESPONSE" | grep -iE "failed:|connection refused|timed? 
?out|cannot connect|unable to|network|unreachable" | head -1) - if [[ -n "$ERROR_MSG" ]]; then - # Shorten the message if too long - PROBE_RESPONSE=$(echo "$ERROR_MSG" | cut -c1-80) - else - # If no specific error found, show exit code - PROBE_RESPONSE="wget failed with exit code $PROBE_EXIT_CODE" - fi - fi - else - print_warning " Readiness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: Cannot test (curl/wget not available in agent)" - continue - fi - - if [[ $PROBE_EXIT_CODE -eq 0 && "$PROBE_RESPONSE" =~ ^[2-3][0-9][0-9]$ ]]; then - print_success " Readiness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✓ HTTP $PROBE_RESPONSE" - else - # Probe failed - check if it's config issue or app issue - if [[ "$PROBE_RESPONSE" =~ ^4[0-9][0-9]$ ]]; then - # 4xx error: endpoint not found or bad config - HAS_PROBE_ISSUES=1 - CONTAINER_HAS_4XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Readiness" - print_error " Readiness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✗ HTTP $PROBE_RESPONSE - Health check endpoint not found" - elif [[ "$PROBE_RESPONSE" =~ ^5[0-9][0-9]$ ]]; then - # 5xx error: app has internal issues - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_5XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Readiness" - print_warning " Readiness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ HTTP $PROBE_RESPONSE - Application error" - else - # Connection failed or other error (port not listening, network issue, etc) - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_CONNECTION_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Readiness" - print_warning " Readiness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ Connection failed (response: $PROBE_RESPONSE, exit code: $PROBE_EXIT_CODE)" - fi - fi - elif [[ "$PROBE_TYPE" == "tcpSocket" ]]; then - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe.tcpSocket.port') - print_info " Readiness Probe: TCP Socket on port $PROBE_PORT (tested in port health check)" - elif [[ "$PROBE_TYPE" == "exec" ]]; then - 
PROBE_COMMAND=$(echo "$CONTAINER_DATA" | jq -r '.readinessProbe.exec.command | join(" ")') - print_info " Readiness Probe: Exec [$PROBE_COMMAND] (cannot test directly)" - fi - fi + [[ -z "$local_has_probe" ]] && continue - # Check Liveness Probe - if [[ -n "$HAS_LIVENESS" ]]; then - PROBE_TYPE=$(echo "$CONTAINER_DATA" | jq -r 'if .livenessProbe.httpGet then "httpGet" elif .livenessProbe.tcpSocket then "tcpSocket" elif .livenessProbe.exec then "exec" else "unknown" end') + PROBE_TYPE=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" 'if .[$p].httpGet then "httpGet" elif .[$p].tcpSocket then "tcpSocket" elif .[$p].exec then "exec" else "unknown" end') if [[ "$PROBE_TYPE" == "httpGet" ]]; then - PROBE_PATH=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe.httpGet.path') - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe.httpGet.port') - PROBE_SCHEME=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe.httpGet.scheme // "HTTP"') - PROBE_URL="${PROBE_SCHEME,,}://$POD_IP:$PROBE_PORT$PROBE_PATH" - - # Try curl first from agent, then wget - if command -v curl >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(curl -k -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - elif command -v wget >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(wget --no-check-certificate -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(wget -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - # Parse wget output to extract HTTP status or error - if [[ $PROBE_EXIT_CODE -eq 0 ]]; then - PROBE_RESPONSE="200" - else - # Extract error from wget output - try multiple patterns - ERROR_MSG=$(echo "$PROBE_RESPONSE" | grep -iE "failed:|connection refused|timed? 
?out|cannot connect|unable to|network|unreachable" | head -1) - if [[ -n "$ERROR_MSG" ]]; then - # Shorten the message if too long - PROBE_RESPONSE=$(echo "$ERROR_MSG" | cut -c1-80) - else - # If no specific error found, show exit code - PROBE_RESPONSE="wget failed with exit code $PROBE_EXIT_CODE" - fi - fi - else - print_warning " Liveness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: Cannot test (curl/wget not available in agent)" - continue - fi - - if [[ $PROBE_EXIT_CODE -eq 0 && "$PROBE_RESPONSE" =~ ^[2-3][0-9][0-9]$ ]]; then - print_success " Liveness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✓ HTTP $PROBE_RESPONSE" - else - # Probe failed - check if it's config issue or app issue - if [[ "$PROBE_RESPONSE" =~ ^4[0-9][0-9]$ ]]; then - # 4xx error: endpoint not found or bad config - HAS_PROBE_ISSUES=1 - CONTAINER_HAS_4XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Liveness" - print_error " Liveness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✗ HTTP $PROBE_RESPONSE - Health check endpoint not found" - elif [[ "$PROBE_RESPONSE" =~ ^5[0-9][0-9]$ ]]; then - # 5xx error: app has internal issues - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_5XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Liveness" - print_warning " Liveness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ HTTP $PROBE_RESPONSE - Application error" - else - # Connection failed or other error (port not listening, network issue, etc) - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_CONNECTION_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Liveness" - print_warning " Liveness Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ Connection failed (response: $PROBE_RESPONSE, exit code: $PROBE_EXIT_CODE)" - fi + PROBE_PATH=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" '.[$p].httpGet.path') + PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" '.[$p].httpGet.port') + PROBE_SCHEME=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" '.[$p].httpGet.scheme // "HTTP"') + + 
test_http_probe "$POD_NAME" "$CONTAINER_NAME" "$PROBE_LABEL" "$PROBE_PATH" "$PROBE_PORT" "$PROBE_SCHEME" + rc=$? + if [[ $rc -eq 1 ]]; then + mark_affected AFFECTED_PODS "$POD_NAME" fi elif [[ "$PROBE_TYPE" == "tcpSocket" ]]; then - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe.tcpSocket.port') - print_info " Liveness Probe: TCP Socket on port $PROBE_PORT (tested in port health check)" + PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" '.[$p].tcpSocket.port') + print_info " $PROBE_LABEL Probe: TCP Socket on port $PROBE_PORT (tested in port health check)" + PROBE_FACT=$(jq -nc --arg pod "$POD_NAME" --arg c "$CONTAINER_NAME" --arg pt "$PROBE_LABEL" --arg port "$PROBE_PORT" \ + '{pod: $pod, container: $c, probe_type: $pt, kind: "tcpSocket", port: $port, status: "not_tested_here"}') + add_fact PROBE_FACTS "$PROBE_FACT" elif [[ "$PROBE_TYPE" == "exec" ]]; then - PROBE_COMMAND=$(echo "$CONTAINER_DATA" | jq -r '.livenessProbe.exec.command | join(" ")') - print_info " Liveness Probe: Exec [$PROBE_COMMAND] (cannot test directly)" + PROBE_COMMAND=$(echo "$CONTAINER_DATA" | jq -r --arg p "$PROBE_KIND" '.[$p].exec.command | join(" ")') + print_info " $PROBE_LABEL Probe: Exec [$PROBE_COMMAND] (cannot test directly)" + PROBE_FACT=$(jq -nc --arg pod "$POD_NAME" --arg c "$CONTAINER_NAME" --arg pt "$PROBE_LABEL" --arg cmd "$PROBE_COMMAND" \ + '{pod: $pod, container: $c, probe_type: $pt, kind: "exec", command: $cmd, status: "untestable"}') + add_fact PROBE_FACTS "$PROBE_FACT" fi - fi - - # Check Startup Probe - if [[ -n "$HAS_STARTUP" ]]; then - PROBE_TYPE=$(echo "$CONTAINER_DATA" | jq -r 'if .startupProbe.httpGet then "httpGet" elif .startupProbe.tcpSocket then "tcpSocket" elif .startupProbe.exec then "exec" else "unknown" end') - - if [[ "$PROBE_TYPE" == "httpGet" ]]; then - PROBE_PATH=$(echo "$CONTAINER_DATA" | jq -r '.startupProbe.httpGet.path') - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.startupProbe.httpGet.port') - PROBE_SCHEME=$(echo 
"$CONTAINER_DATA" | jq -r '.startupProbe.httpGet.scheme // "HTTP"') - PROBE_URL="${PROBE_SCHEME,,}://$POD_IP:$PROBE_PORT$PROBE_PATH" - - # Try curl first from agent, then wget - if command -v curl >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(curl -k -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - elif command -v wget >/dev/null 2>&1; then - if [[ "${PROBE_SCHEME^^}" == "HTTPS" ]]; then - PROBE_RESPONSE=$(wget --no-check-certificate -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - else - PROBE_RESPONSE=$(wget -O /dev/null --timeout=2 "$PROBE_URL" 2>&1) - fi - PROBE_EXIT_CODE=$? - # Parse wget output to extract HTTP status or error - if [[ $PROBE_EXIT_CODE -eq 0 ]]; then - PROBE_RESPONSE="200" - else - # Extract error from wget output - try multiple patterns - ERROR_MSG=$(echo "$PROBE_RESPONSE" | grep -iE "failed:|connection refused|timed? 
?out|cannot connect|unable to|network|unreachable" | head -1) - if [[ -n "$ERROR_MSG" ]]; then - # Shorten the message if too long - PROBE_RESPONSE=$(echo "$ERROR_MSG" | cut -c1-80) - else - # If no specific error found, show exit code - PROBE_RESPONSE="wget failed with exit code $PROBE_EXIT_CODE" - fi - fi - else - print_warning " Startup Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: Cannot test (curl/wget not available in agent)" - continue - fi - - if [[ $PROBE_EXIT_CODE -eq 0 && "$PROBE_RESPONSE" =~ ^[2-3][0-9][0-9]$ ]]; then - print_success " Startup Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✓ HTTP $PROBE_RESPONSE" - else - # Probe failed - check if it's config issue or app issue - if [[ "$PROBE_RESPONSE" =~ ^4[0-9][0-9]$ ]]; then - # 4xx error: endpoint not found or bad config - HAS_PROBE_ISSUES=1 - CONTAINER_HAS_4XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Startup" - print_error " Startup Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ✗ HTTP $PROBE_RESPONSE - Health check endpoint not found" - elif [[ "$PROBE_RESPONSE" =~ ^5[0-9][0-9]$ ]]; then - # 5xx error: app has internal issues - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_5XX_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Startup" - print_warning " Startup Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ HTTP $PROBE_RESPONSE - Application error" - else - # Connection failed or other error (port not listening, network issue, etc) - HAS_PROBE_WARNINGS=1 - CONTAINER_HAS_CONNECTION_ISSUES=1 - FAILED_PROBES_LIST="$FAILED_PROBES_LIST Startup" - print_warning " Startup Probe on $PROBE_SCHEME://$PROBE_PORT$PROBE_PATH: ⚠ Connection failed (response: $PROBE_RESPONSE, exit code: $PROBE_EXIT_CODE)" - fi - fi - elif [[ "$PROBE_TYPE" == "tcpSocket" ]]; then - PROBE_PORT=$(echo "$CONTAINER_DATA" | jq -r '.startupProbe.tcpSocket.port') - print_info " Startup Probe: TCP Socket on port $PROBE_PORT" - elif [[ "$PROBE_TYPE" == "exec" ]]; then - PROBE_COMMAND=$(echo "$CONTAINER_DATA" | jq -r 
'.startupProbe.exec.command | join(" ")') - print_info " Startup Probe: Exec [$PROBE_COMMAND] (cannot test directly)" - fi - fi - - # Print consolidated action message for this container (avoid repetition) - if [[ -n "$FAILED_PROBES_LIST" ]]; then - echo "" - # Trim leading space from the list - FAILED_PROBES_LIST=$(echo "$FAILED_PROBES_LIST" | xargs) - - if [[ $CONTAINER_HAS_CONNECTION_ISSUES -eq 1 ]]; then - print_action "For $FAILED_PROBES_LIST probe(s): Verify port is listening and accessible from within cluster" - fi - - if [[ $CONTAINER_HAS_4XX_ISSUES -eq 1 ]]; then - print_action "For $FAILED_PROBES_LIST probe(s): Update probe path or implement the endpoint in application" - fi - - if [[ $CONTAINER_HAS_5XX_ISSUES -eq 1 ]]; then - print_action "For $FAILED_PROBES_LIST probe(s): Check application logs and fix internal errors or dependencies" - fi - fi + done done done echo "" + +DETAILS=$(jq -nc \ + --argjson tested "$CONTAINERS_TESTED" \ + --argjson skipped "$CONTAINERS_SKIPPED" \ + --argjson probes "$(facts_to_json_array PROBE_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + '{ + containers_tested: $tested, + containers_skipped: $skipped, + probe_results: $probes, + issue_count: ($issues | length), + issues: $issues + }') + if [[ $CONTAINERS_TESTED -eq 0 ]]; then - # No containers were tested - all were skipped print_info "All containers skipped - no probe checks could be performed" - update_check_result --status "skipped" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + EVIDENCE=$(evidence_json \ + "All containers skipped — no probe checks performed" \ + "info" \ + "[]" \ + "$DETAILS" \ + "[]") + update_check_result --status "skipped" --evidence "$EVIDENCE" elif [[ $HAS_PROBE_ISSUES -gt 0 ]]; then - # Some containers were tested and had issues if [[ $CONTAINERS_SKIPPED -gt 0 ]]; then print_warning "Probe issues found ($CONTAINERS_TESTED tested, $CONTAINERS_SKIPPED skipped)" fi - update_check_result --status 
"failed" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="Probe failures detected in $AFFECTED_COUNT pod(s)" + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Update probe path or implement health endpoint in application", "Verify port is listening and accessible"]') + update_check_result --status "failed" --evidence "$EVIDENCE" elif [[ $HAS_PROBE_WARNINGS -gt 0 ]]; then - # Some containers were tested and had warnings if [[ $CONTAINERS_SKIPPED -gt 0 ]]; then print_info "Probe warnings found ($CONTAINERS_TESTED tested, $CONTAINERS_SKIPPED skipped)" fi - update_check_result --status "warning" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="Probe warnings in $AFFECTED_COUNT pod(s) — application or connectivity issues" + EVIDENCE=$(evidence_json "$SUMMARY" "warning" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Check application logs for internal errors", "Verify port is listening and accessible"]') + update_check_result --status "warning" --evidence "$EVIDENCE" else - # All tested containers passed print_success "Health probes verified on $CONTAINERS_TESTED container(s)" - update_check_result --status "success" --evidence "{\"tested\":$CONTAINERS_TESTED,\"skipped\":$CONTAINERS_SKIPPED}" + EVIDENCE=$(evidence_json \ + "Health probes verified on $CONTAINERS_TESTED container(s)" \ + "info" \ + "[]" \ + "$DETAILS" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/scope/image_pull_status b/k8s/diagnose/scope/image_pull_status index 31e9d340..1bdfe38e 100644 --- a/k8s/diagnose/scope/image_pull_status +++ b/k8s/diagnose/scope/image_pull_status @@ -2,38 +2,63 @@ # Check: Image Pull Status # Verifies container images can be pulled from registry -# Validate pods 
exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ERRORS=0 +PULL_FAILURES=() +AFFECTED_PODS="" + for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) IMAGE_PULL_ERRORS=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? | select(.state.waiting.reason == "ImagePullBackOff" or .state.waiting.reason == "ErrImagePull") | .name') if [[ -n "$IMAGE_PULL_ERRORS" ]]; then - HAS_ERRORS=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: ImagePullBackOff/ErrImagePull in container(s): $IMAGE_PULL_ERRORS" for CONTAINER in $IMAGE_PULL_ERRORS; do IMAGE=$(echo "$POD_INFO" | jq -r ".spec.containers[] | select(.name==\"$CONTAINER\") | .image") - MESSAGE=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .state.waiting.message") + MESSAGE=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .state.waiting.message // \"\"") + REASON=$(echo "$POD_INFO" | jq -r ".status.containerStatuses[] | select(.name==\"$CONTAINER\") | .state.waiting.reason // \"\"") print_warning " Image: $IMAGE" print_warning " Reason: $MESSAGE" + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg container "$CONTAINER" \ + --arg image "$IMAGE" \ + --arg reason "$REASON" \ + --arg message "$MESSAGE" \ + '{pod: $pod, container: $container, image: $image, reason: $reason, message: $message}') + add_fact PULL_FAILURES "$FACT" done print_action "Verify image exists and imagePullSecrets are configured for private registries" fi done -if [[ $HAS_ERRORS -eq 0 ]]; then - POD_COUNT=$(echo "$PODS" | wc -w) +POD_COUNT=$(echo "$PODS" | wc -w | tr -d ' ') +FAIL_COUNT=$(echo "$PULL_FAILURES" | jq 'length') + +if [[ $FAIL_COUNT -eq 0 ]]; then print_success "All $POD_COUNT pod(s) have images pulled 
successfully" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All $POD_COUNT pod(s) have images pulled successfully" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$POD_COUNT" '{pods_checked: $count}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) failing to pull images" + DETAILS=$(jq -nc \ + --argjson failures "$(facts_to_json_array PULL_FAILURES)" \ + --argjson pod_count "$POD_COUNT" \ + '{pod_count: $pod_count, failure_count: ($failures | length), failures: $failures}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Verify image exists and imagePullSecrets are configured for private registries"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/scope/memory_limits_check b/k8s/diagnose/scope/memory_limits_check index 90e280b8..af0cc0b4 100644 --- a/k8s/diagnose/scope/memory_limits_check +++ b/k8s/diagnose/scope/memory_limits_check @@ -2,22 +2,21 @@ # Check: Memory Limits # Checks for out-of-memory container terminations -# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_OOM=0 +OOM_FACTS=() +AFFECTED_PODS="" + for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) OOM_KILLED=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? 
| select(.lastState.terminated.reason == "OOMKilled") | .name') if [[ -n "$OOM_KILLED" ]]; then - HAS_OOM=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: OOMKilled in container(s): $OOM_KILLED" for CONTAINER in $OOM_KILLED; do @@ -28,14 +27,50 @@ for POD_NAME in $PODS; do print_warning " Memory Limit: $MEMORY_LIMIT" print_warning " Memory Request: $MEMORY_REQUEST" print_action "Increase memory limits or optimize application memory usage" + + # When OOM kills a container, the previous-instance logs are where + # the last application output lives — the current instance was + # restarted by the kubelet after the kill. + PREVIOUS_LOGS=$(read_log_tail "$POD_NAME" "$CONTAINER" "previous") + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg container "$CONTAINER" \ + --arg memory_limit "$MEMORY_LIMIT" \ + --arg memory_request "$MEMORY_REQUEST" \ + --argjson previous_logs "$PREVIOUS_LOGS" \ + '{ + pod: $pod, + container: $container, + memory_limit: $memory_limit, + memory_request: $memory_request, + previous_logs: $previous_logs + }') + add_fact OOM_FACTS "$FACT" done fi done -if [[ $HAS_OOM -eq 0 ]]; then - POD_COUNT=$(echo "$PODS" | wc -w) +POD_COUNT=$(echo "$PODS" | wc -w | tr -d ' ') +OOM_COUNT=$(echo "$OOM_FACTS" | jq 'length') + +if [[ $OOM_COUNT -eq 0 ]]; then print_success "No OOMKilled containers detected in $POD_COUNT pod(s)" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "No OOMKilled containers detected in $POD_COUNT pod(s)" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$POD_COUNT" '{pods_checked: $count}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) had OOMKilled containers" + DETAILS=$(jq -nc \ + --argjson oom "$(facts_to_json_array OOM_FACTS)" \ + --argjson pod_count "$POD_COUNT" \ + 
'{pod_count: $pod_count, oom_killed_count: ($oom | length), oom_killed: $oom}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Increase memory limits or optimize application memory usage"]') + update_check_result --status "failed" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/scope/pod_existence b/k8s/diagnose/scope/pod_existence index b9919a9e..ffc3df34 100644 --- a/k8s/diagnose/scope/pod_existence +++ b/k8s/diagnose/scope/pod_existence @@ -7,10 +7,26 @@ PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') if [[ -z "$PODS" ]]; then print_error "No pods found with labels $LABEL_SELECTOR in namespace $NAMESPACE" print_action "Check deployment status and verify label selectors match" - update_check_result --status "failed" --evidence "{}" + + EVIDENCE=$(evidence_json \ + "No pods found in namespace $NAMESPACE" \ + "critical" \ + "[]" \ + "$(jq -nc --arg ls "$LABEL_SELECTOR" --arg ns "$NAMESPACE" '{label_selector: $ls, namespace: $ns}')" \ + '["Check deployment status and verify label selectors match"]') + update_check_result --status "failed" --evidence "$EVIDENCE" return 1 fi -PODS_COUNT=$(echo "$PODS" | wc -w) +POD_NAMES_JSON=$(jq -c '[.items[].metadata.name]' "$PODS_FILE" 2>/dev/null) +PODS_COUNT=$(echo "$PODS" | wc -w | tr -d ' ') print_success "Found $PODS_COUNT pod(s): $PODS" -update_check_result --status "success" --evidence "{}" + +EVIDENCE=$(evidence_json \ + "Found $PODS_COUNT pod(s) in namespace $NAMESPACE" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$PODS_COUNT" --argjson names "$POD_NAMES_JSON" --arg ns "$NAMESPACE" \ + '{pod_count: $count, pod_names: $names, namespace: $ns}')" \ + "[]") +update_check_result --status "success" --evidence "$EVIDENCE" diff --git a/k8s/diagnose/scope/pod_readiness b/k8s/diagnose/scope/pod_readiness index eecdcb32..798a973c 100644 --- a/k8s/diagnose/scope/pod_readiness +++ b/k8s/diagnose/scope/pod_readiness @@ -2,10 +2,8 @@ # 
Check: Pod Readiness # Confirms pod is running and ready to serve traffic -# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') # Counters for summary @@ -16,46 +14,49 @@ NOT_READY_PODS=0 TERMINATING_PODS=0 STARTING_PODS=0 -# Deployment state detection HAS_TERMINATING_PODS=0 HAS_STARTING_PODS=0 +POD_FACTS=() +AFFECTED_PODS="" + + for POD_NAME in $PODS; do TOTAL_PODS=$((TOTAL_PODS + 1)) - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) POD_PHASE=$(echo "$POD_INFO" | jq -r '.status.phase') - POD_READY=$(echo "$POD_INFO" | jq -r '.status.conditions[] | select(.type=="Ready") | .status') + POD_READY=$(echo "$POD_INFO" | jq -r '.status.conditions[]? | select(.type=="Ready") | .status') - # Check if pod is terminating DELETION_TIMESTAMP=$(echo "$POD_INFO" | jq -r '.metadata.deletionTimestamp // empty') if [[ -n "$DELETION_TIMESTAMP" ]]; then TERMINATING_PODS=$((TERMINATING_PODS + 1)) HAS_TERMINATING_PODS=1 print_info "Pod $POD_NAME: Terminating (rollout in progress)" + FACT=$(jq -nc --arg p "$POD_NAME" '{pod: $p, state: "terminating"}') + add_fact POD_FACTS "$FACT" continue fi if [[ "$POD_PHASE" == "Running" && "$POD_READY" == "True" ]]; then READY_PODS=$((READY_PODS + 1)) print_success "Pod $POD_NAME: Running and Ready" + FACT=$(jq -nc --arg p "$POD_NAME" '{pod: $p, state: "ready", phase: "Running"}') + add_fact POD_FACTS "$FACT" elif [[ "$POD_PHASE" == "Succeeded" ]]; then SUCCEEDED_PODS=$((SUCCEEDED_PODS + 1)) print_success "Pod $POD_NAME: Completed successfully" + FACT=$(jq -nc --arg p "$POD_NAME" '{pod: $p, state: "succeeded", phase: "Succeeded"}') + add_fact POD_FACTS "$FACT" else NOT_READY_PODS=$((NOT_READY_PODS + 1)) - - # Detect if pod is in normal startup state and collect reasons IS_STARTING=0 STARTUP_INFO="" - # Check if pod is in Pending phase (normal 
during startup) if [[ "$POD_PHASE" == "Pending" ]]; then IS_STARTING=1 fi - # Check init containers first INIT_CONTAINER_INFO=$(echo "$POD_INFO" | jq -r ' .status.initContainerStatuses[]? | select(.state.waiting or .state.running) | @@ -71,7 +72,6 @@ for POD_NAME in $PODS; do STARTUP_INFO="Init: $(echo "$INIT_CONTAINER_INFO" | paste -sd ',' - | sed 's/,/, /g')" fi - # Check for normal container startup reasons with details CONTAINER_STARTUP_INFO=$(echo "$POD_INFO" | jq -r ' .status.containerStatuses[]? | select(.state.waiting) | @@ -79,7 +79,6 @@ for POD_NAME in $PODS; do ' 2>/dev/null) if [[ -n "$CONTAINER_STARTUP_INFO" ]]; then - # Check if any are normal startup reasons while IFS= read -r CONTAINER_LINE; do REASON=$(echo "$CONTAINER_LINE" | cut -d':' -f2 | xargs) case "$REASON" in @@ -97,32 +96,32 @@ for POD_NAME in $PODS; do fi fi + POD_STATE="not_ready" if [[ $IS_STARTING -eq 1 ]]; then STARTING_PODS=$((STARTING_PODS + 1)) HAS_STARTING_PODS=1 + POD_STATE="starting" if [[ -n "$STARTUP_INFO" ]]; then print_info "Pod $POD_NAME: Starting up - $STARTUP_INFO" else print_info "Pod $POD_NAME: Phase=$POD_PHASE (starting up)" fi else + mark_affected AFFECTED_PODS "$POD_NAME" print_warning "Pod $POD_NAME: Phase=$POD_PHASE, Ready=$POD_READY" fi - # Get detailed condition information - READY_CONDITION=$(echo "$POD_INFO" | jq -r '.status.conditions[] | select(.type=="Ready")') + READY_CONDITION=$(echo "$POD_INFO" | jq -c '.status.conditions[]? | select(.type=="Ready")' | head -1) READY_REASON=$(echo "$READY_CONDITION" | jq -r '.reason // "Unknown"') READY_MESSAGE=$(echo "$READY_CONDITION" | jq -r '.message // "No message available"') if [[ -n "$READY_REASON" && "$READY_REASON" != "Unknown" ]]; then print_warning " Reason: $READY_REASON" fi - if [[ -n "$READY_MESSAGE" && "$READY_MESSAGE" != "No message available" ]]; then print_warning " Message: $READY_MESSAGE" fi - # Check container statuses CONTAINER_STATUSES=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? 
| "\(.name): Ready=\(.ready), Restarts=\(.restartCount)"' 2>/dev/null) if [[ -n "$CONTAINER_STATUSES" ]]; then @@ -132,7 +131,6 @@ for POD_NAME in $PODS; do done <<< "$CONTAINER_STATUSES" fi - # Check for waiting containers with reasons WAITING_CONTAINERS=$(echo "$POD_INFO" | jq -r '.status.containerStatuses[]? | select(.state.waiting) | " \(.name): \(.state.waiting.reason) - \(.state.waiting.message // "No details")"' 2>/dev/null) if [[ -n "$WAITING_CONTAINERS" ]]; then @@ -142,31 +140,93 @@ for POD_NAME in $PODS; do done fi - # Only show action if not in normal startup state if [[ $IS_STARTING -eq 0 ]]; then print_action "Check application health endpoint and ensure dependencies are available" fi + + # For pods stuck in "not_ready" (not just starting), embed current + # logs of the first container so the AI can correlate readiness + # failure with what the app printed. We skip starting pods because + # their logs are still in flight and not informative yet. + POD_LOGS_PER_CONTAINER="[]" + if [[ "$POD_STATE" == "not_ready" ]]; then + # Iterate first regular container only — keep payload bounded + FIRST_CONTAINER=$(echo "$POD_INFO" | jq -r '.spec.containers[0].name // empty') + if [[ -n "$FIRST_CONTAINER" ]]; then + CURRENT_LOGS=$(read_log_tail "$POD_NAME" "$FIRST_CONTAINER" "current") + POD_LOGS_PER_CONTAINER=$(jq -nc --arg c "$FIRST_CONTAINER" --argjson logs "$CURRENT_LOGS" \ + '[{container: $c, current_logs: $logs}]') + fi + fi + + FACT=$(jq -nc --arg p "$POD_NAME" --arg state "$POD_STATE" --arg phase "$POD_PHASE" --arg ready "$POD_READY" \ + --arg reason "$READY_REASON" --arg message "$READY_MESSAGE" --arg startup "$STARTUP_INFO" \ + --argjson container_logs "$POD_LOGS_PER_CONTAINER" \ + '{pod: $p, state: $state, phase: $phase, ready: $ready, reason: $reason, message: $message, startup_info: $startup, container_logs: $container_logs}') + add_fact POD_FACTS "$FACT" fi done -# Print summary echo "" + +DETAILS_BASE=$(jq -nc \ + --argjson total "$TOTAL_PODS" \ + 
--argjson ready "$READY_PODS" \ + --argjson succeeded "$SUCCEEDED_PODS" \ + --argjson not_ready "$NOT_READY_PODS" \ + --argjson terminating "$TERMINATING_PODS" \ + --argjson starting "$STARTING_PODS" \ + --argjson facts "$(facts_to_json_array POD_FACTS)" \ + '{ + total: $total, + ready: $ready, + succeeded: $succeeded, + not_ready: $not_ready, + terminating: $terminating, + starting: $starting, + pods: $facts + }') + if [[ $TOTAL_PODS -eq 0 ]]; then print_warning "No pods found" - update_check_result --status "failed" --evidence "{\"ready\":0,\"total\":0}" + EVIDENCE=$(evidence_json "No pods found" "critical" "[]" "$DETAILS_BASE" '["Check deployment and label selectors"]') + update_check_result --status "failed" --evidence "$EVIDENCE" elif [[ $READY_PODS -eq $TOTAL_PODS ]] || [[ $((READY_PODS + SUCCEEDED_PODS)) -eq $TOTAL_PODS ]]; then print_success "All pods ready: $READY_PODS/$TOTAL_PODS running and ready" - update_check_result --status "success" --evidence "{\"ready\":$READY_PODS,\"total\":$TOTAL_PODS}" + EVIDENCE=$(evidence_json \ + "All $TOTAL_PODS pod(s) ready ($READY_PODS running, $SUCCEEDED_PODS succeeded)" \ + "info" \ + "[]" \ + "$DETAILS_BASE" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" elif [[ $HAS_TERMINATING_PODS -eq 1 ]]; then - # Pods are terminating - deployment/rollout in progress print_info "Deployment in progress: $READY_PODS/$TOTAL_PODS pods ready (rollout in progress with terminating pods)" - update_check_result --status "warning" --evidence "{\"ready\":$READY_PODS,\"total\":$TOTAL_PODS,\"terminating\":$TERMINATING_PODS,\"deployment_in_progress\":true}" + DETAILS=$(echo "$DETAILS_BASE" | jq '. 
+ {deployment_in_progress: true}') + EVIDENCE=$(evidence_json \ + "Deployment in progress: $READY_PODS/$TOTAL_PODS pods ready, $TERMINATING_PODS terminating" \ + "warning" \ + "[]" \ + "$DETAILS" \ + '["Wait for rollout to complete"]') + update_check_result --status "warning" --evidence "$EVIDENCE" elif [[ $HAS_STARTING_PODS -eq 1 ]]; then - # Pods are starting up normally - new deployment in progress print_info "Deployment in progress: $READY_PODS/$TOTAL_PODS pods ready, $STARTING_PODS starting up" - update_check_result --status "warning" --evidence "{\"ready\":$READY_PODS,\"total\":$TOTAL_PODS,\"starting\":$STARTING_PODS,\"not_ready\":$NOT_READY_PODS,\"deployment_in_progress\":true}" + DETAILS=$(echo "$DETAILS_BASE" | jq '. + {deployment_in_progress: true}') + EVIDENCE=$(evidence_json \ + "Deployment in progress: $READY_PODS/$TOTAL_PODS pods ready, $STARTING_PODS starting" \ + "warning" \ + "[]" \ + "$DETAILS" \ + '["Wait for pods to finish starting"]') + update_check_result --status "warning" --evidence "$EVIDENCE" else - # Some pods not ready and no clear sign of deployment in progress - this is a problem print_error "Pods not ready: $READY_PODS/$TOTAL_PODS ready (pods have issues)" - update_check_result --status "failed" --evidence "{\"ready\":$READY_PODS,\"total\":$TOTAL_PODS}" -fi \ No newline at end of file + EVIDENCE=$(evidence_json \ + "$READY_PODS/$TOTAL_PODS pods ready — $NOT_READY_PODS pod(s) have issues" \ + "critical" \ + "$(set_to_json_array AFFECTED_PODS)" \ + "$DETAILS_BASE" \ + '["Check application health endpoint and ensure dependencies are available"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/scope/resource_availability b/k8s/diagnose/scope/resource_availability index 53e99f97..55fa4ea5 100644 --- a/k8s/diagnose/scope/resource_availability +++ b/k8s/diagnose/scope/resource_availability @@ -2,16 +2,17 @@ # Check: Resource Availability # Validates pod can be scheduled with requested resources 
-# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +UNSCHEDULABLE_FACTS=() +AFFECTED_PODS="" +HAS_INSUFFICIENT_CPU=0 +HAS_INSUFFICIENT_MEMORY=0 + for POD_NAME in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) POD_PHASE=$(echo "$POD_INFO" | jq -r '.status.phase') @@ -19,27 +20,73 @@ for POD_NAME in $PODS; do UNSCHEDULABLE=$(echo "$POD_INFO" | jq -r '.status.conditions[] | select(.reason=="Unschedulable") | .message') if [[ -n "$UNSCHEDULABLE" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: Cannot be scheduled" print_warning " Reason: $UNSCHEDULABLE" + POD_HAS_CPU_ISSUE=0 + POD_HAS_MEMORY_ISSUE=0 if echo "$UNSCHEDULABLE" | grep -qi "insufficient cpu"; then print_warning " Issue: Insufficient CPU in cluster" + HAS_INSUFFICIENT_CPU=1 + POD_HAS_CPU_ISSUE=1 fi - if echo "$UNSCHEDULABLE" | grep -qi "insufficient memory"; then print_warning " Issue: Insufficient memory in cluster" + HAS_INSUFFICIENT_MEMORY=1 + POD_HAS_MEMORY_ISSUE=1 fi print_action "Reduce resource requests or add more nodes to cluster" + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg reason "$UNSCHEDULABLE" \ + --argjson insufficient_cpu "$POD_HAS_CPU_ISSUE" \ + --argjson insufficient_memory "$POD_HAS_MEMORY_ISSUE" \ + '{pod: $pod, reason: $reason, insufficient_cpu: ($insufficient_cpu == 1), insufficient_memory: ($insufficient_memory == 1)}') + add_fact UNSCHEDULABLE_FACTS "$FACT" fi fi done -if [[ $HAS_ISSUES -eq 0 ]]; then - POD_COUNT=$(echo "$PODS" | wc -w) +POD_COUNT=$(echo "$PODS" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$UNSCHEDULABLE_FACTS" | jq 'length') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "All $POD_COUNT pod(s) successfully scheduled with sufficient resources" - update_check_result --status 
"success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All $POD_COUNT pod(s) successfully scheduled" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$POD_COUNT" '{pods_checked: $count}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY_PARTS=() + [[ $HAS_INSUFFICIENT_CPU -eq 1 ]] && SUMMARY_PARTS+=("insufficient CPU") + [[ $HAS_INSUFFICIENT_MEMORY -eq 1 ]] && SUMMARY_PARTS+=("insufficient memory") + if [[ ${#SUMMARY_PARTS[@]} -gt 0 ]]; then + SUMMARY_DETAIL=$(IFS=", "; echo "${SUMMARY_PARTS[*]}") + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) unschedulable — $SUMMARY_DETAIL" + else + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) unschedulable" + fi + + DETAILS=$(jq -nc \ + --argjson unscheduled "$(facts_to_json_array UNSCHEDULABLE_FACTS)" \ + --argjson pod_count "$POD_COUNT" \ + --argjson cpu "$HAS_INSUFFICIENT_CPU" \ + --argjson mem "$HAS_INSUFFICIENT_MEMORY" \ + '{ + pod_count: $pod_count, + unschedulable_count: ($unscheduled | length), + cluster_insufficient_cpu: ($cpu == 1), + cluster_insufficient_memory: ($mem == 1), + unschedulable: $unscheduled + }') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Reduce resource requests or add more nodes to cluster"]') + update_check_result --status "failed" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/scope/storage_mounting b/k8s/diagnose/scope/storage_mounting index 09e17d16..c3c9a2dc 100644 --- a/k8s/diagnose/scope/storage_mounting +++ b/k8s/diagnose/scope/storage_mounting @@ -2,16 +2,15 @@ # Check: Storage Mounting # Verifies persistent volumes are bound and mounted -# Validate pods exist require_pods || return 0 -# Read pods from pre-collected data PODS=$(jq -r '.items[].metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') -HAS_STORAGE_ISSUES=0 +PVC_FACTS=() +AFFECTED_PODS="" + for POD_NAME 
in $PODS; do - # Get pod info from pre-collected data POD_INFO=$(jq --arg name "$POD_NAME" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) PVCS=$(echo "$POD_INFO" | jq -r '.spec.volumes[]? | select(.persistentVolumeClaim) | .persistentVolumeClaim.claimName') @@ -21,20 +20,36 @@ for POD_NAME in $PODS; do PVC_STATUS=$(kubectl get pvc "$PVC" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null) if [[ "$PVC_STATUS" == "Pending" ]]; then - HAS_STORAGE_ISSUES=1 + mark_affected AFFECTED_PODS "$POD_NAME" print_error "Pod $POD_NAME: PVC $PVC is in Pending state" PVC_INFO=$(kubectl get pvc "$PVC" -n "$NAMESPACE" -o json 2>/dev/null) STORAGE_CLASS=$(echo "$PVC_INFO" | jq -r '.spec.storageClassName // "default"') - REQUESTED_SIZE=$(echo "$PVC_INFO" | jq -r '.spec.resources.requests.storage') + REQUESTED_SIZE=$(echo "$PVC_INFO" | jq -r '.spec.resources.requests.storage // ""') print_warning " Storage Class: $STORAGE_CLASS" print_warning " Requested Size: $REQUESTED_SIZE" print_action "Check if StorageClass exists and has available capacity" + + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg pvc "$PVC" \ + --arg status "$PVC_STATUS" \ + --arg storage_class "$STORAGE_CLASS" \ + --arg requested_size "$REQUESTED_SIZE" \ + '{pod: $pod, pvc: $pvc, status: $status, storage_class: $storage_class, requested_size: $requested_size}') + add_fact PVC_FACTS "$FACT" elif [[ "$PVC_STATUS" == "Bound" ]]; then print_success "Pod $POD_NAME: PVC $PVC is Bound" else print_warning "Pod $POD_NAME: PVC $PVC status is $PVC_STATUS" + mark_affected AFFECTED_PODS "$POD_NAME" + FACT=$(jq -nc \ + --arg pod "$POD_NAME" \ + --arg pvc "$PVC" \ + --arg status "${PVC_STATUS:-Unknown}" \ + '{pod: $pod, pvc: $pvc, status: $status}') + add_fact PVC_FACTS "$FACT" fi done fi @@ -47,10 +62,26 @@ for POD_NAME in $PODS; do fi done -if [[ $HAS_STORAGE_ISSUES -eq 0 ]]; then - POD_COUNT=$(echo "$PODS" | wc -w) +POD_COUNT=$(echo "$PODS" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$PVC_FACTS" | 
jq 'length') + +if [[ $ISSUE_COUNT -eq 0 ]]; then print_success "All volumes mounted successfully for $POD_COUNT pod(s)" - update_check_result --status "success" --evidence "{}" + EVIDENCE=$(evidence_json \ + "All volumes mounted successfully for $POD_COUNT pod(s)" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$POD_COUNT" '{pods_checked: $count}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" + AFFECTED_COUNT=$(echo "$AFFECTED_PODS" | wc -w | tr -d ' ') + SUMMARY="$AFFECTED_COUNT of $POD_COUNT pod(s) have storage issues" + DETAILS=$(jq -nc \ + --argjson pvcs "$(facts_to_json_array PVC_FACTS)" \ + --argjson pod_count "$POD_COUNT" \ + '{pod_count: $pod_count, pvc_issue_count: ($pvcs | length), pvc_issues: $pvcs}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_PODS)" "$DETAILS" \ + '["Check if StorageClass exists and has available capacity"]') + update_check_result --status "failed" --evidence "$EVIDENCE" fi diff --git a/k8s/diagnose/service/service_endpoints b/k8s/diagnose/service/service_endpoints index a6fe12c5..72e72a6a 100644 --- a/k8s/diagnose/service/service_endpoints +++ b/k8s/diagnose/service/service_endpoints @@ -2,50 +2,53 @@ # Check: Service Endpoints # Checks if service has healthy endpoints -# Validate services exist require_services || return 0 -# Read services from pre-collected data SERVICES=$(jq -r '.items[].metadata.name' "$SERVICES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ENDPOINT_FACTS=() +AFFECTED_SERVICES="" +NO_ENDPOINTS_RESOURCE=0 +NO_READY_COUNT=0 + for SERVICE_NAME in $SERVICES; do - # Get endpoints from pre-collected data ENDPOINTS_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$ENDPOINTS_FILE" 2>/dev/null) if [[ -z "$ENDPOINTS_INFO" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" + NO_ENDPOINTS_RESOURCE=$((NO_ENDPOINTS_RESOURCE + 1)) print_error 
"Service $SERVICE_NAME: No endpoints resource found" + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" \ + '{service: $svc, issue: "no_endpoints_resource", ready_count: 0, not_ready_count: 0}') + add_fact ENDPOINT_FACTS "$FACT" continue fi - # Check if endpoints has any addresses with detailed info - # Get ports for this subset PORTS=$(echo "$ENDPOINTS_INFO" | jq -r '.subsets[0]?.ports[]? | "\(.port):\(.name // "unnamed")"' 2>/dev/null | head -1) PORT_NUMBER=$(echo "$PORTS" | cut -d':' -f1) - PORT_NAME=$(echo "$PORTS" | cut -d':' -f2) - READY_ENDPOINTS=$(echo "$ENDPOINTS_INFO" | jq -r '.subsets[]?.addresses[]? | "\(.targetRef.name // "unknown"):\(.ip)"' 2>/dev/null) - NOT_READY_ENDPOINTS=$(echo "$ENDPOINTS_INFO" | jq -r '.subsets[]?.notReadyAddresses[]? | "\(.targetRef.name // "unknown"):\(.ip)"' 2>/dev/null) + READY_COUNT=$(echo "$ENDPOINTS_INFO" | jq -r '[.subsets[]?.addresses[]?] | length' 2>/dev/null) + NOT_READY_COUNT=$(echo "$ENDPOINTS_INFO" | jq -r '[.subsets[]?.notReadyAddresses[]?] | length' 2>/dev/null) + READY_COUNT=${READY_COUNT:-0} + NOT_READY_COUNT=${NOT_READY_COUNT:-0} - READY_COUNT=$(echo "$READY_ENDPOINTS" | grep -c '^' 2>/dev/null || echo 0) - NOT_READY_COUNT=$(echo "$NOT_READY_ENDPOINTS" | grep -c '^' 2>/dev/null || echo 0) + READY_LIST=$(echo "$ENDPOINTS_INFO" | jq -c '[.subsets[]?.addresses[]? | {pod: (.targetRef.name // "unknown"), ip: .ip}]' 2>/dev/null) + NOT_READY_LIST=$(echo "$ENDPOINTS_INFO" | jq -c '[.subsets[]?.notReadyAddresses[]? 
| {pod: (.targetRef.name // "unknown"), ip: .ip}]' 2>/dev/null) if [[ $READY_COUNT -eq 0 ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" + NO_READY_COUNT=$((NO_READY_COUNT + 1)) print_error "Service $SERVICE_NAME: No ready endpoints available" if [[ $NOT_READY_COUNT -gt 0 ]]; then - print_warning " Not ready endpoints: $NOT_READY_COUNT" - # Show details of not ready endpoints - echo "$NOT_READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - if [[ -n "$IP" ]]; then - if [[ -n "$PORT_NUMBER" ]]; then - print_warning " - $POD_NAME -> $IP:$PORT_NUMBER" - else - print_warning " - $POD_NAME -> $IP" - fi + print_warning " $NOT_READY_COUNT not ready endpoint(s):" + echo "$ENDPOINTS_INFO" | jq -r '.subsets[]?.notReadyAddresses[]? | " - \(.targetRef.name // "unknown") -> \(.ip)"' | while IFS= read -r line; do + if [[ -n "$PORT_NUMBER" ]]; then + print_warning "${line}:${PORT_NUMBER}" + else + print_warning "$line" fi done print_action "Check pod readiness probes and pod status" @@ -53,39 +56,61 @@ for SERVICE_NAME in $SERVICES; do print_warning " No endpoints at all" print_action "Verify service selector matches pod labels" fi + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" \ + --argjson ready_count "$READY_COUNT" --argjson not_ready_count "$NOT_READY_COUNT" \ + --argjson not_ready "$NOT_READY_LIST" \ + '{service: $svc, issue: "no_ready_endpoints", ready_count: $ready_count, not_ready_count: $not_ready_count, not_ready_endpoints: $not_ready}') + add_fact ENDPOINT_FACTS "$FACT" else print_success "Service $SERVICE_NAME: $READY_COUNT ready endpoint(s)" - - # Show details of ready endpoints - echo "$READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - if [[ -n "$IP" ]]; then - if [[ -n "$PORT_NUMBER" ]]; then - print_success " - $POD_NAME -> $IP:$PORT_NUMBER" - else - print_success " - $POD_NAME -> $IP" - fi + echo "$ENDPOINTS_INFO" | jq -r '.subsets[]?.addresses[]? 
| " - \(.targetRef.name // "unknown") -> \(.ip)"' | while IFS= read -r line; do + if [[ -n "$PORT_NUMBER" ]]; then + print_success "${line}:${PORT_NUMBER}" + else + print_success "$line" fi done if [[ $NOT_READY_COUNT -gt 0 ]]; then print_warning " Also has $NOT_READY_COUNT not ready endpoint(s)" - # Show details of not ready endpoints - echo "$NOT_READY_ENDPOINTS" | while IFS=':' read -r POD_NAME IP; do - if [[ -n "$IP" ]]; then - if [[ -n "$PORT_NUMBER" ]]; then - print_warning " - $POD_NAME -> $IP:$PORT_NUMBER" - else - print_warning " - $POD_NAME -> $IP" - fi - fi - done print_action "Check pod readiness probes and pod status" fi + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" \ + --argjson ready_count "$READY_COUNT" --argjson not_ready_count "$NOT_READY_COUNT" \ + --argjson ready "$READY_LIST" --argjson not_ready "$NOT_READY_LIST" \ + '{service: $svc, ready_count: $ready_count, not_ready_count: $not_ready_count, ready_endpoints: $ready, not_ready_endpoints: $not_ready}') + add_fact ENDPOINT_FACTS "$FACT" fi done -if [[ $HAS_ISSUES -eq 0 ]]; then - update_check_result --status "success" --evidence "{}" +SERVICE_COUNT=$(echo "$SERVICES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_SERVICES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then + EVIDENCE=$(evidence_json \ + "All $SERVICE_COUNT service(s) have ready endpoints" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array ENDPOINT_FACTS)" --argjson count "$SERVICE_COUNT" '{service_count: $count, services: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $SERVICE_COUNT service(s) without ready endpoints" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array ENDPOINT_FACTS)" \ + --argjson count "$SERVICE_COUNT" \ + --argjson no_resource "$NO_ENDPOINTS_RESOURCE" \ + --argjson no_ready "$NO_READY_COUNT" \ + '{ + 
service_count: $count, + issue_count: ($facts | map(select(.issue != null)) | length), + no_endpoints_resource_count: $no_resource, + no_ready_endpoints_count: $no_ready, + services: $facts + }') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_SERVICES)" "$DETAILS" \ + '["Check pod readiness probes and verify service selector matches pod labels"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/service/service_existence b/k8s/diagnose/service/service_existence index 2ee8783f..d57e3d4f 100644 --- a/k8s/diagnose/service/service_existence +++ b/k8s/diagnose/service/service_existence @@ -7,10 +7,26 @@ SERVICES=$(jq -r '.items[].metadata.name' "$SERVICES_FILE" 2>/dev/null | tr '\n' if [[ -z "$SERVICES" ]]; then print_error "No services found with labels $LABEL_SELECTOR in namespace $NAMESPACE" print_action "Create service resource or verify label selectors" - update_check_result --status "failed" --evidence "{}" + + EVIDENCE=$(evidence_json \ + "No services found in namespace $NAMESPACE" \ + "critical" \ + "[]" \ + "$(jq -nc --arg ls "$LABEL_SELECTOR" --arg ns "$NAMESPACE" '{label_selector: $ls, namespace: $ns}')" \ + '["Create service resource or verify label selectors"]') + update_check_result --status "failed" --evidence "$EVIDENCE" return 1 fi -SERVICE_COUNT=$(echo "$SERVICES" | wc -w) +SERVICE_NAMES_JSON=$(jq -c '[.items[].metadata.name]' "$SERVICES_FILE" 2>/dev/null) +SERVICE_COUNT=$(echo "$SERVICES" | wc -w | tr -d ' ') print_success "Found $SERVICE_COUNT service(s): $SERVICES" -update_check_result --status "success" --evidence "{}" + +EVIDENCE=$(evidence_json \ + "Found $SERVICE_COUNT service(s) in namespace $NAMESPACE" \ + "info" \ + "[]" \ + "$(jq -nc --argjson count "$SERVICE_COUNT" --argjson names "$SERVICE_NAMES_JSON" --arg ns "$NAMESPACE" \ + '{service_count: $count, service_names: $names, namespace: $ns}')" \ + "[]") +update_check_result --status "success" --evidence "$EVIDENCE" 
diff --git a/k8s/diagnose/service/service_port_configuration b/k8s/diagnose/service/service_port_configuration index 79baa675..098dd4e4 100644 --- a/k8s/diagnose/service/service_port_configuration +++ b/k8s/diagnose/service/service_port_configuration @@ -2,36 +2,36 @@ # Check: Service Port Configuration # Validates service and container port alignment -# Validate services exist require_services || return 0 -# Read services from pre-collected data SERVICES=$(jq -r '.items[].metadata.name' "$SERVICES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_PORT_ISSUES=0 +PORT_FACTS=() +AFFECTED_SERVICES="" + for SERVICE_NAME in $SERVICES; do - # Get service info from pre-collected data SERVICE_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null) - # Get service ports and targetPorts SERVICE_PORTS=$(echo "$SERVICE_INFO" | jq -r '.spec.ports[] | "\(.port):\(.targetPort):\(.name // "unnamed")"') if [[ -z "$SERVICE_PORTS" ]]; then - HAS_PORT_ISSUES=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" print_error "Service $SERVICE_NAME: No ports defined" + FACT=$(jq -nc --arg svc "$SERVICE_NAME" '{service: $svc, issue: "no_ports_defined", ports: []}') + add_fact PORT_FACTS "$FACT" continue fi - # Get service selector to find pods SERVICE_SELECTORS=$(echo "$SERVICE_INFO" | jq -c '.spec.selector') if [[ -z "$SERVICE_SELECTORS" || "$SERVICE_SELECTORS" == "null" ]]; then print_warning "Service $SERVICE_NAME: No selector, skipping port validation" + FACT=$(jq -nc --arg svc "$SERVICE_NAME" '{service: $svc, issue: "no_selector_skipped"}') + add_fact PORT_FACTS "$FACT" continue fi - # Find pods from pre-collected data that match service selectors PODS=$(jq -r --argjson selectors "$SERVICE_SELECTORS" ' .items[] | . 
as $pod | @@ -45,24 +45,24 @@ for SERVICE_NAME in $SERVICES; do if [[ -z "$PODS" ]]; then print_warning "Service $SERVICE_NAME: No pods found to validate ports" + FACT=$(jq -nc --arg svc "$SERVICE_NAME" '{service: $svc, issue: "no_pods_for_validation"}') + add_fact PORT_FACTS "$FACT" continue fi - # Check first pod for port validation FIRST_POD=$(echo "$PODS" | awk '{print $1}') POD_INFO=$(jq --arg name "$FIRST_POD" '.items[] | select(.metadata.name == $name)' "$PODS_FILE" 2>/dev/null) print_info "Service $SERVICE_NAME port configuration:" - # Validate configuration and test connectivity - # Use process substitution to avoid subshell and preserve HAS_PORT_ISSUES updates + PORT_RESULTS=() + SERVICE_HAS_ISSUE=0 + while IFS=':' read -r SERVICE_PORT TARGET_PORT PORT_NAME; do ACTUAL_TARGET_PORT="$TARGET_PORT" CONTAINER_NAME="" - # Check if targetPort is numeric or named if [[ "$TARGET_PORT" =~ ^[0-9]+$ ]]; then - # Numeric targetPort - find which container has this port CONTAINER_INFO=$(echo "$POD_INFO" | jq -r --arg port "$TARGET_PORT" ' .spec.containers[] | select(.ports[]?.containerPort == ($port | tonumber)) | @@ -72,17 +72,22 @@ for SERVICE_NAME in $SERVICES; do if [[ -n "$CONTAINER_INFO" ]]; then CONTAINER_NAME=$(echo "$CONTAINER_INFO" | cut -d':' -f1) print_success " Port $SERVICE_PORT -> $TARGET_PORT ($PORT_NAME): Configuration OK [container: $CONTAINER_NAME]" + PORT_RESULT=$(jq -nc --arg svc_port "$SERVICE_PORT" --arg target "$TARGET_PORT" --arg name "$PORT_NAME" --arg cont "$CONTAINER_NAME" \ + '{service_port: $svc_port, target_port: $target, port_name: $name, container: $cont, status: "ok"}') else - HAS_PORT_ISSUES=1 - # Show available ports per container - AVAILABLE_PORTS=$(echo "$POD_INFO" | jq -r '.spec.containers[] | "\(.name): \([.ports[]?.containerPort] | join(","))"' | tr '\n' '; ') + SERVICE_HAS_ISSUE=1 + AVAILABLE_PORTS=$(echo "$POD_INFO" | jq -c '[.spec.containers[] | {container: .name, ports: [.ports[]?.containerPort]}]') print_error " Port 
$SERVICE_PORT -> $TARGET_PORT ($PORT_NAME): Container port $TARGET_PORT not found" - print_warning " Available ports by container: $AVAILABLE_PORTS" + AVAILABLE_HUMAN=$(echo "$POD_INFO" | jq -r '.spec.containers[] | "\(.name): \([.ports[]?.containerPort] | join(","))"' | tr '\n' '; ') + print_warning " Available ports by container: $AVAILABLE_HUMAN" print_action "Update service targetPort to match container port or fix container port" + PORT_RESULT=$(jq -nc --arg svc_port "$SERVICE_PORT" --arg target "$TARGET_PORT" --arg name "$PORT_NAME" \ + --argjson available "$AVAILABLE_PORTS" \ + '{service_port: $svc_port, target_port: $target, port_name: $name, status: "container_port_not_found", available_ports_by_container: $available}') + add_fact PORT_RESULTS "$PORT_RESULT" continue fi else - # Named port - find which container has this named port CONTAINER_INFO=$(echo "$POD_INFO" | jq -r --arg portname "$TARGET_PORT" ' .spec.containers[] | select(.ports[]? | select(.name == $portname)) | @@ -93,34 +98,63 @@ for SERVICE_NAME in $SERVICES; do CONTAINER_NAME=$(echo "$CONTAINER_INFO" | cut -d':' -f1) ACTUAL_TARGET_PORT=$(echo "$CONTAINER_INFO" | cut -d':' -f2) print_success " Port $SERVICE_PORT -> $TARGET_PORT ($PORT_NAME): Resolves to $ACTUAL_TARGET_PORT [container: $CONTAINER_NAME]" + PORT_RESULT=$(jq -nc --arg svc_port "$SERVICE_PORT" --arg target "$TARGET_PORT" --arg name "$PORT_NAME" --arg cont "$CONTAINER_NAME" --arg actual "$ACTUAL_TARGET_PORT" \ + '{service_port: $svc_port, target_port: $target, port_name: $name, container: $cont, resolved_port: $actual, status: "ok"}') else - HAS_PORT_ISSUES=1 + SERVICE_HAS_ISSUE=1 print_error " Port $SERVICE_PORT -> $TARGET_PORT ($PORT_NAME): Named port not found in containers" print_action "Define named port in container spec or use numeric targetPort" + PORT_RESULT=$(jq -nc --arg svc_port "$SERVICE_PORT" --arg target "$TARGET_PORT" --arg name "$PORT_NAME" \ + '{service_port: $svc_port, target_port: $target, port_name: $name, 
status: "named_port_not_found"}') + add_fact PORT_RESULTS "$PORT_RESULT" continue fi fi - # Active connectivity check - verify application is listening on the port + # Active connectivity check print_info " Testing connectivity to port $ACTUAL_TARGET_PORT in container '$CONTAINER_NAME'..." - - # Try to connect to the port from inside the specific container - CONNECTIVITY_TEST=$(kubectl exec "$FIRST_POD" -n "$NAMESPACE" -c "$CONTAINER_NAME" -- timeout 2 sh -c "command -v nc >/dev/null 2>&1 && nc -z localhost $ACTUAL_TARGET_PORT || (command -v curl >/dev/null 2>&1 && curl -s --max-time 1 localhost:$ACTUAL_TARGET_PORT >/dev/null)" 2>&1) + kubectl exec "$FIRST_POD" -n "$NAMESPACE" -c "$CONTAINER_NAME" -- timeout 2 sh -c "command -v nc >/dev/null 2>&1 && nc -z localhost $ACTUAL_TARGET_PORT || (command -v curl >/dev/null 2>&1 && curl -s --max-time 1 localhost:$ACTUAL_TARGET_PORT >/dev/null)" >/dev/null 2>&1 CONNECTIVITY_EXIT_CODE=$? if [[ $CONNECTIVITY_EXIT_CODE -eq 0 ]]; then print_success " ✓ Port $ACTUAL_TARGET_PORT is accepting connections" + PORT_RESULT=$(echo "$PORT_RESULT" | jq '. + {connectivity: "ok"}') else - HAS_PORT_ISSUES=1 + SERVICE_HAS_ISSUE=1 print_error " ✗ Port $ACTUAL_TARGET_PORT is NOT accepting connections" print_warning " Configuration is correct but application may not be listening on port $ACTUAL_TARGET_PORT" print_info " Check logs: kubectl logs $FIRST_POD -n $NAMESPACE -c $CONTAINER_NAME" + PORT_RESULT=$(echo "$PORT_RESULT" | jq '. 
+ {connectivity: "refused", status: "not_listening"}') fi + + add_fact PORT_RESULTS "$PORT_RESULT" done < <(echo "$SERVICE_PORTS") + + [[ $SERVICE_HAS_ISSUE -eq 1 ]] && mark_affected AFFECTED_SERVICES "$SERVICE_NAME" + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" --argjson ports "$(facts_to_json_array PORT_RESULTS)" \ + '{service: $svc, ports: $ports}') + add_fact PORT_FACTS "$FACT" done -if [[ $HAS_PORT_ISSUES -eq 0 ]]; then - update_check_result --status "success" --evidence "{}" +SERVICE_COUNT=$(echo "$SERVICES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_SERVICES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then + EVIDENCE=$(evidence_json \ + "All $SERVICE_COUNT service(s) have valid port configuration" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array PORT_FACTS)" --argjson count "$SERVICE_COUNT" '{service_count: $count, services: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $SERVICE_COUNT service(s) have port configuration issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array PORT_FACTS)" \ + --argjson count "$SERVICE_COUNT" \ + '{service_count: $count, issue_count: ($facts | map(select(.issue == "no_ports_defined" or any(.ports[]?; .status != "ok"))) | length), services: $facts}')
+ EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_SERVICES)" "$DETAILS" \ + '["Verify container is listening on targetPort and that ports/protocols match"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/service/service_selector_match b/k8s/diagnose/service/service_selector_match index 84667a7e..a69a02a8 100644 --- a/k8s/diagnose/service/service_selector_match +++ b/k8s/diagnose/service/service_selector_match @@ -2,32 +2,29 @@ # Check: Service Selector Match # Validates service selectors match pod labels -# Validate services exist require_services || return 0 -# Read services from pre-collected data SERVICES=$(jq -r '.items[].metadata.name' "$SERVICES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_MISMATCH=0 +SELECTOR_FACTS=() +AFFECTED_SERVICES="" + for SERVICE_NAME in $SERVICES; do - # Get service info from pre-collected data SERVICE_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null) - # Get service selectors SERVICE_SELECTORS=$(echo "$SERVICE_INFO" | jq -r '.spec.selector | to_entries | map("\(.key)=\(.value)") | join(",")') + SELECTOR_OBJECT=$(echo "$SERVICE_INFO" | jq -c '.spec.selector') if [[ -z "$SERVICE_SELECTORS" || "$SERVICE_SELECTORS" == "null" ]]; then - HAS_MISMATCH=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" print_error "Service $SERVICE_NAME: No selector defined" + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" '{service: $svc, issue: "no_selector"}') + add_fact SELECTOR_FACTS "$FACT" continue fi - # Find pods that match service selector from pre-collected data - # Get service selector as a proper object - SELECTOR_OBJECT=$(echo "$SERVICE_INFO" | jq -c '.spec.selector') - - # Match pods where all service selectors are present in pod labels MATCHING_PODS=$(jq -r --argjson selectors "$SELECTOR_OBJECT" ' .items[] | . 
as $pod | @@ -40,30 +37,33 @@ for SERVICE_NAME in $SERVICES; do ' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') if [[ -z "$MATCHING_PODS" ]]; then - HAS_MISMATCH=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" print_error "Service $SERVICE_NAME: No pods match selector ($SERVICE_SELECTORS)" - # Show what pods exist with deployment_id from pre-collected data EXISTING_PODS=$(jq -r --arg dep_id "$DEPLOYMENT_ID" '.items[] | select(.metadata.labels.deployment_id == $dep_id) | .metadata.name' "$PODS_FILE" 2>/dev/null | tr '\n' ' ') + MISMATCH_FACTS=() + MISMATCH_FACTS_JSON="[]" + if [[ -n "$EXISTING_PODS" ]]; then print_warning " Existing pods with deployment_id: $EXISTING_PODS" - # Show first pod's labels for comparison FIRST_POD=$(echo "$EXISTING_PODS" | awk '{print $1}') POD_LABELS=$(jq -r --arg pod "$FIRST_POD" '.items[] | select(.metadata.name == $pod) | .metadata.labels | to_entries | map("\(.key)=\(.value)") | join(",")' "$PODS_FILE" 2>/dev/null) print_info " Pod labels: $POD_LABELS" - # Check each selector against pod labels and show only mismatches - MISMATCHES="" MATCH_COUNT=0 SELECTOR_COUNT=0 while IFS='=' read -r key value; do SELECTOR_COUNT=$((SELECTOR_COUNT + 1)) POD_VALUE=$(jq -r --arg pod "$FIRST_POD" --arg key "$key" '.items[] | select(.metadata.name == $pod) | .metadata.labels[$key] // "MISSING"' "$PODS_FILE" 2>/dev/null) if [[ "$POD_VALUE" == "MISSING" ]]; then - MISMATCHES="${MISMATCHES} ✗ $key: selector='$value', pod=MISSING\n" + MM=$(jq -nc --arg key "$key" --arg expected "$value" --arg actual "MISSING" \ + '{key: $key, selector_value: $expected, pod_value: $actual, kind: "missing"}') + add_fact MISMATCH_FACTS "$MM" elif [[ "$POD_VALUE" != "$value" ]]; then - MISMATCHES="${MISMATCHES} ✗ $key: selector='$value', pod='$POD_VALUE'\n" + MM=$(jq -nc --arg key "$key" --arg expected "$value" --arg actual "$POD_VALUE" \ + '{key: $key, selector_value: $expected, pod_value: $actual, kind: "mismatch"}') + add_fact MISMATCH_FACTS "$MM" else 
MATCH_COUNT=$((MATCH_COUNT + 1)) fi @@ -71,35 +71,50 @@ for SERVICE_NAME in $SERVICES; do print_info " Selector check: $MATCH_COUNT/$SELECTOR_COUNT labels match" - if [[ -n "$MISMATCHES" ]]; then + MISMATCH_FACTS_JSON=$(facts_to_json_array MISMATCH_FACTS) + if [[ ${#MISMATCH_FACTS[@]} -gt 0 ]]; then print_warning " Selector mismatches:" - echo -e "$MISMATCHES" - else - print_warning " All selectors match but jq query failed - checking jq logic..." - # Debug: try the query manually to see what happens - DEBUG_RESULT=$(jq --argjson selectors "$SELECTOR_OBJECT" --arg pod "$FIRST_POD" ' - .items[] | select(.metadata.name == $pod) | - . as $p | - { - pod: .metadata.name, - matches: ($selectors | to_entries | all(.key as $k | .value as $v | - $p.metadata.labels[$k] == $v - )) - } - ' "$PODS_FILE" 2>&1) - print_info " Debug result: $DEBUG_RESULT" + echo "$MISMATCH_FACTS_JSON" | jq -r '.[] | " ✗ " + .key + ": selector=" + .selector_value + ", pod=" + .pod_value' | while IFS= read -r line; do + print_warning "$line" + done fi print_action "Verify pod labels match service selector" fi + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" --argjson sel "$SELECTOR_OBJECT" \ + --argjson mismatches "$MISMATCH_FACTS_JSON" \ + --arg existing_pods "$EXISTING_PODS" \ + '{service: $svc, issue: "no_matching_pods", selector: $sel, existing_pods_with_deployment_id: ($existing_pods | split(" ") | map(select(length > 0))), label_mismatches: $mismatches}') + add_fact SELECTOR_FACTS "$FACT" else - POD_COUNT=$(echo "$MATCHING_PODS" | wc -w) + POD_COUNT=$(echo "$MATCHING_PODS" | wc -w | tr -d ' ') print_success "Service $SERVICE_NAME: Selector matches $POD_COUNT pod(s)" + FACT=$(jq -nc --arg svc "$SERVICE_NAME" --argjson sel "$SELECTOR_OBJECT" \ + --argjson matched "$POD_COUNT" \ + '{service: $svc, selector: $sel, matched_pod_count: $matched}') + add_fact SELECTOR_FACTS "$FACT" fi done -if [[ $HAS_MISMATCH -eq 0 ]]; then - update_check_result --status "success" --evidence "{}" +SERVICE_COUNT=$(echo 
"$SERVICES" | wc -w | tr -d ' ') +ISSUE_COUNT=$(echo "$AFFECTED_SERVICES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then + EVIDENCE=$(evidence_json \ + "All $SERVICE_COUNT service(s) match at least one pod" \ + "info" \ + "[]" \ + "$(jq -nc --argjson facts "$(facts_to_json_array SELECTOR_FACTS)" --argjson count "$SERVICE_COUNT" '{service_count: $count, services: $facts}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of $SERVICE_COUNT service(s) have selector issues" + DETAILS=$(jq -nc \ + --argjson facts "$(facts_to_json_array SELECTOR_FACTS)" \ + --argjson count "$SERVICE_COUNT" \ + '{service_count: $count, issue_count: ($facts | map(select(.issue != null)) | length), services: $facts}') + EVIDENCE=$(evidence_json "$SUMMARY" "critical" "$(set_to_json_array AFFECTED_SERVICES)" "$DETAILS" \ + '["Verify pod labels match service selector"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/service/service_type_validation b/k8s/diagnose/service/service_type_validation index 8381c2d7..feacbbbe 100644 --- a/k8s/diagnose/service/service_type_validation +++ b/k8s/diagnose/service/service_type_validation @@ -2,75 +2,111 @@ # Check: Service Type Validation # Verifies service type is correctly configured -# Validate services exist require_services || return 0 -# Read services from pre-collected data SERVICES=$(jq -r '.items[].metadata.name' "$SERVICES_FILE" 2>/dev/null | tr '\n' ' ') -HAS_ISSUES=0 +ISSUE_FACTS=() +SERVICE_FACTS=() +AFFECTED_SERVICES="" + for SERVICE_NAME in $SERVICES; do - # Get service info from pre-collected data SERVICE_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null) - SERVICE_TYPE=$(echo "$SERVICE_INFO" | jq -r '.spec.type') print_info "Service $SERVICE_NAME: Type=$SERVICE_TYPE" + # Per-service 
fact (regardless of issue) case "$SERVICE_TYPE" in ClusterIP) CLUSTER_IP=$(echo "$SERVICE_INFO" | jq -r '.spec.clusterIP') if [[ "$CLUSTER_IP" == "None" ]]; then print_success " Headless service (ClusterIP: None)" + SVC_FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" \ + '{service: $svc, type: $type, cluster_ip: "None", headless: true}') else print_success " Internal service with ClusterIP: $CLUSTER_IP" + SVC_FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" --arg ip "$CLUSTER_IP" \ + '{service: $svc, type: $type, cluster_ip: $ip, headless: false}') fi ;; NodePort) - NODE_PORTS=$(echo "$SERVICE_INFO" | jq -r '.spec.ports[] | "\(.port):\(.nodePort)"') + NODE_PORTS_LIST=$(echo "$SERVICE_INFO" | jq -c '[.spec.ports[] | {port: .port, node_port: .nodePort}]') print_success " NodePort service exposed on:" - echo "$NODE_PORTS" | while IFS=':' read -r PORT NODE_PORT; do - print_info " Port $PORT -> NodePort $NODE_PORT" + echo "$SERVICE_INFO" | jq -r '.spec.ports[] | " Port \(.port) -> NodePort \(.nodePort)"' | while IFS= read -r line; do + print_info "$line" done + SVC_FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" --argjson ports "$NODE_PORTS_LIST" \ + '{service: $svc, type: $type, node_ports: $ports}') ;; LoadBalancer) EXTERNAL_IP=$(echo "$SERVICE_INFO" | jq -r '.status.loadBalancer.ingress[0].ip // .status.loadBalancer.ingress[0].hostname // "Pending"') if [[ "$EXTERNAL_IP" == "Pending" || "$EXTERNAL_IP" == "null" ]]; then - HAS_ISSUES=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" print_warning " LoadBalancer IP/Hostname is Pending" print_info " This may take a few minutes to provision" - # Check for events related to LoadBalancer from pre-collected data LB_EVENTS=$(jq -r --arg name "$SERVICE_NAME" '.items[] | select(.involvedObject.name == $name and (.message | test("loadbalancer|external"; "i"))) | "\(.lastTimestamp) \(.message)"' "$EVENTS_FILE" 2>/dev/null | tail -n 3) if [[ -n "$LB_EVENTS" ]]; then 
print_info " Recent events:" echo "$LB_EVENTS" | sed 's/^/ /' fi print_action "Wait for provisioning or check cloud provider logs for errors" + + FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" \ + '{service: $svc, type: $type, issue: "loadbalancer_pending"}') + add_fact ISSUE_FACTS "$FACT" + SVC_FACT="$FACT" else print_success " LoadBalancer available at: $EXTERNAL_IP" + SVC_FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" --arg addr "$EXTERNAL_IP" \ + '{service: $svc, type: $type, external_address: $addr}') fi ;; ExternalName) EXTERNAL_NAME=$(echo "$SERVICE_INFO" | jq -r '.spec.externalName') print_success " ExternalName service pointing to: $EXTERNAL_NAME" + SVC_FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "$SERVICE_TYPE" --arg ext "$EXTERNAL_NAME" \ + '{service: $svc, type: $type, external_name: $ext}') ;; *) - HAS_ISSUES=1 + mark_affected AFFECTED_SERVICES "$SERVICE_NAME" print_error " Unknown service type: $SERVICE_TYPE" print_action "Use valid service type (ClusterIP, NodePort, LoadBalancer, or ExternalName)" + FACT=$(jq -nc --arg svc "$SERVICE_NAME" --arg type "${SERVICE_TYPE:-null}" \ + '{service: $svc, type: $type, issue: "unknown_service_type"}') + add_fact ISSUE_FACTS "$FACT" + SVC_FACT="$FACT" ;; esac + add_fact SERVICE_FACTS "$SVC_FACT" done -if [[ $HAS_ISSUES -eq 0 ]]; then - update_check_result --status "success" --evidence "{}" +ISSUE_COUNT=${#ISSUE_FACTS[@]} +SERVICE_COUNT=$(echo "$SERVICES" | wc -w | tr -d ' ') + +if [[ $ISSUE_COUNT -eq 0 ]]; then + EVIDENCE=$(evidence_json \ + "All $SERVICE_COUNT service(s) have valid types" \ + "info" \ + "[]" \ + "$(jq -nc --argjson services "$(facts_to_json_array SERVICE_FACTS)" --argjson count "$SERVICE_COUNT" '{service_count: $count, services: $services}')" \ + "[]") + update_check_result --status "success" --evidence "$EVIDENCE" else - update_check_result --status "failed" --evidence "{}" -fi \ No newline at end of file + SUMMARY="$ISSUE_COUNT of 
$SERVICE_COUNT service(s) have type issues" + DETAILS=$(jq -nc \ + --argjson services "$(facts_to_json_array SERVICE_FACTS)" \ + --argjson issues "$(facts_to_json_array ISSUE_FACTS)" \ + --argjson count "$SERVICE_COUNT" \ + '{service_count: $count, issue_count: ($issues | length), services: $services, issues: $issues}') + EVIDENCE=$(evidence_json "$SUMMARY" "warning" "$(set_to_json_array AFFECTED_SERVICES)" "$DETAILS" \ + '["Wait for provisioning or check cloud provider logs for errors"]') + update_check_result --status "failed" --evidence "$EVIDENCE" +fi diff --git a/k8s/diagnose/tests/build_context.bats b/k8s/diagnose/tests/build_context.bats index 46eaa5e2..dfa68081 100644 --- a/k8s/diagnose/tests/build_context.bats +++ b/k8s/diagnose/tests/build_context.bats @@ -25,12 +25,15 @@ setup() { *"app.kubernetes.io/name=aws-load-balancer-controller"*) echo '{"items":[]}' ;; *"app=aws-alb-ingress-controller"*) echo '{"items":[]}' ;; *"get pods"*) echo '{"items":[{"metadata":{"name":"test-pod"}}]}' ;; + *"get deployment"*) echo '{"items":[{"metadata":{"name":"test-deployment"}}]}' ;; + *"get rs"*) echo '{"items":[{"metadata":{"name":"test-rs"}}]}' ;; *"get services"*) echo '{"items":[{"metadata":{"name":"test-service"}}]}' ;; *"get endpoints"*) echo '{"items":[]}' ;; *"get ingress"*) echo '{"items":[]}' ;; *"get secrets"*) echo '{"items":[]}' ;; *"get ingressclass"*) echo '{"items":[]}' ;; *"get events"*) echo '{"items":[]}' ;; + *"describe pod"*) echo "Pod describe output" ;; *"logs"*) echo "log line 1" ;; *) echo '{"items":[]}' ;; esac @@ -99,13 +102,19 @@ run_build_context() { assert_directory_exists "$NP_OUTPUT_DIR/data" assert_directory_exists "$NP_OUTPUT_DIR/data/alb_controller_logs" + assert_directory_exists "$POD_LOGS_DIR" + assert_directory_exists "$POD_DESCRIBE_DIR" # All resource files should exist and be valid JSON - for file in "$PODS_FILE" "$SERVICES_FILE" "$ENDPOINTS_FILE" "$INGRESSES_FILE" \ - "$SECRETS_FILE" "$INGRESSCLASSES_FILE" "$EVENTS_FILE" 
"$ALB_CONTROLLER_PODS_FILE"; do + for file in "$PODS_FILE" "$DEPLOYMENTS_FILE" "$REPLICASETS_FILE" "$SERVICES_FILE" \ + "$ENDPOINTS_FILE" "$INGRESSES_FILE" "$SECRETS_FILE" "$INGRESSCLASSES_FILE" \ + "$EVENTS_FILE" "$ALB_CONTROLLER_PODS_FILE"; do assert_file_exists "$file" jq . "$file" >/dev/null done + + # problematic_pods.txt is plain text, just assert it exists + assert_file_exists "$PROBLEMATIC_PODS_FILE" } @test "build_context: secrets.json excludes sensitive data field" { @@ -183,3 +192,264 @@ run_build_context() { log_content=$(cat "$ALB_CONTROLLER_LOGS_DIR/alb-controller-pod.log") assert_contains "$log_content" "controller log line" } + +# ============================================================================= +# Problematic Pod Detection +# ============================================================================= +@test "build_context: healthy running pod is not flagged as problematic" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"healthy-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{ + "phase":"Running", + "conditions":[{"type":"Ready","status":"True"}], + "containerStatuses":[{"name":"app","restartCount":0,"state":{"running":{}},"lastState":{}}] + } + }]}' + ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + problematic=$(cat "$PROBLEMATIC_PODS_FILE") + assert_empty "$problematic" +} + +@test "build_context: pod in CrashLoopBackOff is flagged as problematic" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"crash-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{ + "phase":"Running", + "conditions":[{"type":"Ready","status":"False"}], + "containerStatuses":[{"name":"app","restartCount":5,"state":{"waiting":{"reason":"CrashLoopBackOff"}},"lastState":{"terminated":{"exitCode":1}}}] + } + }]}' + ;; + *"describe pod crash-pod"*) echo "describe output for crash-pod" ;; + *"logs"*"crash-pod"*"--previous"*) 
echo "previous crash log" ;; + *"logs"*"crash-pod"*) echo "current log" ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + problematic=$(cat "$PROBLEMATIC_PODS_FILE") + assert_contains "$problematic" "crash-pod" +} + +@test "build_context: pod in Pending phase is flagged as problematic" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"pending-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Pending"} + }]}' + ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + problematic=$(cat "$PROBLEMATIC_PODS_FILE") + assert_contains "$problematic" "pending-pod" +} + +@test "build_context: pod with terminating deletionTimestamp is flagged as problematic" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"terminating-pod","deletionTimestamp":"2026-01-01T00:00:00Z"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Running","conditions":[{"type":"Ready","status":"True"}],"containerStatuses":[{"name":"app","restartCount":0,"state":{"running":{}},"lastState":{}}]} + }]}' + ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + problematic=$(cat "$PROBLEMATIC_PODS_FILE") + assert_contains "$problematic" "terminating-pod" +} + +@test "build_context: pod with failed init container is flagged as problematic" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"init-fail-pod"}, + "spec":{"initContainers":[{"name":"init-db"}],"containers":[{"name":"app"}]}, + "status":{ + "phase":"Pending", + "initContainerStatuses":[{"name":"init-db","restartCount":3,"state":{"waiting":{"reason":"CrashLoopBackOff"}},"lastState":{"terminated":{"exitCode":1}}}] + } + }]}' + ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + problematic=$(cat "$PROBLEMATIC_PODS_FILE") + assert_contains "$problematic" "init-fail-pod" +} 
+ +# ============================================================================= +# Pod Logs and Describe Capture +# ============================================================================= +@test "build_context: captures describe and current logs for problematic pod" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"crash-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{ + "phase":"Running", + "containerStatuses":[{"name":"app","restartCount":2,"state":{"waiting":{"reason":"CrashLoopBackOff"}},"lastState":{"terminated":{"exitCode":1}}}] + } + }]}' + ;; + *"describe pod crash-pod"*) echo "describe output for crash-pod" ;; + *"logs"*"crash-pod"*"--previous"*) echo "previous crash log" ;; + *"logs"*"crash-pod"*) echo "current log line" ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + assert_file_exists "$POD_DESCRIBE_DIR/crash-pod.txt" + describe_content=$(cat "$POD_DESCRIBE_DIR/crash-pod.txt") + assert_contains "$describe_content" "describe output for crash-pod" + + assert_file_exists "$POD_LOGS_DIR/crash-pod.app.log" + current_log=$(cat "$POD_LOGS_DIR/crash-pod.app.log") + assert_contains "$current_log" "current log line" + + assert_file_exists "$POD_LOGS_DIR/crash-pod.app.previous.log" + previous_log=$(cat "$POD_LOGS_DIR/crash-pod.app.previous.log") + assert_contains "$previous_log" "previous crash log" +} + +@test "build_context: skips empty previous logs (container never crashed before)" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"new-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{ + "phase":"Pending", + "containerStatuses":[{"name":"app","restartCount":0,"state":{"waiting":{"reason":"ImagePullBackOff"}},"lastState":{}}] + } + }]}' + ;; + *"describe pod new-pod"*) echo "describe output" ;; + *"logs"*"new-pod"*"--previous"*) return 1 ;; + *"logs"*"new-pod"*) echo "current log" ;; + *) echo '{"items":[]}' ;; + 
esac + } + export -f kubectl + + run_build_context + + # Current log should be saved + assert_file_exists "$POD_LOGS_DIR/new-pod.app.log" + + # Previous log should NOT exist (kubectl returned no output) + [ ! -f "$POD_LOGS_DIR/new-pod.app.previous.log" ] +} + +@test "build_context: captures logs for all containers including init containers" { + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"multi-container-pod"}, + "spec":{ + "initContainers":[{"name":"init-db"}], + "containers":[{"name":"app"},{"name":"sidecar"}] + }, + "status":{ + "phase":"Pending", + "initContainerStatuses":[{"name":"init-db","restartCount":1,"state":{"waiting":{"reason":"CrashLoopBackOff"}},"lastState":{"terminated":{"exitCode":1}}}] + } + }]}' + ;; + *"describe pod multi-container-pod"*) echo "describe output" ;; + *"logs"*"-c init-db"*"--previous"*) echo "init previous" ;; + *"logs"*"-c init-db"*) echo "init current" ;; + *"logs"*"-c app"*"--previous"*) return 1 ;; + *"logs"*"-c app"*) echo "app current" ;; + *"logs"*"-c sidecar"*"--previous"*) return 1 ;; + *"logs"*"-c sidecar"*) echo "sidecar current" ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + # All three containers' current logs should exist + assert_file_exists "$POD_LOGS_DIR/multi-container-pod.init-db.log" + assert_file_exists "$POD_LOGS_DIR/multi-container-pod.app.log" + assert_file_exists "$POD_LOGS_DIR/multi-container-pod.sidecar.log" + + # Only init-db has a previous log + assert_file_exists "$POD_LOGS_DIR/multi-container-pod.init-db.previous.log" + [ ! -f "$POD_LOGS_DIR/multi-container-pod.app.previous.log" ] + [ ! 
-f "$POD_LOGS_DIR/multi-container-pod.sidecar.previous.log" ] +} + +@test "build_context: respects POD_LOG_TAIL_LINES env var" { + export POD_LOG_TAIL_LINES=42 + + # Capture the kubectl invocation to verify --tail value + kubectl() { + case "$*" in + *"get pods"*) + echo '{"items":[{ + "metadata":{"name":"crash-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Pending","containerStatuses":[{"name":"app","restartCount":1,"state":{"waiting":{"reason":"ImagePullBackOff"}},"lastState":{}}]} + }]}' + ;; + *"logs"*"--tail=42"*) echo "tail-42-honored" ;; + *"logs"*) echo "WRONG: tail value was not 42" ;; + *"describe"*) echo "describe" ;; + *) echo '{"items":[]}' ;; + esac + } + export -f kubectl + + run_build_context + + log_content=$(cat "$POD_LOGS_DIR/crash-pod.app.log") + assert_contains "$log_content" "tail-42-honored" + + unset POD_LOG_TAIL_LINES +} diff --git a/k8s/diagnose/tests/diagnose_utils.bats b/k8s/diagnose/tests/diagnose_utils.bats index 4080bd72..bb218b81 100644 --- a/k8s/diagnose/tests/diagnose_utils.bats +++ b/k8s/diagnose/tests/diagnose_utils.bats @@ -77,6 +77,66 @@ strip_ansi() { assert_contains "$clean" "🔧 Action message" } +# ============================================================================= +# evidence_json +# ============================================================================= +@test "evidence_json: builds full schema from all arguments" { + result=$(evidence_json "Test summary" "critical" '["pod-1","pod-2"]' '{"k":"v"}' '["fix it"]') + + summary=$(echo "$result" | jq -r '.summary') + assert_equal "$summary" "Test summary" + + severity=$(echo "$result" | jq -r '.severity') + assert_equal "$severity" "critical" + + affected=$(echo "$result" | jq -c '.affected') + assert_equal "$affected" '["pod-1","pod-2"]' + + details_k=$(echo "$result" | jq -r '.details.k') + assert_equal "$details_k" "v" + + action_0=$(echo "$result" | jq -r '.suggested_actions[0]') + assert_equal "$action_0" "fix it" +} + +@test 
"evidence_json: applies sane defaults for empty optional fields" { + result=$(evidence_json "Quick check" "info" "" "" "") + + affected=$(echo "$result" | jq -c '.affected') + assert_equal "$affected" "[]" + + details=$(echo "$result" | jq -c '.details') + assert_equal "$details" "{}" + + actions=$(echo "$result" | jq -c '.suggested_actions') + assert_equal "$actions" "[]" +} + +@test "evidence_json: emits valid JSON consumable by update_check_result" { + result=$(evidence_json "S" "warning" '["x"]' '{"a":1}' '["b"]') + + # Should be parseable by jq without errors + parsed=$(echo "$result" | jq -c .) + assert_equal "$parsed" '{"summary":"S","severity":"warning","affected":["x"],"details":{"a":1},"suggested_actions":["b"]}' +} + +# ============================================================================= +# exit_code_meaning +# ============================================================================= +@test "exit_code_meaning: maps known codes" { + assert_equal "$(exit_code_meaning 0)" "Clean exit (container finished successfully)" + assert_equal "$(exit_code_meaning 1)" "Application error" + assert_equal "$(exit_code_meaning 137)" "OOMKilled (out of memory)" + assert_equal "$(exit_code_meaning 139)" "SIGSEGV (segmentation fault)" + assert_equal "$(exit_code_meaning 143)" "SIGTERM (graceful termination)" +} + +@test "exit_code_meaning: returns Unknown for unmapped codes" { + assert_equal "$(exit_code_meaning 42)" "Unknown" + assert_equal "$(exit_code_meaning N/A)" "Unknown" + assert_equal "$(exit_code_meaning '')" "Unknown" +} + # ============================================================================= # require_resources # ============================================================================= @@ -97,6 +157,41 @@ strip_ansi() { assert_contains "$clean" "⚠ No pods found with labels app=test in namespace default, check was skipped." 
} +@test "require_resources: emits skipped evidence following the schema" { + # Capture the evidence passed to update_check_result + local captured_evidence_file="$(mktemp)" + update_check_result() { + while [[ $# -gt 0 ]]; do + case "$1" in + --evidence) echo "$2" > "$captured_evidence_file"; shift 2 ;; + *) shift ;; + esac + done + return 0 + } + export -f update_check_result + + require_resources "pods" "" "scope_id=999" "production" || true + + # Validate the schema of the captured evidence + local evidence + evidence=$(cat "$captured_evidence_file") + + severity=$(echo "$evidence" | jq -r '.severity') + assert_equal "$severity" "info" + + summary=$(echo "$evidence" | jq -r '.summary') + assert_contains "$summary" "skipped" + + resource_type=$(echo "$evidence" | jq -r '.details.resource_type') + assert_equal "$resource_type" "pods" + + label_selector=$(echo "$evidence" | jq -r '.details.label_selector') + assert_equal "$label_selector" "scope_id=999" + + rm -f "$captured_evidence_file" +} + # ============================================================================= # require_pods / require_services / require_ingresses # ============================================================================= @@ -255,7 +350,7 @@ strip_ansi() { # ============================================================================= # update_check_result - Log Limits # ============================================================================= -@test "update_check_result: limits logs to 20 lines" { +@test "update_check_result: limits logs to 20 lines by default" { for i in {1..30}; do echo "log line $i" >> "$SCRIPT_LOG_FILE" done @@ -266,6 +361,33 @@ strip_ansi() { [ "$logs_count" -le 20 ] } +@test "update_check_result: --log-tail-lines overrides the default cap" { + for i in {1..100}; do + echo "log line $i" >> "$SCRIPT_LOG_FILE" + done + + update_check_result --status "success" --evidence "{}" --log-tail-lines 80 + + logs_count=$(jq -r '.logs | length' "$SCRIPT_OUTPUT_FILE") + 
[ "$logs_count" = "80" ] + # Last entry should be the most recent line (line 100), oldest in window is line 21 + [ "$(jq -r '.logs[-1]' "$SCRIPT_OUTPUT_FILE")" = "log line 100" ] + [ "$(jq -r '.logs[0]' "$SCRIPT_OUTPUT_FILE")" = "log line 21" ] +} + +@test "update_check_result: --log-tail-lines below total preserves the most recent N lines" { + for i in {1..10}; do + echo "log line $i" >> "$SCRIPT_LOG_FILE" + done + + update_check_result --status "success" --evidence "{}" --log-tail-lines 5 + + logs_count=$(jq -r '.logs | length' "$SCRIPT_OUTPUT_FILE") + [ "$logs_count" = "5" ] + [ "$(jq -r '.logs[0]' "$SCRIPT_OUTPUT_FILE")" = "log line 6" ] + [ "$(jq -r '.logs[-1]' "$SCRIPT_OUTPUT_FILE")" = "log line 10" ] +} + # ============================================================================= # notify_results # ============================================================================= diff --git a/k8s/diagnose/tests/evidence_schema.bats b/k8s/diagnose/tests/evidence_schema.bats new file mode 100644 index 00000000..5309ba2d --- /dev/null +++ b/k8s/diagnose/tests/evidence_schema.bats @@ -0,0 +1,453 @@ +#!/usr/bin/env bats +# ============================================================================= +# Cross-cutting schema validation for all migrated checks. +# Verifies every check writes evidence following the documented schema: +# { summary, severity, affected, details, suggested_actions } +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + source "$BATS_TEST_DIRNAME/../utils/diagnose_utils" + + export NAMESPACE="test-ns" + export LABEL_SELECTOR="app=test" + export SCOPE_LABEL_SELECTOR="scope_id=123" + export DEPLOYMENT_ID="deploy-1" + export NP_OUTPUT_DIR="$(mktemp -d)" + export SCRIPT_OUTPUT_FILE="$(mktemp)" + export SCRIPT_LOG_FILE="$(mktemp)" + echo '{"status":"pending","evidence":{},"logs":[]}' > "$SCRIPT_OUTPUT_FILE" + + # Set up empty data files so every check can require_* + export PODS_FILE="$(mktemp)" + export SERVICES_FILE="$(mktemp)" + export ENDPOINTS_FILE="$(mktemp)" + export INGRESSES_FILE="$(mktemp)" + export SECRETS_FILE="$(mktemp)" + export INGRESSCLASSES_FILE="$(mktemp)" + export EVENTS_FILE="$(mktemp)" + export ALB_CONTROLLER_PODS_FILE="$(mktemp)" + export ALB_CONTROLLER_LOGS_DIR="$(mktemp -d)" + for f in "$PODS_FILE" "$SERVICES_FILE" "$ENDPOINTS_FILE" "$INGRESSES_FILE" \ + "$SECRETS_FILE" "$INGRESSCLASSES_FILE" "$EVENTS_FILE" "$ALB_CONTROLLER_PODS_FILE"; do + echo '{"items":[]}' > "$f" + done + + kubectl() { return 0; } + export -f kubectl +} + +teardown() { + rm -rf "$NP_OUTPUT_DIR" "$ALB_CONTROLLER_LOGS_DIR" + rm -f "$SCRIPT_OUTPUT_FILE" "$SCRIPT_LOG_FILE" "$PODS_FILE" "$SERVICES_FILE" \ + "$ENDPOINTS_FILE" "$INGRESSES_FILE" "$SECRETS_FILE" "$INGRESSCLASSES_FILE" \ + "$EVENTS_FILE" "$ALB_CONTROLLER_PODS_FILE" + unset -f kubectl +} + +reset_output() { + echo '{"status":"pending","evidence":{},"logs":[]}' > "$SCRIPT_OUTPUT_FILE" +} + +# Assert that the evidence object on $SCRIPT_OUTPUT_FILE has the canonical schema: +# summary (string), severity in {critical, warning, info}, +# affected (array), details (object), suggested_actions (array) +assert_evidence_schema() { + local check_name="$1" + + local summary severity affected_kind details_kind actions_kind + summary=$(jq -r '.evidence.summary // empty' "$SCRIPT_OUTPUT_FILE") + severity=$(jq -r '.evidence.severity // empty' "$SCRIPT_OUTPUT_FILE") + affected_kind=$(jq -r 
'.evidence.affected | type' "$SCRIPT_OUTPUT_FILE") + details_kind=$(jq -r '.evidence.details | type' "$SCRIPT_OUTPUT_FILE") + actions_kind=$(jq -r '.evidence.suggested_actions | type' "$SCRIPT_OUTPUT_FILE") + + [[ -n "$summary" ]] || { + echo "[$check_name] missing evidence.summary" + cat "$SCRIPT_OUTPUT_FILE" + return 1 + } + + case "$severity" in + critical|warning|info) ;; + *) echo "[$check_name] invalid severity: '$severity'"; return 1 ;; + esac + + [[ "$affected_kind" == "array" ]] || { echo "[$check_name] evidence.affected must be array, got $affected_kind"; return 1; } + [[ "$details_kind" == "object" ]] || { echo "[$check_name] evidence.details must be object, got $details_kind"; return 1; } + [[ "$actions_kind" == "array" ]] || { echo "[$check_name] evidence.suggested_actions must be array, got $actions_kind"; return 1; } +} + +# ============================================================================= +# Schema validation: skipped path (require_*) +# All checks that call require_pods/services/ingresses must produce schema +# evidence when the resource list is empty. 
+# ============================================================================= +SCOPE_CHECKS_REQUIRE_PODS=( + image_pull_status memory_limits_check resource_availability storage_mounting + container_port_health health_probe_endpoints pod_readiness container_crash_detection +) + +SERVICE_CHECKS_REQUIRE_SERVICES=( + service_selector_match service_endpoints service_port_configuration service_type_validation +) + +NETWORKING_CHECKS_REQUIRE_INGRESSES=( + ingress_class_validation ingress_host_rules ingress_backend_service + ingress_tls_configuration ingress_controller_sync alb_capacity_check +) + +@test "schema: scope checks emit valid skipped evidence when no pods" { + for check in "${SCOPE_CHECKS_REQUIRE_PODS[@]}"; do + reset_output + source "$BATS_TEST_DIRNAME/../scope/$check" || true + assert_evidence_schema "scope/$check (skipped)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "skipped" ]] || { echo "scope/$check expected status=skipped, got $status"; return 1; } + done +} + +@test "schema: service checks emit valid skipped evidence when no services" { + for check in "${SERVICE_CHECKS_REQUIRE_SERVICES[@]}"; do + reset_output + source "$BATS_TEST_DIRNAME/../service/$check" || true + assert_evidence_schema "service/$check (skipped)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "skipped" ]] || { echo "service/$check expected status=skipped, got $status"; return 1; } + done +} + +@test "schema: networking checks emit valid skipped evidence when no ingresses" { + for check in "${NETWORKING_CHECKS_REQUIRE_INGRESSES[@]}"; do + reset_output + source "$BATS_TEST_DIRNAME/../networking/$check" || true + assert_evidence_schema "networking/$check (skipped)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "skipped" ]] || { echo "networking/$check expected status=skipped, got $status"; return 1; } + done +} + +@test "schema: logs/application_log_evidence emits valid skipped evidence when no snapshot" { + 
reset_output + # PROBLEMATIC_PODS_FILE intentionally not set — the check must degrade gracefully. + unset PROBLEMATIC_PODS_FILE + source "$BATS_TEST_DIRNAME/../logs/application_log_evidence" || true + assert_evidence_schema "logs/application_log_evidence (skipped)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "skipped" ]] || { echo "expected skipped, got $status"; return 1; } +} + +# ============================================================================= +# Schema validation: failed path for "no resources" existence checks +# (these don't use require_*; they emit failed evidence directly) +# ============================================================================= +@test "schema: pod_existence emits valid failed evidence when no pods" { + reset_output + echo '{"items":[]}' > "$PODS_FILE" + source "$BATS_TEST_DIRNAME/../scope/pod_existence" || true + assert_evidence_schema "scope/pod_existence (failed)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "failed" ]] || { echo "expected failed, got $status"; return 1; } + + severity=$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE") + [[ "$severity" == "critical" ]] || { echo "expected critical, got $severity"; return 1; } +} + +@test "schema: service_existence emits valid failed evidence when no services" { + reset_output + echo '{"items":[]}' > "$SERVICES_FILE" + source "$BATS_TEST_DIRNAME/../service/service_existence" || true + assert_evidence_schema "service/service_existence (failed)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "failed" ]] || return 1 +} + +@test "schema: ingress_existence emits valid failed evidence when no ingresses" { + reset_output + echo '{"items":[]}' > "$INGRESSES_FILE" + source "$BATS_TEST_DIRNAME/../networking/ingress_existence" || true + assert_evidence_schema "networking/ingress_existence (failed)" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + [[ "$status" == "failed" ]] || return 1 +} + +# 
============================================================================= +# Schema validation: success path for existence checks +# ============================================================================= +@test "schema: existence checks emit valid info evidence when resources exist" { + # pod_existence + reset_output + echo '{"items":[{"metadata":{"name":"p1"}}]}' > "$PODS_FILE" + source "$BATS_TEST_DIRNAME/../scope/pod_existence" || true + assert_evidence_schema "scope/pod_existence (success)" + [[ "$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE")" == "info" ]] || return 1 + + # service_existence + reset_output + echo '{"items":[{"metadata":{"name":"s1"}}]}' > "$SERVICES_FILE" + source "$BATS_TEST_DIRNAME/../service/service_existence" || true + assert_evidence_schema "service/service_existence (success)" + [[ "$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE")" == "info" ]] || return 1 + + # ingress_existence + reset_output + echo '{"items":[{"metadata":{"name":"i1"},"spec":{"rules":[]}}]}' > "$INGRESSES_FILE" + source "$BATS_TEST_DIRNAME/../networking/ingress_existence" || true + assert_evidence_schema "networking/ingress_existence (success)" + [[ "$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE")" == "info" ]] || return 1 +} + +# ============================================================================= +# A few targeted "critical" path checks with realistic failure data +# ============================================================================= +@test "schema: image_pull_status emits valid critical evidence with affected pods" { + reset_output + cat > "$PODS_FILE" << 'EOF' +{ + "items": [{ + "metadata": {"name": "p1"}, + "spec": {"containers":[{"name":"app","image":"foo:bar"}]}, + "status": {"containerStatuses":[{"name":"app","state":{"waiting":{"reason":"ImagePullBackOff","message":"pull failed"}}}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/image_pull_status" || true + assert_evidence_schema "scope/image_pull_status (failed)" 
+ + [[ "$(jq -r '.status' "$SCRIPT_OUTPUT_FILE")" == "failed" ]] || return 1 + [[ "$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE")" == "critical" ]] || return 1 + affected=$(jq -c '.evidence.affected' "$SCRIPT_OUTPUT_FILE") + [[ "$affected" == '["p1"]' ]] || { echo "expected affected=[p1], got $affected"; return 1; } +} + +@test "schema: memory_limits_check emits valid critical evidence on OOMKilled" { + reset_output + cat > "$PODS_FILE" << 'EOF' +{ + "items": [{ + "metadata": {"name": "oom-pod"}, + "spec": {"containers":[{"name":"app","resources":{"limits":{"memory":"128Mi"},"requests":{"memory":"64Mi"}}}]}, + "status": {"containerStatuses":[{"name":"app","lastState":{"terminated":{"reason":"OOMKilled","exitCode":137}}}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/memory_limits_check" || true + assert_evidence_schema "scope/memory_limits_check (failed)" + + [[ "$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE")" == "critical" ]] || return 1 + oom=$(jq -r '.evidence.details.oom_killed[0].memory_limit' "$SCRIPT_OUTPUT_FILE") + [[ "$oom" == "128Mi" ]] || { echo "expected memory_limit=128Mi, got $oom"; return 1; } +} + +@test "schema: resource_availability emits valid critical evidence with insufficient_cpu flag" { + reset_output + cat > "$PODS_FILE" << 'EOF' +{ + "items": [{ + "metadata": {"name": "unsched"}, + "status": {"phase":"Pending","conditions":[{"type":"PodScheduled","status":"False","reason":"Unschedulable","message":"0/3 nodes available: insufficient cpu"}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/resource_availability" || true + assert_evidence_schema "scope/resource_availability (failed)" + + cpu=$(jq -r '.evidence.details.cluster_insufficient_cpu' "$SCRIPT_OUTPUT_FILE") + [[ "$cpu" == "true" ]] || { echo "expected insufficient_cpu=true, got $cpu"; return 1; } +} + +@test "schema: ingress_class_validation emits valid critical evidence on missing class" { + reset_output + cat > "$INGRESSES_FILE" << 'EOF' +{ + 
"items":[{"metadata":{"name":"my-ing"},"spec":{"ingressClassName":"missing-class"}}] +} +EOF + echo '{"items":[]}' > "$INGRESSCLASSES_FILE" + source "$BATS_TEST_DIRNAME/../networking/ingress_class_validation" || true + assert_evidence_schema "networking/ingress_class_validation (failed)" + + affected=$(jq -c '.evidence.affected' "$SCRIPT_OUTPUT_FILE") + [[ "$affected" == '["my-ing"]' ]] || return 1 +} + +# ============================================================================= +# Embedded logs in evidence (Phase C — for AI post-mortem consumption) +# ============================================================================= + +# Helper: prepare a fake POD_LOGS_DIR with current/previous logs for a given pod+container +setup_pod_logs() { + local pod="$1" container="$2" current="$3" previous="$4" + export POD_LOGS_DIR="${POD_LOGS_DIR:-$(mktemp -d)}" + echo "$current" > "$POD_LOGS_DIR/${pod}.${container}.log" + if [[ -n "$previous" ]]; then + echo "$previous" > "$POD_LOGS_DIR/${pod}.${container}.previous.log" + fi +} + +@test "logs: read_log_tail returns [] when POD_LOGS_DIR unset or file missing" { + unset POD_LOGS_DIR + result=$(read_log_tail "any-pod" "any-container" "current") + [[ "$result" == "[]" ]] || { echo "expected [], got $result"; return 1; } + + export POD_LOGS_DIR="$(mktemp -d)" + result=$(read_log_tail "missing-pod" "missing-container" "previous") + [[ "$result" == "[]" ]] || { echo "expected [], got $result"; return 1; } +} + +@test "logs: read_log_tail returns lines as JSON array when log exists" { + export POD_LOGS_DIR="$(mktemp -d)" + printf 'line1\nline2\nline3\n' > "$POD_LOGS_DIR/p.c.log" + + result=$(read_log_tail "p" "c" "current") + count=$(echo "$result" | jq 'length') + [[ "$count" == "3" ]] || { echo "expected 3 lines, got $count"; return 1; } + + first=$(echo "$result" | jq -r '.[0]') + [[ "$first" == "line1" ]] || { echo "expected line1, got $first"; return 1; } +} + +@test "logs: read_log_tail respects EVIDENCE_LOG_TAIL_LINES" { +
export POD_LOGS_DIR="$(mktemp -d)" + for i in $(seq 1 100); do echo "line $i"; done > "$POD_LOGS_DIR/p.c.log" + + EVIDENCE_LOG_TAIL_LINES=5 result=$(read_log_tail "p" "c" "current") + count=$(echo "$result" | jq 'length') + [[ "$count" == "5" ]] || { echo "expected 5 lines, got $count"; return 1; } + + # Last 5 lines should be 96..100 + last=$(echo "$result" | jq -r '.[-1]') + [[ "$last" == "line 100" ]] || { echo "expected 'line 100', got '$last'"; return 1; } +} + +@test "logs: container_crash_detection embeds previous logs in CrashLoopBackOff fact" { + reset_output + setup_pod_logs "crash-pod" "app" "" "Caused by: NullPointerException at line 42" + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"crash-pod"}, + "status":{"containerStatuses":[{"name":"app","restartCount":5,"state":{"waiting":{"reason":"CrashLoopBackOff"}},"lastState":{"terminated":{"exitCode":1,"reason":"Error"}}}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/container_crash_detection" || true + + prev_logs=$(jq -c '.evidence.details.crash_loop_back_off[0].previous_logs' "$SCRIPT_OUTPUT_FILE") + [[ "$prev_logs" != "[]" && "$prev_logs" != "null" ]] || { echo "expected non-empty previous_logs, got $prev_logs"; return 1; } + + contains=$(echo "$prev_logs" | jq -r '.[] | select(test("NullPointerException"))' | head -1) + [[ -n "$contains" ]] || { echo "expected NullPointerException in logs, got $prev_logs"; return 1; } +} + +@test "logs: memory_limits_check embeds previous logs in OOMKilled fact" { + reset_output + setup_pod_logs "oom-pod" "app" "" "java.lang.OutOfMemoryError: Java heap space" + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"oom-pod"}, + "spec":{"containers":[{"name":"app","resources":{"limits":{"memory":"128Mi"}}}]}, + "status":{"containerStatuses":[{"name":"app","lastState":{"terminated":{"reason":"OOMKilled","exitCode":137}}}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/memory_limits_check" || true + + prev_logs=$(jq -c 
'.evidence.details.oom_killed[0].previous_logs' "$SCRIPT_OUTPUT_FILE") + contains=$(echo "$prev_logs" | jq -r '.[] | select(test("OutOfMemoryError"))' | head -1) + [[ -n "$contains" ]] || { echo "expected OutOfMemoryError in OOM logs, got $prev_logs"; return 1; } +} + +@test "logs: container_port_health embeds current logs in port_not_listening issue" { + # The script needs nc + timeout to exercise the connectivity-fail path. On + # macOS dev hosts timeout (GNU coreutils) is not in PATH by default — skip. + command -v nc >/dev/null 2>&1 && command -v timeout >/dev/null 2>&1 || \ + skip "nc + timeout required to exercise this path" + + reset_output + setup_pod_logs "broken-pod" "app" "ERROR: failed to bind to 0.0.0.0:8080: permission denied" "" + # Pick a port that's almost certainly not listening on 127.0.0.1 + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"broken-pod"}, + "spec":{"containers":[{"name":"app","ports":[{"containerPort":59999}]}]}, + "status":{"phase":"Running","podIP":"127.0.0.1","containerStatuses":[{"name":"app","ready":true,"state":{"running":{}},"restartCount":0}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/container_port_health" || true + + issue_logs=$(jq -c '.evidence.details.issues[0].container_logs // empty' "$SCRIPT_OUTPUT_FILE") + [[ -n "$issue_logs" && "$issue_logs" != "null" ]] || { echo "expected container_logs in issue"; cat "$SCRIPT_OUTPUT_FILE"; return 1; } + + contains=$(echo "$issue_logs" | jq -r '.[] | select(test("failed to bind"))' | head -1) + [[ -n "$contains" ]] || { echo "expected 'failed to bind' in logs, got $issue_logs"; return 1; } +} + +@test "logs: pod_readiness embeds current logs only for stuck (not_ready) pods" { + # not_ready (failure path) → should embed logs + reset_output + setup_pod_logs "stuck-pod" "app" "INFO: connecting to db... 
still trying" "" + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"stuck-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Running","conditions":[{"type":"Ready","status":"False","reason":"ContainersNotReady"}],"containerStatuses":[{"name":"app","ready":false,"state":{"running":{}},"restartCount":0}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/pod_readiness" || true + + pod_state=$(jq -r '.evidence.details.pods[0].state' "$SCRIPT_OUTPUT_FILE") + [[ "$pod_state" == "not_ready" ]] || { echo "expected state=not_ready, got $pod_state"; return 1; } + + pod_logs=$(jq -c '.evidence.details.pods[0].container_logs' "$SCRIPT_OUTPUT_FILE") + contains=$(echo "$pod_logs" | jq -r '.[].current_logs[] | select(test("connecting to db"))' | head -1) + [[ -n "$contains" ]] || { echo "expected 'connecting to db' in logs, got $pod_logs"; return 1; } + + # starting (warning path) → should NOT embed logs (avoid noise during normal startup) + reset_output + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"starting-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Pending","conditions":[{"type":"Ready","status":"False"}],"containerStatuses":[{"name":"app","ready":false,"state":{"waiting":{"reason":"ContainerCreating"}}}]} + }] +} +EOF + source "$BATS_TEST_DIRNAME/../scope/pod_readiness" || true + + starting_logs=$(jq -c '.evidence.details.pods[0].container_logs' "$SCRIPT_OUTPUT_FILE") + [[ "$starting_logs" == "[]" ]] || { echo "expected empty container_logs for starting pod, got $starting_logs"; return 1; } +} + +@test "logs: success path does not embed logs (keeps payload light)" { + reset_output + cat > "$PODS_FILE" << 'EOF' +{ + "items":[{ + "metadata":{"name":"happy-pod"}, + "spec":{"containers":[{"name":"app"}]}, + "status":{"phase":"Running","conditions":[{"type":"Ready","status":"True"}],"containerStatuses":[{"name":"app","ready":true,"state":{"running":{}},"restartCount":0}]} + }] +} +EOF + source 
"$BATS_TEST_DIRNAME/../scope/container_crash_detection" || true + + details=$(jq -c '.evidence.details' "$SCRIPT_OUTPUT_FILE") + # Success details should not have crash_loop_back_off populated with logs + has_logs=$(echo "$details" | jq -r '.. | objects | select(has("previous_logs") or has("current_logs")) | "yes"' | head -1) + [[ -z "$has_logs" ]] || { echo "expected no logs in success evidence, got: $details"; return 1; } +} diff --git a/k8s/diagnose/tests/kubectl_get.bats b/k8s/diagnose/tests/kubectl_get.bats new file mode 100644 index 00000000..0057714a --- /dev/null +++ b/k8s/diagnose/tests/kubectl_get.bats @@ -0,0 +1,371 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for kubectl_get - read-only kubectl wrapper for troubleshooting +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + log() { if [ "$1" = "error" ]; then echo "$2" >&2; else echo "$2"; fi; } + export -f log + + export SCRIPT="$PROJECT_ROOT/k8s/kubectl_get" + export K8S_NAMESPACE="default-ns" + + # Mock kubectl: echo back what was received so tests can assert the args. 
+ kubectl() { + echo "kubectl-called: $*" + return 0 + } + export -f kubectl +} + +teardown() { + unset -f kubectl log + unset K8S_NAMESPACE SCRIPT PROJECT_ROOT +} + +# ============================================================================= +# Usage +# ============================================================================= +@test "kubectl_get: shows usage and exits 1 when no args provided" { + run bash "$SCRIPT" + + [ "$status" -eq 1 ] + assert_contains "$output" "Usage:" + assert_contains "$output" "kubectl get" +} + +# ============================================================================= +# Hardcoded verb: only 'get' can be invoked +# ============================================================================= +@test "kubectl_get: invokes kubectl with 'get' verb followed by user args" { + run bash "$SCRIPT" pods -o wide + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods -o wide" +} + +# ============================================================================= +# Default namespace injection +# ============================================================================= +@test "kubectl_get: injects K8S_NAMESPACE when no namespace flag provided" { + run bash "$SCRIPT" pods + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods -n default-ns" +} + +@test "kubectl_get: does not inject namespace when -n is provided" { + run bash "$SCRIPT" pods -n kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods -n kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_get: does not inject namespace when --namespace is provided" { + run bash "$SCRIPT" pods --namespace kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods --namespace kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_get: does not inject namespace when --namespace=value form is provided" { + run bash "$SCRIPT" pods 
--namespace=kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods --namespace=kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_get: does not inject namespace when -A is provided" { + run bash "$SCRIPT" pods -A + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods -A" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_get: does not inject namespace when --all-namespaces is provided" { + run bash "$SCRIPT" pods --all-namespaces + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods --all-namespaces" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_get: does not inject namespace when K8S_NAMESPACE is unset" { + unset K8S_NAMESPACE + + run bash "$SCRIPT" pods + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods" + [[ "$output" != *"-n "* ]] +} + +# ============================================================================= +# Blocked flags +# ============================================================================= +@test "kubectl_get: rejects --server" { + run bash "$SCRIPT" pods --server https://evil.example.com + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--server'" +} + +@test "kubectl_get: rejects --server=value form" { + run bash "$SCRIPT" pods --server=https://evil.example.com + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--server=https://evil.example.com'" +} + +@test "kubectl_get: rejects --kubeconfig" { + run bash "$SCRIPT" pods --kubeconfig /tmp/evil.yaml + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--kubeconfig'" +} + +@test "kubectl_get: rejects --token" { + run bash "$SCRIPT" pods --token abc123 + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--token'" +} + +@test "kubectl_get: rejects --as (impersonation)" { + run bash "$SCRIPT" pods --as cluster-admin + + [ "$status" -eq 1 ] + assert_contains 
"$output" "Refusing argument '--as'" +} + +@test "kubectl_get: rejects --as-group" { + run bash "$SCRIPT" pods --as-group system:masters + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--as-group'" +} + +@test "kubectl_get: rejects --context" { + run bash "$SCRIPT" pods --context other-cluster + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--context'" +} + +@test "kubectl_get: rejects --insecure-skip-tls-verify" { + run bash "$SCRIPT" pods --insecure-skip-tls-verify + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--insecure-skip-tls-verify'" +} + +@test "kubectl_get: rejects -w (avoid hangs)" { + run bash "$SCRIPT" pods -w + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '-w'" +} + +@test "kubectl_get: rejects --watch" { + run bash "$SCRIPT" pods --watch + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--watch'" +} + +@test "kubectl_get: blocked flag in middle of args is still detected" { + run bash "$SCRIPT" pods -n my-ns --token abc123 -o yaml + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--token'" +} + +# ============================================================================= +# Shell injection safety +# ============================================================================= +@test "kubectl_get: passes args verbatim — no shell interpretation of metachars" { + # If any of these metachars were interpreted by a shell, kubectl would + # never see them as part of a single arg. Mock echoes args back as-is. 
+ run bash "$SCRIPT" pods -l 'app=foo;bar|baz`whoami`' + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: get pods -l app=foo;bar|baz\`whoami\` -n default-ns" +} + +# ============================================================================= +# Exit code propagation +# ============================================================================= +@test "kubectl_get: propagates kubectl exit code on failure" { + kubectl() { + echo "Error from server (NotFound): pods 'foo' not found" >&2 + return 1 + } + export -f kubectl + + run bash "$SCRIPT" pods foo + + [ "$status" -eq 1 ] +} + +# ============================================================================= +# Secret content stripping +# ============================================================================= +# Mock that returns realistic secret JSON when invoked with secret + -o json. +mock_kubectl_with_secrets() { + kubectl() { + if [[ "$*" == *"secret"* && "$*" == *"-o json"* ]]; then + # Single secret (when name is in args) returns object; otherwise list. 
+ if [[ "$*" == *"secret foo"* || "$*" == *"secret/foo"* ]]; then + cat <<'EOF' +{ + "metadata": {"name": "foo", "namespace": "default-ns"}, + "type": "Opaque", + "data": {"password": "c3VwZXJzZWNyZXQ="}, + "stringData": {"plain": "alsosecret"} +} +EOF + else + cat <<'EOF' +{ + "items": [ + { + "metadata": {"name": "foo", "namespace": "default-ns"}, + "type": "Opaque", + "data": {"password": "c3VwZXJzZWNyZXQ="}, + "stringData": {"plain": "alsosecret"} + } + ] +} +EOF + fi + return 0 + fi + echo "kubectl-called: $*" + } + export -f kubectl +} + +@test "kubectl_get: strips .data and .stringData from secret list output" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secrets + + [ "$status" -eq 0 ] + # Metadata still present + assert_contains "$output" "\"name\": \"foo\"" + assert_contains "$output" "\"type\": \"Opaque\"" + # Sensitive content gone + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] + [[ "$output" != *"alsosecret"* ]] + [[ "$output" != *"\"data\""* ]] + [[ "$output" != *"\"stringData\""* ]] +} + +@test "kubectl_get: strips .data and .stringData from single secret output" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secret foo + + [ "$status" -eq 0 ] + assert_contains "$output" "\"name\": \"foo\"" + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] + [[ "$output" != *"alsosecret"* ]] + [[ "$output" != *"\"data\""* ]] +} + +@test "kubectl_get: works for 'secret' (singular) resource name" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secret + + [ "$status" -eq 0 ] + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] +} + +@test "kubectl_get: works for secret/name slash form" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secret/foo + + [ "$status" -eq 0 ] + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] +} + +@test "kubectl_get: works for secret,configmap comma form" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secret,configmap + + [ "$status" -eq 0 ] + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] +} + +@test "kubectl_get: forces -o json when user requested -o 
yaml on secrets" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" secrets -o yaml + + [ "$status" -eq 0 ] + assert_contains "$output" "Output forced to JSON" + [[ "$output" != *"c3VwZXJzZWNyZXQ="* ]] +} + +@test "kubectl_get: rejects -o jsonpath on secrets" { + run bash "$SCRIPT" secrets -o "jsonpath={.items[*].data.password}" + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing -o" + assert_contains "$output" "jsonpath" +} + +@test "kubectl_get: rejects -o go-template on secrets" { + run bash "$SCRIPT" secrets -o "go-template={{.items}}" + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing -o" + assert_contains "$output" "go-template" +} + +@test "kubectl_get: rejects -o custom-columns on secrets" { + run bash "$SCRIPT" secrets -o "custom-columns=NAME:.metadata.name,DATA:.data" + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing -o" + assert_contains "$output" "custom-columns" +} + +@test "kubectl_get: rejects --output=jsonpath= on secrets" { + run bash "$SCRIPT" secrets --output="jsonpath={.items[*].data}" + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing -o" +} + +@test "kubectl_get: secret filtering does not affect non-secret resources" { + mock_kubectl_with_secrets + + run bash "$SCRIPT" pods -o yaml + + [ "$status" -eq 0 ] + # Goes through the normal (non-filtered) path: mock echoes args. 
+ assert_contains "$output" "kubectl-called: get pods -o yaml -n default-ns" +} + +@test "kubectl_get: propagates kubectl failure exit code through jq pipe" { + kubectl() { + echo "Error from server (Forbidden)" >&2 + return 1 + } + export -f kubectl + + run bash "$SCRIPT" secrets + + [ "$status" -eq 1 ] +} diff --git a/k8s/diagnose/tests/kubectl_logs.bats b/k8s/diagnose/tests/kubectl_logs.bats new file mode 100644 index 00000000..088fc9a7 --- /dev/null +++ b/k8s/diagnose/tests/kubectl_logs.bats @@ -0,0 +1,230 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for kubectl_logs - read-only, non-streaming kubectl logs wrapper +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + log() { if [ "$1" = "error" ]; then echo "$2" >&2; else echo "$2"; fi; } + export -f log + + export SCRIPT="$PROJECT_ROOT/k8s/kubectl_logs" + export K8S_NAMESPACE="default-ns" + + # Mock kubectl: echo back what was received so tests can assert the args. 
+ kubectl() { + echo "kubectl-called: $*" + return 0 + } + export -f kubectl +} + +teardown() { + unset -f kubectl log + unset K8S_NAMESPACE SCRIPT PROJECT_ROOT +} + +# ============================================================================= +# Usage +# ============================================================================= +@test "kubectl_logs: shows usage and exits 1 when no args provided" { + run bash "$SCRIPT" + + [ "$status" -eq 1 ] + assert_contains "$output" "Usage:" + assert_contains "$output" "kubectl logs" +} + +# ============================================================================= +# Hardcoded verb: only 'logs' can be invoked +# ============================================================================= +@test "kubectl_logs: invokes kubectl with 'logs' verb followed by user args" { + run bash "$SCRIPT" my-pod -c my-container + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod -c my-container" +} + +@test "kubectl_logs: passes --tail / --since / --previous through unchanged" { + run bash "$SCRIPT" my-pod --tail 200 --since 1h --previous + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod --tail 200 --since 1h --previous" +} + +# ============================================================================= +# Default namespace injection +# ============================================================================= +@test "kubectl_logs: injects K8S_NAMESPACE when no namespace flag provided" { + run bash "$SCRIPT" my-pod + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod -n default-ns" +} + +@test "kubectl_logs: does not inject namespace when -n is provided" { + run bash "$SCRIPT" my-pod -n kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod -n kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_logs: does not inject namespace when --namespace is provided" { + run bash "$SCRIPT" my-pod 
--namespace kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod --namespace kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_logs: does not inject namespace when --namespace=value form is provided" { + run bash "$SCRIPT" my-pod --namespace=kube-system + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod --namespace=kube-system" + [[ "$output" != *"-n default-ns"* ]] +} + +@test "kubectl_logs: does not inject namespace when K8S_NAMESPACE is unset" { + unset K8S_NAMESPACE + + run bash "$SCRIPT" my-pod + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs my-pod" + [[ "$output" != *"-n "* ]] +} + +# ============================================================================= +# Streaming flags are blocked +# ============================================================================= +@test "kubectl_logs: rejects -f (would stream)" { + run bash "$SCRIPT" my-pod -f + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '-f'" +} + +@test "kubectl_logs: rejects --follow (would stream)" { + run bash "$SCRIPT" my-pod --follow + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--follow'" +} + +@test "kubectl_logs: rejects --follow=true (would stream)" { + run bash "$SCRIPT" my-pod --follow=true + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--follow=true'" +} + +@test "kubectl_logs: rejects --follow=false too (simpler to block the flag entirely)" { + run bash "$SCRIPT" my-pod --follow=false + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--follow=false'" +} + +# ============================================================================= +# Blocked auth/context flags +# ============================================================================= +@test "kubectl_logs: rejects --server" { + run bash "$SCRIPT" my-pod --server https://evil.example.com + + [ "$status" -eq 1 ] + 
assert_contains "$output" "Refusing argument '--server'" +} + +@test "kubectl_logs: rejects --server=value form" { + run bash "$SCRIPT" my-pod --server=https://evil.example.com + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--server=https://evil.example.com'" +} + +@test "kubectl_logs: rejects --kubeconfig" { + run bash "$SCRIPT" my-pod --kubeconfig /tmp/evil.yaml + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--kubeconfig'" +} + +@test "kubectl_logs: rejects --token" { + run bash "$SCRIPT" my-pod --token abc123 + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--token'" +} + +@test "kubectl_logs: rejects --as (impersonation)" { + run bash "$SCRIPT" my-pod --as cluster-admin + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--as'" +} + +@test "kubectl_logs: rejects --as-group" { + run bash "$SCRIPT" my-pod --as-group system:masters + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--as-group'" +} + +@test "kubectl_logs: rejects --context" { + run bash "$SCRIPT" my-pod --context other-cluster + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--context'" +} + +@test "kubectl_logs: rejects --insecure-skip-tls-verify" { + run bash "$SCRIPT" my-pod --insecure-skip-tls-verify + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--insecure-skip-tls-verify'" +} + +@test "kubectl_logs: blocked flag in middle of args is still detected" { + run bash "$SCRIPT" my-pod -n my-ns --token abc123 --tail 100 + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '--token'" +} + +@test "kubectl_logs: blocked streaming flag in middle of args is still detected" { + run bash "$SCRIPT" my-pod --tail 100 -f --timestamps + + [ "$status" -eq 1 ] + assert_contains "$output" "Refusing argument '-f'" +} + +# ============================================================================= +# Shell injection safety +# 
============================================================================= +@test "kubectl_logs: passes args verbatim — no shell interpretation of metachars" { + # If any of these metachars were interpreted by a shell, kubectl would + # never see them as part of a single arg. Mock echoes args back as-is. + run bash "$SCRIPT" -l 'app=foo;bar|baz`whoami`' + + [ "$status" -eq 0 ] + assert_contains "$output" "kubectl-called: logs -l app=foo;bar|baz\`whoami\` -n default-ns" +} + +# ============================================================================= +# Exit code propagation +# ============================================================================= +@test "kubectl_logs: propagates kubectl exit code on failure" { + kubectl() { + echo "Error from server (NotFound): pods 'foo' not found" >&2 + return 1 + } + export -f kubectl + + run bash "$SCRIPT" foo + + [ "$status" -eq 1 ] +} diff --git a/k8s/diagnose/tests/logs/application_log_evidence.bats b/k8s/diagnose/tests/logs/application_log_evidence.bats new file mode 100644 index 00000000..6daaff8c --- /dev/null +++ b/k8s/diagnose/tests/logs/application_log_evidence.bats @@ -0,0 +1,242 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for diagnose/logs/application_log_evidence +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + source "$BATS_TEST_DIRNAME/../../utils/diagnose_utils" + + export NAMESPACE="test-ns" + export LABEL_SELECTOR="app=test" + export NP_OUTPUT_DIR="$(mktemp -d)" + export SCRIPT_OUTPUT_FILE="$(mktemp)" + export SCRIPT_LOG_FILE="$(mktemp)" + echo '{"status":"pending","evidence":{},"logs":[]}' > "$SCRIPT_OUTPUT_FILE" + + export PODS_FILE="$(mktemp)" + export DATA_DIR="$(mktemp -d)" + export POD_LOGS_DIR="$DATA_DIR/pod_logs" + export PROBLEMATIC_PODS_FILE="$DATA_DIR/problematic_pods.txt" + mkdir -p "$POD_LOGS_DIR" + export EVIDENCE_LOG_TAIL_LINES=50 +} + +teardown() { + rm -rf "$NP_OUTPUT_DIR" "$DATA_DIR" + rm -f "$SCRIPT_OUTPUT_FILE" "$SCRIPT_LOG_FILE" "$PODS_FILE" +} + +evidence() { + jq -r "$1" "$SCRIPT_OUTPUT_FILE" +} + +# ============================================================================= +# Snapshot-unavailable path +# ============================================================================= +@test "logs/application_log_evidence: skipped when PROBLEMATIC_PODS_FILE missing" { + rm -f "$PROBLEMATIC_PODS_FILE" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + [ "$(evidence '.status')" = "skipped" ] + [ "$(evidence '.evidence.severity')" = "info" ] + [ "$(evidence '.evidence.details.pods_with_logs')" = "0" ] +} + +# ============================================================================= +# No problematic pods +# ============================================================================= +@test "logs/application_log_evidence: success with zero counters when no problematic pods" { + : > "$PROBLEMATIC_PODS_FILE" + echo '{"items":[]}' > "$PODS_FILE" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + [ "$(evidence '.status')" = "success" ] + [ "$(evidence 
'.evidence.details.pods_with_logs')" = "0" ] + [ "$(evidence '.evidence.details.problematic_pod_count')" = "0" ] + assert_contains "$(evidence '.evidence.summary')" "No problematic pods" +} + +# ============================================================================= +# Focuses on the application container only — sidecars are not echoed +# ============================================================================= +@test "logs/application_log_evidence: echoes only application logs (ignores sidecars)" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{ + "metadata":{"name":"pod-1"}, + "spec":{"containers":[{"name":"http"},{"name":"application"}]} +}]} +EOF + echo "nginx sidecar noise" > "$POD_LOGS_DIR/pod-1.http.log" + printf 'starting...\nERROR: missing DATABASE_URL\n' > "$POD_LOGS_DIR/pod-1.application.log" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + # Header + lines prefixed with "| " appear in stdout (captured by UI logs[]) + assert_contains "$output" "application log tail from pod-1" + assert_contains "$output" "| starting..." 
+ assert_contains "$output" "| ERROR: missing DATABASE_URL" + # Sidecar must NOT leak + if [[ "$output" == *"nginx sidecar noise"* ]]; then + echo "Sidecar log leaked into stdout" + return 1 + fi + # Affected lists the pod, counters reflect success + [ "$(evidence '.evidence.affected[0]')" = "pod-1" ] + [ "$(evidence '.evidence.details.pods_with_logs')" = "1" ] + [ "$(evidence '.evidence.details.problematic_pod_count')" = "1" ] +} + +# ============================================================================= +# Evidence has NO log text — only counters +# ============================================================================= +@test "logs/application_log_evidence: evidence.details exposes only counters, never log text" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{"metadata":{"name":"pod-1"},"spec":{"containers":[{"name":"application"}]}}]} +EOF + echo "secret log line that must not appear in evidence" > "$POD_LOGS_DIR/pod-1.application.log" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + # details has only the two counters, no pods array, no logs field + local keys + keys=$(jq -r '.evidence.details | keys | sort | join(",")' "$SCRIPT_OUTPUT_FILE") + [ "$keys" = "pods_with_logs,problematic_pod_count" ] + # The log text must not appear anywhere in the evidence object + if [[ "$(jq -c '.evidence' "$SCRIPT_OUTPUT_FILE")" == *"secret log line"* ]]; then + echo "Log text leaked into evidence" + return 1 + fi + # But it MUST appear in stdout + assert_contains "$output" "| secret log line" +} + +# ============================================================================= +# Previous + current are merged into a single chronological stream on stdout +# ============================================================================= +@test "logs/application_log_evidence: stdout shows previous logs first, then 
current" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{"metadata":{"name":"pod-1"},"spec":{"containers":[{"name":"application"}]}}]} +EOF + echo "current run" > "$POD_LOGS_DIR/pod-1.application.log" + echo "previous crash output" > "$POD_LOGS_DIR/pod-1.application.previous.log" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + # Both lines must appear in stdout + assert_contains "$output" "| previous crash output" + assert_contains "$output" "| current run" + # And previous must come before current + local prev_line current_line + prev_line=$(printf '%s\n' "$output" | grep -n "previous crash output" | head -1 | cut -d: -f1) + current_line=$(printf '%s\n' "$output" | grep -n "current run" | head -1 | cut -d: -f1) + [ "$prev_line" -lt "$current_line" ] || { echo "Expected previous to print before current"; return 1; } +} + +# ============================================================================= +# Caps logs to last 50 lines (EVIDENCE_LOG_TAIL_LINES default) +# ============================================================================= +@test "logs/application_log_evidence: caps echoed logs to the last 50 lines" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{"metadata":{"name":"pod-1"},"spec":{"containers":[{"name":"application"}]}}]} +EOF + for i in $(seq 1 30); do echo "prev-$i" >> "$POD_LOGS_DIR/pod-1.application.previous.log"; done + for i in $(seq 1 30); do echo "curr-$i" >> "$POD_LOGS_DIR/pod-1.application.log"; done + + export EVIDENCE_LOG_TAIL_LINES=50 + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + # 60 input lines, capped at 50 → the first 10 previous lines must drop off + if [[ "$output" == *"| prev-1"$'\n'* || "$output" == *"| prev-1 "* ]]; 
then + : # 'prev-1' is a prefix of 'prev-10', need stricter match + fi + # Stricter check: 'prev-10' should not appear because only prev-11..30 + curr-1..30 fit + if printf '%s\n' "$output" | grep -qE '\| prev-10$'; then + echo "Expected prev-10 to be dropped (out of tail-50 window)" + return 1 + fi + # But prev-11 should be there (first survivor) + printf '%s\n' "$output" | grep -qE '\| prev-11$' || { echo "Expected prev-11 to survive"; return 1; } + # And the latest current line is the last visible + printf '%s\n' "$output" | grep -qE '\| curr-30$' || { echo "Expected curr-30 to survive"; return 1; } +} + +# ============================================================================= +# Pod without application container is skipped +# ============================================================================= +@test "logs/application_log_evidence: skips pods that have no application container" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{"metadata":{"name":"pod-1"},"spec":{"containers":[{"name":"sidecar-only"}]}}]} +EOF + echo "irrelevant" > "$POD_LOGS_DIR/pod-1.sidecar-only.log" + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + [ "$(evidence '.evidence.details.pods_with_logs')" = "0" ] + [ "$(evidence '.evidence.details.problematic_pod_count')" = "1" ] + assert_contains "$(evidence '.evidence.summary')" "No application logs available" +} + +# ============================================================================= +# Pod has application container but it produced no logs +# ============================================================================= +@test "logs/application_log_evidence: drops pod whose application container has no logs" { + echo "pod-1" > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{"items":[{"metadata":{"name":"pod-1"},"spec":{"containers":[{"name":"application"}]}}]} +EOF 
+ # No log files + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + [ "$(evidence '.evidence.details.pods_with_logs')" = "0" ] + assert_contains "$(evidence '.evidence.summary')" "image may never have started" +} + +# ============================================================================= +# Multiple pods aggregated +# ============================================================================= +@test "logs/application_log_evidence: aggregates affected across multiple pods" { + printf 'pod-a\npod-b\npod-c\n' > "$PROBLEMATIC_PODS_FILE" + cat > "$PODS_FILE" <<'EOF' +{ + "items":[ + {"metadata":{"name":"pod-a"},"spec":{"containers":[{"name":"application"}]}}, + {"metadata":{"name":"pod-b"},"spec":{"containers":[{"name":"application"}]}}, + {"metadata":{"name":"pod-c"},"spec":{"containers":[{"name":"application"}]}} + ] +} +EOF + echo "log of A" > "$POD_LOGS_DIR/pod-a.application.log" + echo "log of C" > "$POD_LOGS_DIR/pod-c.application.log" + # pod-b has no log file + + run bash -c "source '$BATS_TEST_DIRNAME/../../utils/diagnose_utils' && source '$BATS_TEST_DIRNAME/../../logs/application_log_evidence'" + + [ "$status" -eq 0 ] + [ "$(evidence '.evidence.details.pods_with_logs')" = "2" ] + [ "$(evidence '.evidence.details.problematic_pod_count')" = "3" ] + local affected + affected=$(evidence '.evidence.affected | sort | join(",")') + [ "$affected" = "pod-a,pod-c" ] + # Both visible in stdout, pod-b absent + assert_contains "$output" "| log of A" + assert_contains "$output" "| log of C" +} diff --git a/k8s/diagnose/tests/scope/container_crash_detection.bats b/k8s/diagnose/tests/scope/container_crash_detection.bats index c0a17c44..2ee31844 100644 --- a/k8s/diagnose/tests/scope/container_crash_detection.bats +++ b/k8s/diagnose/tests/scope/container_crash_detection.bats @@ -268,3 +268,103 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") 
assert_equal "$result" "failed" } + +# ============================================================================= +# Evidence Schema Tests +# ============================================================================= +@test "scope/container_crash_detection: success evidence follows schema" { + cat > "$PODS_FILE" << 'EOF' +{ + "items": [{ + "metadata": {"name": "healthy-pod"}, + "status": {"containerStatuses": [{"name": "app", "ready": true, "restartCount": 0, "state": {"running": {}}}]} + }] +} +EOF + + source "$BATS_TEST_DIRNAME/../../scope/container_crash_detection" + + severity=$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE") + assert_equal "$severity" "info" + + summary=$(jq -r '.evidence.summary' "$SCRIPT_OUTPUT_FILE") + assert_contains "$summary" "running without crashes" + + affected=$(jq -c '.evidence.affected' "$SCRIPT_OUTPUT_FILE") + assert_equal "$affected" "[]" + + pods_checked=$(jq -r '.evidence.details.pods_checked' "$SCRIPT_OUTPUT_FILE") + assert_equal "$pods_checked" "1" +} + +@test "scope/container_crash_detection: failed evidence includes affected pods and crash details" { + cat > "$PODS_FILE" << 'EOF' +{ + "items": [ + { + "metadata": {"name": "crash-1"}, + "status": {"containerStatuses": [{"name": "app", "restartCount": 5, "state": {"waiting": {"reason": "CrashLoopBackOff"}}, "lastState": {"terminated": {"exitCode": 137, "reason": "OOMKilled"}}}]} + }, + { + "metadata": {"name": "healthy"}, + "status": {"containerStatuses": [{"name": "app", "ready": true, "restartCount": 0, "state": {"running": {}}}]} + } + ] +} +EOF + + source "$BATS_TEST_DIRNAME/../../scope/container_crash_detection" + + severity=$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE") + assert_equal "$severity" "critical" + + affected=$(jq -c '.evidence.affected' "$SCRIPT_OUTPUT_FILE") + assert_contains "$affected" "crash-1" + + oom_count=$(jq -r '.evidence.details.counts.oom_killed' "$SCRIPT_OUTPUT_FILE") + assert_equal "$oom_count" "1" + + crash_pod=$(jq -r 
'.evidence.details.crash_loop_back_off[0].pod' "$SCRIPT_OUTPUT_FILE") + assert_equal "$crash_pod" "crash-1" + + exit_code=$(jq -r '.evidence.details.crash_loop_back_off[0].exit_code' "$SCRIPT_OUTPUT_FILE") + assert_equal "$exit_code" "137" + + exit_meaning=$(jq -r '.evidence.details.crash_loop_back_off[0].exit_code_meaning' "$SCRIPT_OUTPUT_FILE") + assert_contains "$exit_meaning" "OOMKilled" + + # Suggested actions should not be empty + actions_count=$(jq -r '.evidence.suggested_actions | length' "$SCRIPT_OUTPUT_FILE") + [ "$actions_count" -gt 0 ] +} + +@test "scope/container_crash_detection: summary highlights OOM count when present" { + cat > "$PODS_FILE" << 'EOF' +{ + "items": [{ + "metadata": {"name": "oom-pod"}, + "status": {"containerStatuses": [{"name": "app", "restartCount": 1, "state": {"waiting": {"reason": "CrashLoopBackOff"}}, "lastState": {"terminated": {"exitCode": 137, "reason": "OOMKilled"}}}]} + }] +} +EOF + + source "$BATS_TEST_DIRNAME/../../scope/container_crash_detection" + + summary=$(jq -r '.evidence.summary' "$SCRIPT_OUTPUT_FILE") + assert_contains "$summary" "OOMKilled" +} + +@test "scope/container_crash_detection: skipped evidence follows schema with info severity" { + echo '{"items":[]}' > "$PODS_FILE" + + source "$BATS_TEST_DIRNAME/../../scope/container_crash_detection" + + status=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") + assert_equal "$status" "skipped" + + severity=$(jq -r '.evidence.severity' "$SCRIPT_OUTPUT_FILE") + assert_equal "$severity" "info" + + summary=$(jq -r '.evidence.summary' "$SCRIPT_OUTPUT_FILE") + assert_contains "$summary" "skipped" +} diff --git a/k8s/diagnose/tests/scope/container_port_health.bats b/k8s/diagnose/tests/scope/container_port_health.bats index fe60c920..b6605ff2 100644 --- a/k8s/diagnose/tests/scope/container_port_health.bats +++ b/k8s/diagnose/tests/scope/container_port_health.bats @@ -187,7 +187,7 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") assert_equal "$result" "failed" - tested=$(jq -r 
'.evidence.tested' "$SCRIPT_OUTPUT_FILE") + tested=$(jq -r '.evidence.details.containers_tested' "$SCRIPT_OUTPUT_FILE") assert_equal "$tested" "1" } @@ -429,7 +429,7 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") assert_equal "$result" "skipped" - skipped=$(jq -r '.evidence.skipped' "$SCRIPT_OUTPUT_FILE") + skipped=$(jq -r '.evidence.details.containers_skipped' "$SCRIPT_OUTPUT_FILE") assert_equal "$skipped" "1" } @@ -498,7 +498,7 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") assert_equal "$result" "success" - tested=$(jq -r '.evidence.tested' "$SCRIPT_OUTPUT_FILE") + tested=$(jq -r '.evidence.details.containers_tested' "$SCRIPT_OUTPUT_FILE") assert_equal "$tested" "1" unset -f nc timeout diff --git a/k8s/diagnose/tests/scope/health_probe_endpoints.bats b/k8s/diagnose/tests/scope/health_probe_endpoints.bats index 8a53364b..3621480d 100644 --- a/k8s/diagnose/tests/scope/health_probe_endpoints.bats +++ b/k8s/diagnose/tests/scope/health_probe_endpoints.bats @@ -562,7 +562,7 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") assert_equal "$result" "skipped" - skipped=$(jq -r '.evidence.skipped' "$SCRIPT_OUTPUT_FILE") + skipped=$(jq -r '.evidence.details.containers_skipped' "$SCRIPT_OUTPUT_FILE") assert_equal "$skipped" "1" } @@ -675,7 +675,7 @@ EOF result=$(jq -r '.status' "$SCRIPT_OUTPUT_FILE") assert_equal "$result" "success" - tested=$(jq -r '.evidence.tested' "$SCRIPT_OUTPUT_FILE") + tested=$(jq -r '.evidence.details.containers_tested' "$SCRIPT_OUTPUT_FILE") assert_equal "$tested" "1" } diff --git a/k8s/diagnose/tests/scope/pod_readiness.bats b/k8s/diagnose/tests/scope/pod_readiness.bats index 01625e29..c3f459da 100644 --- a/k8s/diagnose/tests/scope/pod_readiness.bats +++ b/k8s/diagnose/tests/scope/pod_readiness.bats @@ -223,8 +223,8 @@ EOF source "$BATS_TEST_DIRNAME/../../scope/pod_readiness" - ready=$(jq -r '.evidence.ready' "$SCRIPT_OUTPUT_FILE") - total=$(jq -r '.evidence.total' "$SCRIPT_OUTPUT_FILE") + ready=$(jq -r 
'.evidence.details.ready' "$SCRIPT_OUTPUT_FILE") + total=$(jq -r '.evidence.details.total' "$SCRIPT_OUTPUT_FILE") assert_equal "$ready" "1" assert_equal "$total" "1" } diff --git a/k8s/diagnose/utils/diagnose_utils b/k8s/diagnose/utils/diagnose_utils index 836bc67e..94bac3b8 100644 --- a/k8s/diagnose/utils/diagnose_utils +++ b/k8s/diagnose/utils/diagnose_utils @@ -1,5 +1,27 @@ #!/bin/bash +# ============================================================================= +# Evidence schema (passed to update_check_result --evidence) +# ============================================================================= +# All checks must emit evidence following this schema so the backend / AI summarizer +# can consume results uniformly: +# +# { +# "summary": "string — one-line human-readable summary of findings", +# "severity": "critical" | "warning" | "info", +# "affected": ["resource-name", ...], +# "details": { /* check-specific structured data */ }, +# "suggested_actions": ["actionable guidance items"] +# } +# +# severity mapping: +# - critical: status=failed with actionable data (e.g. pods OOMKilled) +# - warning: status=warning, or partial issues +# - info: status=success or skipped (no action required) +# +# Helper `evidence_json` below builds this schema from primitives. +# ============================================================================= + # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' @@ -7,6 +29,153 @@ YELLOW='\033[1;33m' CYAN='\033[0;36m' NC='\033[0m' # No Color +# Build a JSON evidence object following the schema documented above. +# Usage: +# evidence_json +# Where: +# - summary: plain string +# - severity: "critical" | "warning" | "info" +# - affected_json: JSON array of resource names, e.g. '["pod-1","pod-2"]' +# - details_json: JSON object, check-specific. Pass '{}' for none. +# - actions_json: JSON array of strings. Pass '[]' for none. 
+evidence_json() { + local summary="$1" + local severity="$2" + local affected="$3" + local details="$4" + local actions="$5" + + # Explicit defaults: bash's ${var:-{}} mis-parses the closing brace, so we + # branch instead of using parameter substitution. + [[ -z "$affected" ]] && affected="[]" + [[ -z "$details" ]] && details="{}" + [[ -z "$actions" ]] && actions="[]" + + jq -n \ + --arg summary "$summary" \ + --arg severity "$severity" \ + --argjson affected "$affected" \ + --argjson details "$details" \ + --argjson actions "$actions" \ + '{ + summary: $summary, + severity: $severity, + affected: $affected, + details: $details, + suggested_actions: $actions + }' +} + +# Translate a Linux container exit code into a human-readable meaning. +# Returns "Unknown" for codes we don't classify. +exit_code_meaning() { + case "$1" in + 0) echo "Clean exit (container finished successfully)" ;; + 1) echo "Application error" ;; + 137) echo "OOMKilled (out of memory)" ;; + 139) echo "SIGSEGV (segmentation fault)" ;; + 143) echo "SIGTERM (graceful termination)" ;; + *) echo "Unknown" ;; + esac +} + +# Read the tail of a pre-collected pod log into a JSON array of lines (one +# line per element). Reads from the pod_logs/ snapshot collected by +# build_context — never makes live kubectl calls. +# +# Usage: +# read_log_tail [lines] +# which: "current" or "previous" +# lines: how many lines to take from the tail (default: $EVIDENCE_LOG_TAIL_LINES, fallback 50) +# +# Returns "[]" if: +# - POD_LOGS_DIR is unset (build_context did not run, or this is a unit test) +# - the log file does not exist (most containers do not have a previous log) +# - the file is empty (container produced no output yet) +# +# Why a snapshot rather than live kubectl: build_context took the snapshot +# point-in-time at fail-time (diagnose runs before rollback). By the time the +# AI summarizer reads evidence, the cluster state has likely moved on, so live +# logs would be misleading or missing. 
+read_log_tail() { + local pod="$1" + local container="$2" + local which="$3" + local lines="${4:-${EVIDENCE_LOG_TAIL_LINES:-50}}" + + [[ -z "$POD_LOGS_DIR" ]] && { echo "[]"; return 0; } + + local log_file + case "$which" in + previous) log_file="$POD_LOGS_DIR/${pod}.${container}.previous.log" ;; + current) log_file="$POD_LOGS_DIR/${pod}.${container}.log" ;; + *) echo "[]"; return 0 ;; + esac + + if [[ ! -s "$log_file" ]]; then + echo "[]" + return 0 + fi + + tail -n "$lines" "$log_file" | lines_to_json_array +} + +# Convert newline-delimited stdin into a JSON array of non-empty strings. +# Used by read_log_tail and update_check_result to share one canonical +# tail-text-to-JSON pipeline. +lines_to_json_array() { + jq -R -s 'split("\n") | map(select(length > 0))' +} + +# Append a JSON object to a bash indexed array (passed by name). Avoids the +# O(N²) jq round-trip of `arr=$(echo "$arr" | jq --argjson f "$x" '. + [$f]')`. +# Convert with `facts_to_json_array ` once at end of accumulation. +# Uses eval so it works on bash 3.2 (macOS dev) — declare -n requires 4.3+. +add_fact() { + local arr_name="$1" + local value="$2" + eval "${arr_name}+=(\"\$value\")" +} + +# Convert a bash indexed array of compact JSON strings into a single JSON +# array. Empty arrays correctly become "[]". +facts_to_json_array() { + local arr_name="$1" + local count + eval "count=\${#${arr_name}[@]}" + if [[ "$count" -eq 0 ]]; then + echo "[]" + else + eval "printf '%s\n' \"\${${arr_name}[@]}\"" | jq -s '.' + fi +} + +# Mark a resource as affected by an issue. Stores names in a bash variable +# (passed by name) as a space-separated set, deduplicating on add. Replaces +# the per-call jq dedup that was duplicated in every check. 
+mark_affected() { + local set_name="$1" + local value="$2" + local current + eval "current=\"\${$set_name}\"" + case " $current " in + *" $value "*) ;; + *) eval "$set_name=\"\${current:+\$current }\$value\"" ;; + esac +} + +# Convert a space-separated set (built by mark_affected) into a JSON array. +set_to_json_array() { + local set_name="$1" + local values + eval "values=\"\${$set_name}\"" + if [[ -z "$values" ]]; then + echo "[]" + else + printf '%s\n' $values | jq -R . | jq -s . + fi +} + print_success() { echo -e "${GREEN}✓${NC} $1" } @@ -37,7 +206,15 @@ require_resources() { if [[ -z "$resource_names" ]]; then print_warning "No ${resource_type} found with labels $label_selector in namespace $namespace, check was skipped." - update_check_result --status "skipped" --evidence "{}" + local skip_evidence + skip_evidence=$(evidence_json \ + "No ${resource_type} found, check skipped" \ + "info" \ + "[]" \ + "$(jq -nc --arg rt "$resource_type" --arg ls "$label_selector" --arg ns "$namespace" \ + '{resource_type: $rt, label_selector: $ls, namespace: $ns}')" \ + "[]") + update_check_result --status "skipped" --evidence "$skip_evidence" return 1 fi @@ -66,7 +243,14 @@ update_check_result() { # Usage: # update_check_result "new-status" '{"new":"evidence"}' # or: - # update_check_result --status "new-status" --evidence '{"new":"evidence"}' + # update_check_result --status "new-status" --evidence '{"new":"evidence"}' [--log-tail-lines N] + # + # --log-tail-lines overrides the default 20-line cap on the captured stdout + # tail. Checks that publish application output (e.g. logs/application_log_evidence) + # need a higher cap to fit the log payload alongside their own diagnostic + # prints. 
+ + local log_tail_lines=20 # Argument parsing if [[ "$1" == --* ]]; then @@ -74,6 +258,7 @@ update_check_result() { case $1 in --status) status="$2"; shift 2 ;; --evidence) evidence="$2"; shift 2 ;; + --log-tail-lines) log_tail_lines="$2"; shift 2 ;; *) echo "Unknown parameter: $1" >&2; return 1 ;; esac done @@ -95,11 +280,10 @@ update_check_result() { return 1 fi - # Check if log file exists and read it into an array + # Read script log tail (non-blank lines, capped at $log_tail_lines) into a JSON array local log_array="[]" if [[ -n "$SCRIPT_LOG_FILE" && -f "$SCRIPT_LOG_FILE" ]]; then - # Read log file, remove empty lines, take last 20 lines, and convert to JSON array - log_array=$(grep -v '^[[:space:]]*$' "$SCRIPT_LOG_FILE" | tail -n 20 | jq -R -s 'split("\n") | map(select(length > 0))') + log_array=$(grep -v '^[[:space:]]*$' "$SCRIPT_LOG_FILE" | tail -n "$log_tail_lines" | lines_to_json_array) if [[ $? -ne 0 ]]; then echo "Error: Failed to read log file: $SCRIPT_LOG_FILE" >&2 return 1 diff --git a/k8s/docs/configurable-http-ports.md b/k8s/docs/configurable-http-ports.md new file mode 100644 index 00000000..a3a0b2f8 --- /dev/null +++ b/k8s/docs/configurable-http-ports.md @@ -0,0 +1,135 @@ +# Configurable HTTP Ports + +The k8s scope supports configuring the port on which the application's main HTTP listener binds, and exposing additional HTTP ports as siblings of the main listener. + +## Capabilities + +### `main_http_port` + +- **Type:** integer +- **Default:** `8080` +- **Range:** 1024 – 65535 +- **Required:** yes (with default — the form pre-fills 8080) + +The port your application binds to inside the container. 
When set, the following are derived from it automatically: + +| Resource | Field | Value | +|---|---|---| +| `Deployment` (application container) | `containerPort` | `main_http_port` | +| `Deployment` (application container) | livenessProbe / readinessProbe / startupProbe port | `main_http_port` | +| `Deployment` (http traffic-manager sidecar) | `UPSTREAM_PORT` env | `main_http_port` | +| `Deployment` (http traffic-manager sidecar) | TCP probe `app_port` | `main_http_port` | +| `Service` | `port` (cluster-public) | `main_http_port` | +| `Ingress` (initial and blue-green) | backend service port | `main_http_port` | +| Istio `Service` and `HTTPRoute` | port | `main_http_port` | + +`Service.targetPort` stays `80` because that is the sidecar's port, not the app's. + +### `additional_ports[].type = "HTTP"` + +`additional_ports` is a list of extra ports the scope exposes alongside the main HTTP listener. Each item has: + +- `port`: integer 1024–65535 +- `type`: `"GRPC"` or `"HTTP"` + +For each additional port (HTTP or GRPC), the deployment generates a traffic-manager sidecar that handles external traffic. The sidecar is **always** in the request path: it adds nginx-level metrics, graceful-shutdown handling, and body-size limits. + +The architecture differs slightly between HTTP and GRPC because of how the application is expected to bind ports: + +### HTTP additional port — same model as `main_http_port` + +The application **binds the additional port directly** (e.g., `app.listen(9090)`), exactly the way it binds `main_http_port`. The sidecar binds a different *internal* port, `port + 10000`, to avoid colliding with the application. K8s `Service` exposes `port` externally and routes to the sidecar's internal port; the sidecar then proxies to the application on `port`. 
+ +For example, with `main_http_port=8081` and `additional_port: {port: 9090, type: HTTP}`: + +``` +External client + │ http://service:9090 + ▼ +K8s Service "d-{scope}-{deploy}-http-9090" port: 9090, targetPort: 19090 + │ + ▼ +Sidecar container "http-9090" listens on 19090 → proxies to localhost:9090 + │ + ▼ +Application container binds 9090 (and also 8081 for the main listener) +``` + +The application sees two real listeners: `8081` (main) and `9090` (additional). External traffic to either flows through its respective sidecar (the main `http` sidecar for `8081`, the `http-9090` sidecar for `9090`). + +**Constraint:** because the sidecar uses `port + 10000`, the additional port must be `≤ 55535` for HTTP. Above that the offset overflows the 65535 max TCP port. + +### GRPC additional port — sidecar terminates protocol + +The application does **NOT** bind GRPC additional ports. The `grpc-{port}` sidecar binds `{port}` directly and translates gRPC into HTTP, proxying to `localhost:main_http_port`. The application speaks only HTTP on `main_http_port` and serves both main HTTP traffic and any incoming gRPC requests (received already translated to HTTP). + +### Summary + +| | HTTP additional port | GRPC additional port | +|---|---|---| +| App binds the port | yes, directly | no (sidecar binds it) | +| Sidecar internal port | `port + 10000` | `port` | +| Service `port` (external) | `port` | `port` | +| Service `targetPort` | `port + 10000` (sidecar) | `port` (sidecar) | +| Sidecar `UPSTREAM_PORT` | `port` (the app's same port) | `main_http_port` (default in image) | +| Protocol translation | none | gRPC → HTTP | +| Max valid `port` | 55535 | 65535 | + +## ALB capacity and listener lifecycle + +### Each additional port opens its own ALB listener + +The Ingress generated for each additional port (HTTP or GRPC) declares `alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":{port}}]'`. 
This means **every additional port translates into a dedicated listener on the shared ALB** (`spec.ports[].port == {scope additional port}`). The main scope ingress keeps its standard `[{"HTTP":80},{"HTTPS":443}]` listener pair. + +Concrete example for an ALB shared by three scopes, each with `main_http_port=8081` plus one HTTP additional port `9090`, `9091`, and `9092` respectively: + +| ALB listener | Source ingress | Backend | +|---|---|---| +| `:80` | All scopes (main) | Main sidecar `http` | +| `:443` | All scopes (main) | Main sidecar `http` | +| `:9090` | scope A `http-9090` ingress | Sidecar `http-9090` of scope A | +| `:9091` | scope B `http-9091` ingress | Sidecar `http-9091` of scope B | +| `:9092` | scope C `http-9092` ingress | Sidecar `http-9092` of scope C | + +The main listeners (80/443) are shared across all scopes via the IngressGroup; one listener serves many ingress rules (one per scope host). Additional ports are NOT shared by default — each port is a separate listener. + +### AWS limit: 50 listeners per ALB + +This is an AWS hard quota. With many scopes using additional ports on the same ALB, the listener count climbs quickly: each scope adds 1 listener per HTTP/GRPC additional port. A pre-flight check in `k8s/deployment/validate_alb_target_group_capacity` rejects deployments when the ALB would exceed `ALB_MAX_LISTENERS` (default `48`, leaves 2 slots of headroom before the AWS limit). The threshold is configurable in `values.yaml` or via the `scope-configurations`/`container-orchestration` provider. 
+ +If a deployment fails with `❌ ALB 'NAME' has reached listener capacity: X/48`, the operator options are: +- Reduce `additional_ports` across the scopes sharing the ALB +- Increase `ALB_MAX_LISTENERS` (only safe up to 49 — at 50 the next deploy will hit the AWS quota itself) +- Request an AWS service-quota increase for listeners per ALB (the limit is technically adjustable, although AWS tends to deny large increases) +- Move some scopes to a separate ALB (the recommended path) + +### Listeners are cleaned up automatically + +Operators do not need to manage ALB listeners by hand. The AWS Load Balancer Controller owns listener lifecycle through the IngressGroup mechanism: + +- When the **first** Ingress with `alb.ingress.kubernetes.io/listen-ports` referencing a given port is created, the controller adds that listener to the shared ALB. +- When the **last** Ingress referencing that port is deleted, the controller removes the listener. +- In between, multiple Ingresses on the same port coexist as different rules on a single listener; the controller never duplicates the listener itself. + +This means deleting a deployment (which deletes its Ingresses) is sufficient to reclaim listener capacity — no manual cleanup of the ALB is required. If a scope is the only consumer of a particular additional port across the ALB, deleting that scope returns the listener to the pool and frees an `ALB_MAX_LISTENERS` slot for the next deployment. + +## Backward Compatibility + +- Existing scopes that do not set `main_http_port` get `8080` automatically via the JSON Schema default and the `// 8080` jq fallback in `build_context`. No migration is required. +- The `traffic-manager` image's `start.sh` defaults `UPSTREAM_PORT` to `8080` when the env is not provided, so an upgraded image with un-upgraded scope templates continues to behave like the old image. +- Adding `HTTP` to the `additional_ports.type` enum is strictly additive — existing entries with `"GRPC"` remain valid. 
+ +## Implementation Map + +- JSON Schema and UI Schema: `k8s/specs/service-spec.json.tpl` +- Build context extraction: `k8s/deployment/build_context` (look for `MAIN_HTTP_PORT`) +- Templates that consume `main_http_port`: `k8s/deployment/templates/{service,deployment,initial-ingress,blue-green-ingress}.yaml.tpl` and `k8s/deployment/templates/istio/*.tpl` +- HTTP additional_ports sidecar: `k8s/deployment/templates/deployment.yaml.tpl` (look for `else if eq .type "HTTP"`) +- traffic-manager image: `nullplatform/k8s-tools/traffic-manager` — `UPSTREAM_PORT` env handled in `start.sh` + +## Tests + +- `k8s/deployment/tests/build_context.bats` covers `main_http_port` extraction with present, absent, and `null` cases, and verifies the `tonumber` cast. +- `k8s/deployment/tests/ingress_template_shape.bats` verifies the per-port HTTPS listener annotation on each ingress branch and pins the absence of `ssl-redirect` on additional-port ingresses. +- `k8s/deployment/tests/verify_ingress_reconciliation.bats` covers the weight-dedupe behavior introduced because a shared ALB listener used to surface multiple matching rules (the multi-rule scenario is no longer reachable now that each additional port has its own listener, but the dedupe is kept defensively). +- `k8s/deployment/tests/validate_alb_target_group_capacity.bats` covers both target-group capacity and the listener-capacity validation (`ALB_MAX_LISTENERS`). diff --git a/k8s/kubectl_get b/k8s/kubectl_get new file mode 100755 index 00000000..8a05f061 --- /dev/null +++ b/k8s/kubectl_get @@ -0,0 +1,166 @@ +#!/bin/bash +# Read-only kubectl wrapper for troubleshooting. +# Hardcodes `kubectl get` so no other verb can be invoked, and rejects flags +# that change auth/server/context or could hang the script. When no namespace +# flag is supplied, defaults to $K8S_NAMESPACE so the agent can target the +# scope's namespace without repeating it. 
+# +# When 'secret' / 'secrets' is in the resource args, output is forced to JSON +# and piped through jq to strip .data and .stringData (the only fields that +# carry secret values). Output formats that can extract those fields directly +# (jsonpath, go-template, custom-columns) are rejected for secret queries. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [[ "$(type -t log 2>/dev/null)" != "function" ]]; then source "$SCRIPT_DIR/logging"; fi + +usage() { + cat >&2 < + +Runs 'kubectl get' with the provided arguments. Read-only: any other verb +or auth/context override is rejected. Secret values are stripped from output. + +Examples: + $(basename "$0") pods + $(basename "$0") pods -l app=foo -o yaml + $(basename "$0") events --sort-by=.lastTimestamp -n kube-system + $(basename "$0") nodes -A + $(basename "$0") secrets # data fields stripped +EOF +} + +if [[ $# -eq 0 ]]; then + usage + exit 1 +fi + +BLOCKED_FLAGS=( + --server + --kubeconfig + --token + --as + --as-group + --as-uid + --certificate-authority + --client-certificate + --client-key + --username + --password + --user + --cluster + --context + --insecure-skip-tls-verify + -w + --watch + --watch-only +) + +is_blocked() { + local flag_name="${1%%=*}" + for blocked in "${BLOCKED_FLAGS[@]}"; do + if [[ "$flag_name" == "$blocked" ]]; then + return 0 + fi + done + return 1 +} + +# True if any arg references the 'secret' / 'secrets' resource. Handles +# comma-separated resources (secret,configmap), slash form (secret/foo) +# and apiVersion form (secrets.v1). Uppercase tolerated since kubectl is +# case-insensitive on resource names. 
+involves_secrets() { + local arg lower token res + for arg in "$@"; do + lower=$(echo "$arg" | tr '[:upper:]' '[:lower:]') + local IFS=, + for token in $lower; do + res="${token%%/*}" + res="${res%%.*}" + if [[ "$res" == "secret" || "$res" == "secrets" ]]; then + return 0 + fi + done + unset IFS + done + return 1 +} + +# Echoes the value of -o / --output if set; empty otherwise. +get_output_format() { + local prev="" arg + for arg in "$@"; do + if [[ "$prev" == "-o" || "$prev" == "--output" ]]; then + printf '%s\n' "$arg" + return + fi + case "$arg" in + -o=*|--output=*) printf '%s\n' "${arg#*=}"; return ;; + esac + prev="$arg" + done +} + +HAS_NAMESPACE_FLAG=false + +for arg in "$@"; do + if is_blocked "$arg"; then + log error "❌ Refusing argument '$arg': flag is blocked to keep this script read-only and safe." + log error " Blocked flags: ${BLOCKED_FLAGS[*]}" + exit 1 + fi + + case "${arg%%=*}" in + -n|--namespace|-A|--all-namespaces) + HAS_NAMESPACE_FLAG=true + ;; + esac +done + +ARGS=("$@") + +if [[ "$HAS_NAMESPACE_FLAG" == "false" ]] && [[ -n "${K8S_NAMESPACE:-}" ]]; then + ARGS+=(-n "$K8S_NAMESPACE") +fi + +if involves_secrets "${ARGS[@]}"; then + USER_OUTPUT="$(get_output_format "${ARGS[@]}")" + + case "$USER_OUTPUT" in + jsonpath*|go-template*|custom-columns*) + log error "❌ Refusing -o '$USER_OUTPUT' on secrets: this format can extract .data directly, bypassing the safety filter." + log error " Use -o yaml, -o json, -o wide, or no -o flag — values are stripped from data/stringData." + exit 1 + ;; + esac + + # Strip user's -o flag (we force JSON to feed jq). 
+ STRIPPED=() + prev="" + for arg in "${ARGS[@]}"; do + if [[ "$prev" == "-o" || "$prev" == "--output" ]]; then + prev="" + continue + fi + case "$arg" in + -o|--output) prev="$arg"; continue ;; + -o=*|--output=*) prev=""; continue ;; + esac + prev="" + STRIPPED+=("$arg") + done + STRIPPED+=(-o json) + + if [[ -n "$USER_OUTPUT" && "$USER_OUTPUT" != "json" ]]; then + log info "â„šī¸ Output forced to JSON (was: $USER_OUTPUT) — secret data fields are stripped for safety." + fi + + log debug "📋 Running: kubectl get ${STRIPPED[*]} | jq " + + kubectl get "${STRIPPED[@]}" | jq 'if .items then .items |= map(del(.data, .stringData)) else del(.data, .stringData) end' + exit "${PIPESTATUS[0]}" +fi + +log debug "📋 Running: kubectl get ${ARGS[*]}" + +kubectl get "${ARGS[@]}" diff --git a/k8s/kubectl_logs b/k8s/kubectl_logs new file mode 100755 index 00000000..80d36268 --- /dev/null +++ b/k8s/kubectl_logs @@ -0,0 +1,88 @@ +#!/bin/bash +# Read-only kubectl logs wrapper for troubleshooting. +# Hardcodes `kubectl logs` so no other verb can be invoked, and rejects +# flags that change auth/server/context or turn the call into a stream +# (--follow / -f), which would hang the script. When no namespace flag +# is supplied, defaults to $K8S_NAMESPACE so the agent can target the +# scope's namespace without repeating it. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [[ "$(type -t log 2>/dev/null)" != "function" ]]; then source "$SCRIPT_DIR/logging"; fi + +usage() { + cat >&2 < + +Runs 'kubectl logs' with the provided arguments. Read-only: any other +verb, auth/context override, or streaming flag (--follow / -f) is +rejected to keep invocations bounded. 
+ +Examples: + $(basename "$0") my-pod + $(basename "$0") my-pod -c my-container --tail 200 + $(basename "$0") my-pod --previous + $(basename "$0") -l app=foo --tail 100 + $(basename "$0") my-pod --since 1h --timestamps +EOF +} + +if [[ $# -eq 0 ]]; then + usage + exit 1 +fi + +BLOCKED_FLAGS=( + --server + --kubeconfig + --token + --as + --as-group + --as-uid + --certificate-authority + --client-certificate + --client-key + --username + --password + --user + --cluster + --context + --insecure-skip-tls-verify + -f + --follow +) + +is_blocked() { + local flag_name="${1%%=*}" + for blocked in "${BLOCKED_FLAGS[@]}"; do + if [[ "$flag_name" == "$blocked" ]]; then + return 0 + fi + done + return 1 +} + +HAS_NAMESPACE_FLAG=false + +for arg in "$@"; do + if is_blocked "$arg"; then + log error "❌ Refusing argument '$arg': flag is blocked to keep this script read-only and non-streaming." + log error " Blocked flags: ${BLOCKED_FLAGS[*]}" + exit 1 + fi + + case "${arg%%=*}" in + -n|--namespace) + HAS_NAMESPACE_FLAG=true + ;; + esac +done + +ARGS=("$@") + +if [[ "$HAS_NAMESPACE_FLAG" == "false" ]] && [[ -n "${K8S_NAMESPACE:-}" ]]; then + ARGS+=(-n "$K8S_NAMESPACE") +fi + +log debug "📋 Running: kubectl logs ${ARGS[*]}" + +kubectl logs "${ARGS[@]}" diff --git a/k8s/scope/networking/dns/external_dns/manage_route b/k8s/scope/networking/dns/external_dns/manage_route index f4fe1045..204f1dab 100644 --- a/k8s/scope/networking/dns/external_dns/manage_route +++ b/k8s/scope/networking/dns/external_dns/manage_route @@ -6,14 +6,44 @@ if [ "$ACTION" = "CREATE" ]; then log debug "🔍 Building DNSEndpoint manifest for ExternalDNS..." 
log debug "📡 Getting IP for gateway: $GATEWAY_NAME" - GATEWAY_IP=$(kubectl get gateway "$GATEWAY_NAME" -n gateways \ - -o jsonpath='{.status.addresses[?(@.type=="IPAddress")].value}' 2>/dev/null) + # Try ALB Ingress first (AWS-specific: gateway-alb-public / gateway-alb-private) + GATEWAY_SUFFIX="${GATEWAY_NAME#gateway-}" + GATEWAY_IP=$(kubectl get ingress "gateway-alb-${GATEWAY_SUFFIX}" -n gateways \ + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null) + RECORD_TYPE="CNAME" + + if [ -n "$GATEWAY_IP" ]; then + log debug "📡 ALB hostname resolved via Ingress: $GATEWAY_IP" + else + log debug "📡 ALB Ingress not found, resolving gateway address directly..." - if [ -z "$GATEWAY_IP" ]; then - log warn "âš ī¸ Gateway IP not found, trying service fallback..." + GATEWAY_IP=$(kubectl get gateway "$GATEWAY_NAME" -n gateways \ + -o jsonpath='{.status.addresses[?(@.type=="IPAddress")].value}' 2>/dev/null) + RECORD_TYPE="A" + + if [ -z "$GATEWAY_IP" ]; then + log warn "âš ī¸ Gateway IP not found, trying hostname..." + + GATEWAY_IP=$(kubectl get gateway "$GATEWAY_NAME" -n gateways \ + -o jsonpath='{.status.addresses[?(@.type=="Hostname")].value}' 2>/dev/null) + RECORD_TYPE="CNAME" + fi - GATEWAY_IP=$(kubectl get service "$GATEWAY_NAME" -n gateways \ - -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null) + if [ -z "$GATEWAY_IP" ]; then + log warn "âš ī¸ Gateway hostname not found, trying service fallback..." + + GATEWAY_IP=$(kubectl get service "$GATEWAY_NAME" -n gateways \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null) + RECORD_TYPE="A" + fi + + if [ -z "$GATEWAY_IP" ]; then + log warn "âš ī¸ Gateway service IP not found, trying service hostname fallback..." 
+ + GATEWAY_IP=$(kubectl get service "$GATEWAY_NAME" -n gateways \ + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null) + RECORD_TYPE="CNAME" + fi fi if [ -z "$GATEWAY_IP" ]; then @@ -21,7 +51,7 @@ if [ "$ACTION" = "CREATE" ]; then exit 0 fi - log info "✅ Gateway IP: $GATEWAY_IP" + log info "✅ Gateway address: $GATEWAY_IP (recordType: $RECORD_TYPE)" DNS_ENDPOINT_TEMPLATE="${DNS_ENDPOINT_TEMPLATE:-$SERVICE_PATH/deployment/templates/dns-endpoint.yaml.tpl}" @@ -29,7 +59,15 @@ if [ "$ACTION" = "CREATE" ]; then DNS_ENDPOINT_FILE="$OUTPUT_DIR/dns-endpoint-$SCOPE_ID.yaml" CONTEXT_PATH="$OUTPUT_DIR/context-$SCOPE_ID-dns.json" - echo "$CONTEXT" | jq --arg gateway_ip "$GATEWAY_IP" '. + {gateway_ip: $gateway_ip}' > "$CONTEXT_PATH" + if [ "${SCOPE_VISIBILITY:-}" = "public" ]; then + DNS_ZONE_TYPE="public" + else + DNS_ZONE_TYPE="private" + fi + + echo "$CONTEXT" | jq --arg gateway_ip "$GATEWAY_IP" --arg record_type "$RECORD_TYPE" \ + --arg dns_zone_type "$DNS_ZONE_TYPE" \ + '. + {gateway_ip: $gateway_ip, record_type: $record_type, dns_zone_type: $dns_zone_type}' > "$CONTEXT_PATH" log debug "📝 Building DNSEndpoint from template: $DNS_ENDPOINT_TEMPLATE" @@ -57,7 +95,10 @@ elif [ "$ACTION" = "DELETE" ]; then log debug "🔍 Deleting DNSEndpoint for external_dns..." 
SCOPE_SLUG=$(echo "$CONTEXT" | jq -r '.scope.slug') - DNS_ENDPOINT_NAME="k-8-s-${SCOPE_SLUG}-${SCOPE_ID}-dns" + APP_SLUG=$(echo "$CONTEXT" | jq -r '.application.slug') + APP_SLUG_SHORT="${APP_SLUG:0:20}"; APP_SLUG_SHORT="${APP_SLUG_SHORT%-}" + SCOPE_SLUG_SHORT="${SCOPE_SLUG:0:20}"; SCOPE_SLUG_SHORT="${SCOPE_SLUG_SHORT%-}" + DNS_ENDPOINT_NAME="k8s-${APP_SLUG_SHORT}-${SCOPE_SLUG_SHORT}-${SCOPE_ID}-dns" log debug "📝 Deleting DNSEndpoint: $DNS_ENDPOINT_NAME in namespace $K8S_NAMESPACE" kubectl delete dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" || { log warn "âš ī¸ DNSEndpoint '$DNS_ENDPOINT_NAME' may already be deleted" diff --git a/k8s/scope/tests/networking/dns/external_dns/manage_route.bats b/k8s/scope/tests/networking/dns/external_dns/manage_route.bats index db1563b4..e3db72b0 100644 --- a/k8s/scope/tests/networking/dns/external_dns/manage_route.bats +++ b/k8s/scope/tests/networking/dns/external_dns/manage_route.bats @@ -17,7 +17,7 @@ setup() { export SCOPE_ID="scope-123" export SCOPE_DOMAIN="myapp.example.com" export K8S_NAMESPACE="test-ns" - export CONTEXT='{"scope":{"slug":"my-app"}}' + export CONTEXT='{"scope":{"slug":"my-scope"},"application":{"slug":"my-app"}}' export OUTPUT_DIR="$(mktemp -d)" # Mock kubectl - default: gateway returns IP @@ -70,7 +70,7 @@ teardown() { [ "$status" -eq 0 ] assert_contains "$output" "🔍 Building DNSEndpoint manifest for ExternalDNS..." assert_contains "$output" "📡 Getting IP for gateway: gw-public" - assert_contains "$output" "✅ Gateway IP: 10.0.0.1" + assert_contains "$output" "✅ Gateway address: 10.0.0.1 (recordType: A)" assert_contains "$output" "📝 Building DNSEndpoint from template:" assert_contains "$output" "✅ DNSEndpoint manifest created:" } @@ -98,8 +98,8 @@ teardown() { run bash "$SCRIPT" [ "$status" -eq 0 ] - assert_contains "$output" "âš ī¸ Gateway IP not found, trying service fallback..." 
- assert_contains "$output" "✅ Gateway IP: 10.0.0.2" + assert_contains "$output" "âš ī¸ Gateway hostname not found, trying service fallback..." + assert_contains "$output" "✅ Gateway address: 10.0.0.2 (recordType: A)" } # ============================================================================= @@ -158,7 +158,7 @@ teardown() { [ "$status" -eq 0 ] assert_contains "$output" "🔍 Deleting DNSEndpoint for external_dns..." - assert_contains "$output" "📝 Deleting DNSEndpoint: k-8-s-my-app-scope-123-dns in namespace test-ns" + assert_contains "$output" "📝 Deleting DNSEndpoint: k8s-my-app-my-scope-scope-123-dns in namespace test-ns" assert_contains "$output" "✅ DNSEndpoint deletion completed" } @@ -180,7 +180,7 @@ teardown() { run bash "$SCRIPT" [ "$status" -eq 0 ] - assert_contains "$output" "📝 Deleting DNSEndpoint: k-8-s-my-app-scope-123-dns in namespace test-ns" - assert_contains "$output" "âš ī¸ DNSEndpoint 'k-8-s-my-app-scope-123-dns' may already be deleted" + assert_contains "$output" "📝 Deleting DNSEndpoint: k8s-my-app-my-scope-scope-123-dns in namespace test-ns" + assert_contains "$output" "âš ī¸ DNSEndpoint 'k8s-my-app-my-scope-scope-123-dns' may already be deleted" assert_contains "$output" "✅ DNSEndpoint deletion completed" } diff --git a/k8s/scope/tests/wait_on_balancer.bats b/k8s/scope/tests/wait_on_balancer.bats index 4d111db8..83d384d4 100644 --- a/k8s/scope/tests/wait_on_balancer.bats +++ b/k8s/scope/tests/wait_on_balancer.bats @@ -22,6 +22,9 @@ setup() { "id": "scope-123", "slug": "my-scope", "domain": "my-scope.example.com" + }, + "application": { + "slug": "my-app" } }' @@ -31,11 +34,11 @@ setup() { } export -f sleep - # Mock kubectl: DNS endpoint found with status by default + # Mock kubectl: DNSEndpoint found with observedGeneration=1 by default kubectl() { case "$*" in - "get dnsendpoint k-8-s-my-scope-scope-123-dns -n default-namespace -o jsonpath={.status}") - echo '{"observedGeneration":1}' + "get dnsendpoint k8s-my-app-my-scope-scope-123-dns 
-n default-namespace -o jsonpath={.status.observedGeneration}") + echo "1" return 0 ;; *) @@ -44,29 +47,10 @@ setup() { esac } export -f kubectl - - # Mock nslookup: resolves on first attempt by default - nslookup() { - case "$1" in - "my-scope.example.com") - if [ "$2" = "8.8.8.8" ]; then - echo "Server: 8.8.8.8" - echo "Address: 8.8.8.8#53" - echo "" - echo "Name: my-scope.example.com" - echo "Address: 10.0.0.1" - return 0 - fi - ;; - esac - return 1 - } - export -f nslookup } teardown() { unset -f kubectl - unset -f nslookup unset -f sleep } @@ -79,11 +63,8 @@ teardown() { [ "$status" -eq 0 ] assert_contains "$output" "🔍 Waiting for balancer/DNS setup to complete..." assert_contains "$output" "📋 Checking ExternalDNS record creation for domain: my-scope.example.com" - assert_contains "$output" "🔍 Checking DNS resolution for my-scope.example.com (attempt 1/" - assert_contains "$output" "📋 Checking DNSEndpoint status: k-8-s-my-scope-scope-123-dns" - assert_contains "$output" "📋 DNSEndpoint status:" - assert_contains "$output" "✅ DNS record for my-scope.example.com is now resolvable" - assert_contains "$output" "✅ Domain my-scope.example.com resolves to:" + assert_contains "$output" "🔍 Checking DNSEndpoint status: k8s-my-app-my-scope-scope-123-dns (attempt 1/" + assert_contains "$output" "✅ DNSEndpoint k8s-my-app-my-scope-scope-123-dns processed by ExternalDNS (observedGeneration=1)" assert_contains "$output" "✨ ExternalDNS setup completed successfully" } @@ -91,28 +72,32 @@ teardown() { # external_dns: Success after retries # ============================================================================= @test "wait_on_balancer: external_dns success after retries" { - local attempt=0 - nslookup() { - attempt=$((attempt + 1)) - if [ "$attempt" -ge 2 ] && [ "$1" = "my-scope.example.com" ] && [ "$2" = "8.8.8.8" ]; then - echo "Server: 8.8.8.8" - echo "Address: 8.8.8.8#53" - echo "" - echo "Name: my-scope.example.com" - echo "Address: 10.0.0.1" - return 0 - fi - return 1 
+ local call_count=0 + kubectl() { + call_count=$((call_count + 1)) + case "$*" in + "get dnsendpoint k8s-my-app-my-scope-scope-123-dns -n default-namespace -o jsonpath={.status.observedGeneration}") + if [ "$call_count" -ge 2 ]; then + echo "1" + return 0 + fi + echo "" + return 0 + ;; + *) + return 0 + ;; + esac } - export -f nslookup + export -f kubectl run bash "$BATS_TEST_DIRNAME/../wait_on_balancer" [ "$status" -eq 0 ] - assert_contains "$output" "🔍 Checking DNS resolution for my-scope.example.com (attempt 1/" - assert_contains "$output" "📋 DNS record not yet available, waiting 10s..." - assert_contains "$output" "🔍 Checking DNS resolution for my-scope.example.com (attempt 2/" - assert_contains "$output" "✅ DNS record for my-scope.example.com is now resolvable" + assert_contains "$output" "🔍 Checking DNSEndpoint status: k8s-my-app-my-scope-scope-123-dns (attempt 1/" + assert_contains "$output" "📋 DNSEndpoint not yet processed, waiting 10s..." + assert_contains "$output" "🔍 Checking DNSEndpoint status: k8s-my-app-my-scope-scope-123-dns (attempt 2/" + assert_contains "$output" "✅ DNSEndpoint k8s-my-app-my-scope-scope-123-dns processed by ExternalDNS (observedGeneration=1)" assert_contains "$output" "✨ ExternalDNS setup completed successfully" } @@ -122,15 +107,16 @@ teardown() { @test "wait_on_balancer: external_dns timeout after MAX_ITERATIONS" { export MAX_ITERATIONS=2 - nslookup() { - return 1 + kubectl() { + echo "" + return 0 } - export -f nslookup + export -f kubectl run bash "$BATS_TEST_DIRNAME/../wait_on_balancer" [ "$status" -eq 1 ] - assert_contains "$output" "❌ DNS record creation timeout after 20s" + assert_contains "$output" "❌ DNSEndpoint processing timeout after 20s" assert_contains "$output" "💡 Possible causes:" assert_contains "$output" "ExternalDNS may still be processing the DNSEndpoint resource" assert_contains "$output" "🔧 How to fix:" @@ -139,35 +125,47 @@ teardown() { } # 
============================================================================= -# external_dns: DNS endpoint not found but keeps trying +# external_dns: DNSEndpoint not found - keeps trying until timeout # ============================================================================= -@test "wait_on_balancer: external_dns DNS endpoint not found but keeps trying until resolved" { +@test "wait_on_balancer: external_dns DNS endpoint not found keeps retrying until timeout" { + export MAX_ITERATIONS=2 + kubectl() { - case "$*" in - "get dnsendpoint k-8-s-my-scope-scope-123-dns -n default-namespace -o jsonpath={.status}") - echo "not found" - return 1 - ;; - esac + return 1 } export -f kubectl run bash "$BATS_TEST_DIRNAME/../wait_on_balancer" - [ "$status" -eq 0 ] - assert_contains "$output" "📋 Checking DNSEndpoint status: k-8-s-my-scope-scope-123-dns" - assert_contains "$output" "✅ DNS record for my-scope.example.com is now resolvable" - assert_contains "$output" "✨ ExternalDNS setup completed successfully" + [ "$status" -eq 1 ] + assert_contains "$output" "🔍 Checking DNSEndpoint status: k8s-my-app-my-scope-scope-123-dns" + assert_contains "$output" "📋 DNSEndpoint not yet processed, waiting 10s..." 
+ assert_contains "$output" "❌ DNSEndpoint processing timeout after 20s" } # ============================================================================= -# external_dns: DNS endpoint found with status +# external_dns: APP_SLUG truncated to 20 chars in endpoint name # ============================================================================= -@test "wait_on_balancer: external_dns DNS endpoint found with status is displayed" { +@test "wait_on_balancer: external_dns truncates APP_SLUG to 20 chars in endpoint name" { + export CONTEXT='{ + "scope": { + "id": "123", + "slug": "qa", + "domain": "qa.example.com" + }, + "application": { + "slug": "very-long-application-name-that-exceeds-limit" + } + }' + kubectl() { case "$*" in - "get dnsendpoint k-8-s-my-scope-scope-123-dns -n default-namespace -o jsonpath={.status}") - echo '{"observedGeneration":2}' + "get dnsendpoint k8s-very-long-applicatio-qa-123-dns -n default-namespace -o jsonpath={.status.observedGeneration}") + echo "1" + return 0 + ;; + *) + echo "" return 0 ;; esac @@ -177,7 +175,8 @@ teardown() { run bash "$BATS_TEST_DIRNAME/../wait_on_balancer" [ "$status" -eq 0 ] - assert_contains "$output" '📋 DNSEndpoint status: {"observedGeneration":2}' + assert_contains "$output" "k8s-very-long-applicatio-qa-123-dns" + assert_contains "$output" "✨ ExternalDNS setup completed successfully" } # ============================================================================= diff --git a/k8s/scope/wait_on_balancer b/k8s/scope/wait_on_balancer index 972f4c02..bde5cfec 100644 --- a/k8s/scope/wait_on_balancer +++ b/k8s/scope/wait_on_balancer @@ -11,6 +11,10 @@ case "$DNS_TYPE" in SCOPE_DOMAIN=$(echo "$CONTEXT" | jq -r '.scope.domain') SCOPE_SLUG=$(echo "$CONTEXT" | jq -r '.scope.slug') SCOPE_ID=$(echo "$CONTEXT" | jq -r '.scope.id') + APP_SLUG=$(echo "$CONTEXT" | jq -r '.application.slug') + APP_SLUG_SHORT="${APP_SLUG:0:20}"; APP_SLUG_SHORT="${APP_SLUG_SHORT%-}" + SCOPE_SLUG_SHORT="${SCOPE_SLUG:0:20}"; 
SCOPE_SLUG_SHORT="${SCOPE_SLUG_SHORT%-}" + DNS_ENDPOINT_NAME="k8s-${APP_SLUG_SHORT}-${SCOPE_SLUG_SHORT}-${SCOPE_ID}-dns" log debug "📋 Checking ExternalDNS record creation for domain: $SCOPE_DOMAIN" @@ -18,7 +22,7 @@ case "$DNS_TYPE" in iteration=$((iteration + 1)) if [ $iteration -gt $MAX_ITERATIONS ]; then log error "" - log error " ❌ DNS record creation timeout after $((MAX_ITERATIONS * 10))s" + log error " ❌ DNSEndpoint processing timeout after $((MAX_ITERATIONS * 10))s" log error "" log error "💡 Possible causes:" log error " ExternalDNS may still be processing the DNSEndpoint resource" @@ -30,27 +34,17 @@ case "$DNS_TYPE" in exit 1 fi - log debug "🔍 Checking DNS resolution for $SCOPE_DOMAIN (attempt $iteration/$MAX_ITERATIONS)" + log debug "🔍 Checking DNSEndpoint status: $DNS_ENDPOINT_NAME (attempt $iteration/$MAX_ITERATIONS)" - DNS_ENDPOINT_NAME="k-8-s-${SCOPE_SLUG}-${SCOPE_ID}-dns" - log debug "📋 Checking DNSEndpoint status: $DNS_ENDPOINT_NAME" - - DNS_STATUS=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" -o jsonpath='{.status}' 2>/dev/null || echo "not found") - - if [ "$DNS_STATUS" != "not found" ] && [ -n "$DNS_STATUS" ]; then - log debug "📋 DNSEndpoint status: $DNS_STATUS" - fi - - if nslookup "$SCOPE_DOMAIN" 8.8.8.8 >/dev/null 2>&1; then - log info " ✅ DNS record for $SCOPE_DOMAIN is now resolvable" - - RESOLVED_IP=$(nslookup "$SCOPE_DOMAIN" 8.8.8.8 | grep -A1 "Name:" | tail -1 | awk '{print $2}' 2>/dev/null || echo "unknown") - log info " ✅ Domain $SCOPE_DOMAIN resolves to: $RESOLVED_IP" + OBSERVED_GEN=$(kubectl get dnsendpoint "$DNS_ENDPOINT_NAME" -n "$K8S_NAMESPACE" \ + -o jsonpath='{.status.observedGeneration}' 2>/dev/null || echo "") + if [ "${OBSERVED_GEN:-0}" -ge 1 ] 2>/dev/null; then + log info " ✅ DNSEndpoint $DNS_ENDPOINT_NAME processed by ExternalDNS (observedGeneration=$OBSERVED_GEN)" break fi - log debug "📋 DNS record not yet available, waiting 10s..." + log debug "📋 DNSEndpoint not yet processed, waiting 10s..." 
sleep 10 done diff --git a/k8s/scope/workflows/diagnose.yaml b/k8s/scope/workflows/diagnose.yaml index 66223726..45d837c3 100644 --- a/k8s/scope/workflows/diagnose.yaml +++ b/k8s/scope/workflows/diagnose.yaml @@ -34,4 +34,5 @@ steps: folders: - "$SERVICE_PATH/diagnose/service" - "$SERVICE_PATH/diagnose/scope" - - "$SERVICE_PATH/diagnose/networking" \ No newline at end of file + - "$SERVICE_PATH/diagnose/networking" + - "$SERVICE_PATH/diagnose/logs" \ No newline at end of file diff --git a/k8s/specs/service-spec.json.tpl b/k8s/specs/service-spec.json.tpl index f331df10..f2cd6507 100644 --- a/k8s/specs/service-spec.json.tpl +++ b/k8s/specs/service-spec.json.tpl @@ -5,14 +5,17 @@ "type":"object", "required":[ "ram_memory", + "ram_memory_limit", "visibility", "autoscaling", "health_check", "scaling_type", "cpu_millicores", + "cpu_millicores_limit", "fixed_instances", "scheduled_stop", "additional_ports", + "main_http_port", "protocol", "continuous_delivery" ], @@ -43,12 +46,22 @@ "elements":[ { "type":"Category", - "label":"Processor", + "label":"Resources", "elements":[ { "type":"Control", "label":"CPU Millicores", "scope":"#/properties/cpu_millicores" + }, + { + "type":"Control", + "label":"CPU Millicores Limit", + "scope":"#/properties/cpu_millicores_limit" + }, + { + "type":"Control", + "label":"RAM Memory Limit", + "scope":"#/properties/ram_memory_limit" } ] }, @@ -125,8 +138,13 @@ }, { "type":"Category", - "label":"Additional Ports", + "label":"Exposed Ports", "elements":[ + { + "type":"Control", + "label":"Main HTTP Port", + "scope":"#/properties/main_http_port" + }, { "type":"Control", "scope":"#/properties/additional_ports", @@ -350,6 +368,27 @@ "default":128, "description":"Amount of RAM memory to allocate to the container (in MB)" }, + "ram_memory_limit":{ + "type":["integer","null"], + "oneOf":[ + {"const":null, "title":"Same as request"}, + {"const":64, "title":"64 MB"}, + {"const":128, "title":"128 MB"}, + {"const":256, "title":"256 MB"}, + {"const":512, 
"title":"512 MB"}, + {"const":1024, "title":"1 GB"}, + {"const":2048, "title":"2 GB"}, + {"const":4096, "title":"4 GB"}, + {"const":8192, "title":"8 GB"}, + {"const":16384, "title":"16 GB"} + ], + "title":"RAM Memory Limit", + "default":null, + "minimum":{ + "$data":"1/ram_memory" + }, + "description":"Maximum memory the container can use (in MB). Pick 'Same as request' to leave it equal to the request value." + }, "visibility":{ "type":"string", "oneOf":[ @@ -484,6 +523,24 @@ "minimum":100, "description":"Amount of CPU to allocate (in millicores, 1000m = 1 CPU core)" }, + "cpu_millicores_limit":{ + "type":["integer","null"], + "oneOf":[ + {"const":null, "title":"Same as request"}, + {"const":100, "title":"100 m"}, + {"const":250, "title":"250 m"}, + {"const":500, "title":"500 m"}, + {"const":1000, "title":"1000 m"}, + {"const":2000, "title":"2000 m"}, + {"const":4000, "title":"4000 m"} + ], + "title":"CPU Millicores Limit", + "default":null, + "minimum":{ + "$data":"1/cpu_millicores" + }, + "description":"Maximum CPU the container can use (in millicores). Pick 'Same as request' to leave it equal to the request value." + }, "scheduled_stop":{ "type":"object", "title":"Scheduled Stop", @@ -537,6 +594,14 @@ "minimum":1, "description":"Fixed number of instances to run" }, + "main_http_port":{ + "type":"integer", + "title":"Main HTTP Port", + "default":8080, + "minimum":1024, + "maximum":65535, + "description":"Port where your application's main HTTP listener binds. Default 8080." 
+ }, "additional_ports":{ "type":"array", "items":{ @@ -555,11 +620,12 @@ }, "type":{ "enum":[ - "GRPC" + "GRPC", + "HTTP" ], "type":"string", "title":"Port Type", - "default": "GRPC", + "default": "HTTP", "description":"The protocol type for this port" } } diff --git a/k8s/values.yaml b/k8s/values.yaml index 020b6059..97c68ce6 100644 --- a/k8s/values.yaml +++ b/k8s/values.yaml @@ -11,12 +11,18 @@ configuration: DNS_TYPE: route53 # Available values route53 | azure | external_dns ALB_RECONCILIATION_ENABLED: false ALB_MAX_CAPACITY: 75 + # 100 is the max target groups for ALB. Keeps 2 free for emergencies + # Ref: https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-limits.html ALB_MAX_TARGET_GROUPS: 98 + # 50 is the max listeners for ALB. Keeps 2 free for emergencies + # Ref: https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-limits.html + ALB_MAX_LISTENERS: 48 ALB_METRICS_PUBLISH_ENABLED: false # ALB_METRICS_PUBLISH_TARGET: cloudwatch # Available values: cloudwatch | datadog DEPLOYMENT_MAX_WAIT_IN_SECONDS: 600 DEPLOYMENT_TEMPLATE: "$SERVICE_PATH/deployment/templates/deployment.yaml.tpl" SECRET_TEMPLATE: "$SERVICE_PATH/deployment/templates/secret.yaml.tpl" + SECRET_FILES_TEMPLATE: "$SERVICE_PATH/deployment/templates/secret-files.yaml.tpl" SCALING_TEMPLATE: "$SERVICE_PATH/deployment/templates/scaling.yaml.tpl" SERVICE_TEMPLATE: "$SERVICE_PATH/deployment/templates/service.yaml.tpl" PDB_TEMPLATE: "$SERVICE_PATH/deployment/templates/pdb.yaml.tpl" diff --git a/scheduled_task/deployment/build_deployment b/scheduled_task/deployment/build_deployment index a39f925e..c64f5780 100644 --- a/scheduled_task/deployment/build_deployment +++ b/scheduled_task/deployment/build_deployment @@ -2,6 +2,7 @@ DEPLOYMENT_PATH="$OUTPUT_DIR/deployment-$SCOPE_ID-$DEPLOYMENT_ID.yaml" SECRET_PATH="$OUTPUT_DIR/secret-$SCOPE_ID-$DEPLOYMENT_ID.yaml" +SECRET_FILES_PATH="$OUTPUT_DIR/secret-files-$SCOPE_ID-$DEPLOYMENT_ID.yaml" 
CONTEXT_PATH="$OUTPUT_DIR/context-$SCOPE_ID.json" echo "$CONTEXT" | jq --arg replicas "$REPLICAS" '. + {replicas: $replicas}' > "$CONTEXT_PATH" @@ -32,4 +33,17 @@ if [[ $TEMPLATE_GENERATION_STATUS -ne 0 ]]; then exit 1 fi +echo "Building Template: $SECRET_FILES_TEMPLATE to $SECRET_FILES_PATH" + +gomplate -c .="$CONTEXT_PATH" \ + --file "$SECRET_FILES_TEMPLATE" \ + --out "$SECRET_FILES_PATH" + +TEMPLATE_GENERATION_STATUS=$? + +if [[ $TEMPLATE_GENERATION_STATUS -ne 0 ]]; then + echo "Error building secret-files template" + exit 1 +fi + rm "$CONTEXT_PATH" diff --git a/scheduled_task/deployment/templates/deployment.yaml.tpl b/scheduled_task/deployment/templates/deployment.yaml.tpl index b5c677d6..a1d9f1f1 100644 --- a/scheduled_task/deployment/templates/deployment.yaml.tpl +++ b/scheduled_task/deployment/templates/deployment.yaml.tpl @@ -139,6 +139,16 @@ spec: envFrom: - secretRef: name: s-{{ .scope.id }}-d-{{ .deployment.id }} + {{- if .parameters.results }} + env: + {{- range .parameters.results }} + {{- if and (eq .type "file") (gt (len .values) 0) }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "app-data-%s" $key }} + value: {{ .destination_path | quote }} + {{- end }} + {{- end }} + {{- end }} image: {{ .asset.url }} resources: limits: @@ -153,9 +163,10 @@ spec: {{- range .parameters.results }} {{- if and (eq .type "file") }} {{- if gt (len .values) 0 }} - - name: {{ printf "file-%s" (filepath.Base .destination_path | strings.ReplaceAll "." 
"-") }} - mountPath: {{ .destination_path }} - subPath: {{ filepath.Base .destination_path }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "file-%s" $key }} + mountPath: {{ .destination_path | quote }} + subPath: {{ filepath.Base .destination_path | quote }} readOnly: true {{- end }} {{- end }} @@ -166,12 +177,13 @@ spec: {{- range .parameters.results }} {{- if and (eq .type "file") }} {{- if gt (len .values) 0 }} - - name: {{ printf "file-%s" (filepath.Base .destination_path | strings.ReplaceAll "." "-") }} + {{- $key := .name | strings.ToLower | regexp.Replace "[^a-z0-9]+" "-" | strings.Trim "-" }} + - name: {{ printf "file-%s" $key }} secret: - secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }} + secretName: s-{{ $.scope.id }}-d-{{ $.deployment.id }}-files items: - - key: {{ printf "app-data-%s" (filepath.Base .destination_path) }} - path: {{ filepath.Base .destination_path }} + - key: {{ printf "app-file-%s" $key }} + path: {{ filepath.Base .destination_path | quote }} {{- end }} {{- end }} {{- end }} diff --git a/scheduled_task/deployment/tests/build_deployment.bats b/scheduled_task/deployment/tests/build_deployment.bats new file mode 100644 index 00000000..aa29dd2b --- /dev/null +++ b/scheduled_task/deployment/tests/build_deployment.bats @@ -0,0 +1,166 @@ +#!/usr/bin/env bats +# ============================================================================= +# Tests for scheduled_task/deployment/build_deployment. +# +# Mirrors k8s/deployment/tests/build_deployment.bats with a scheduled_task +# context (CronJob instead of Deployment). The same file-parameter regressions +# apply because scheduled_task reuses the k8s secret templates and ships its +# own deployment template that follows the same two-Secret + sanitized-name +# pattern. +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + export OUTPUT_DIR="$(mktemp -d)" + export SCOPE_ID="scope-123" + export DEPLOYMENT_ID="deploy-456" + export REPLICAS="1" + + # scheduled_task reuses the k8s secret templates and ships its own + # deployment template under scheduled_task/deployment/templates/. + export DEPLOYMENT_TEMPLATE="$PROJECT_ROOT/scheduled_task/deployment/templates/deployment.yaml.tpl" + export SECRET_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/secret.yaml.tpl" + export SECRET_FILES_TEMPLATE="$PROJECT_ROOT/k8s/deployment/templates/secret-files.yaml.tpl" + + export CONTEXT='{}' + + # Mock gomplate for orchestration tests (any test that doesn't `unset -f`). + gomplate() { + local out_file="" + while [[ $# -gt 0 ]]; do + case $1 in + --out) out_file="$2"; shift 2 ;; + *) shift ;; + esac + done + echo "apiVersion: v1" > "$out_file" + return 0 + } + export -f gomplate +} + +teardown() { + rm -rf "$OUTPUT_DIR" + unset -f gomplate +} + +# ============================================================================= +# File creation — confirms the script renders deployment + both Secrets +# ============================================================================= +@test "build_deployment: creates deployment file with correct name" { + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + assert_file_exists "$OUTPUT_DIR/deployment-scope-123-deploy-456.yaml" +} + +@test "build_deployment: creates secret file with correct name" { + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + assert_file_exists "$OUTPUT_DIR/secret-scope-123-deploy-456.yaml" +} + +@test "build_deployment: creates secret-files file with correct name" { + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + assert_file_exists "$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" +} + +# ============================================================================= +# Rendering tests — real gomplate, 
assert on rendered output +# ============================================================================= +# Minimal context that satisfies the scheduled_task deployment template plus +# the shared k8s secret + secret-files templates. Includes a file param with +# (a) a display name that needs sanitizing and (b) a destination_path with a +# leading `[` to lock in YAML quoting at every insertion point. +_render_context() { + cat <<'JSON' +{ + "account": {"id": "acc1", "slug": "acct"}, + "namespace": {"id": "ns1", "slug": "nsps"}, + "application": {"id": "app1", "slug": "appslug"}, + "release": {"semver": "1.0.0"}, + "scope": { + "id": "scope-123", + "slug": "scopeslug", + "domain": "x.example.com", + "dimensions": {"env": "dev"}, + "capabilities": { + "cpu_millicores": 100, + "ram_memory": 128, + "cron": "*/5 * * * *", + "concurrency_policy": "Forbid", + "history_limit": {"successful": 3, "failed": 1}, + "retries": 0 + } + }, + "deployment": {"id": "deploy-456"}, + "k8s_namespace": "ns-test", + "k8s_modifiers": {}, + "asset": {"url": "example.com/app:latest"}, + "component": "app", + "service_account_name": "", + "pull_secrets": {"ENABLED": false, "SECRETS": []}, + "parameters": { + "results": [ + {"type": "environment", "variable": "MY_VAR", "values": [{"value": "hello"}]}, + {"type": "file", "name": "API P12 Cert!", "destination_path": "/app-data/[2026-05-27] cert.p12", "values": [{"value": "data:application/x-pkcs12;base64,QUFBQkJC"}]} + ] + } +} +JSON +} + +@test "build_deployment: file-type parameter splits binary into a separate Secret" { + unset -f gomplate # use the real gomplate binary + + export CONTEXT="$(_render_context)" + + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + + local secret_file="$OUTPUT_DIR/secret-scope-123-deploy-456.yaml" + local secret_files_file="$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" + local deploy_file="$OUTPUT_DIR/deployment-scope-123-deploy-456.yaml" + + assert_file_exists "$secret_file" + 
assert_file_exists "$secret_files_file" + assert_file_exists "$deploy_file" + + # The envFrom Secret must not carry any file-related keys, otherwise the + # binary content would be injected as an env var and runc would reject it. + ! grep -E 'app-(data|file)-' "$secret_file" + + # The files Secret holds only the binary content under a sanitized key. + assert_contains "$(cat "$secret_files_file")" "name: s-scope-123-d-deploy-456-files" + assert_contains "$(cat "$secret_files_file")" "app-file-api-p12-cert: QUFBQkJC" + ! grep -E 'app-data-' "$secret_files_file" + + # The CronJob's application container gets a plain `env:` entry whose value + # is the destination path, plus a volume mount reading from the files Secret. + assert_contains "$(cat "$deploy_file")" "- name: app-data-api-p12-cert" + # Leading `[` in the path makes YAML parse the value as a flow sequence + # unless quoted — the four insertion points below all require quoting. + assert_contains "$(cat "$deploy_file")" 'value: "/app-data/[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'mountPath: "/app-data/[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'subPath: "[2026-05-27] cert.p12"' + assert_contains "$(cat "$deploy_file")" 'path: "[2026-05-27] cert.p12"' + + assert_contains "$(cat "$deploy_file")" "secretName: s-scope-123-d-deploy-456-files" + assert_contains "$(cat "$deploy_file")" "key: app-file-api-p12-cert" +} + +@test "build_deployment: secret-files renders empty when no file params" { + unset -f gomplate + + export CONTEXT="$(_render_context | jq '.parameters.results |= map(select(.type != "file"))')" + + run bash "$BATS_TEST_DIRNAME/../build_deployment" + [ "$status" -eq 0 ] + + # gomplate skips writing the output when the template renders empty; + # apply_templates handles missing/empty files gracefully. + local secret_files_file="$OUTPUT_DIR/secret-files-scope-123-deploy-456.yaml" + [ ! -f "$secret_files_file" ] || [ ! 
-s "$secret_files_file" ] +} diff --git a/scheduled_task/deployment/tests/workflow_overrides.bats b/scheduled_task/deployment/tests/workflow_overrides.bats new file mode 100644 index 00000000..591d00db --- /dev/null +++ b/scheduled_task/deployment/tests/workflow_overrides.bats @@ -0,0 +1,36 @@ +#!/usr/bin/env bats +# ============================================================================= +# Tests that verify scheduled_task deployment workflows override base k8s steps +# that do not apply to this scope type (e.g. ALB target group capacity). +# +# Contract: +# - Overlay must mark the step with `action: skip`. +# - Base workflow must still declare the step under the same name, otherwise +# a rename upstream would silently re-enable the step here. +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + export OVERLAY="$PROJECT_ROOT/scheduled_task/deployment/workflows/initial.yaml" + export BASE="$PROJECT_ROOT/k8s/deployment/workflows/initial.yaml" +} + +# ============================================================================= +# validate alb target group capacity +# ============================================================================= +@test "base k8s deployment initial workflow declares 'validate alb target group capacity' step" { + run grep -A 2 "name: validate alb target group capacity" "$BASE" + + assert_equal "$status" "0" + assert_contains "$output" "type: script" + assert_contains "$output" "validate_alb_target_group_capacity" +} + +@test "scheduled_task deployment initial overlay skips 'validate alb target group capacity'" { + run grep -A 1 "name: validate alb target group capacity" "$OVERLAY" + + assert_equal "$status" "0" + assert_contains "$output" "action: skip" +} diff --git a/scheduled_task/deployment/workflows/initial.yaml b/scheduled_task/deployment/workflows/initial.yaml index 
72535d20..91a87bde 100644 --- a/scheduled_task/deployment/workflows/initial.yaml +++ b/scheduled_task/deployment/workflows/initial.yaml @@ -1,6 +1,8 @@ include: - "$SERVICE_PATH/values.yaml" steps: + - name: validate alb target group capacity + action: skip - name: route traffic action: skip - name: create deployment diff --git a/scheduled_task/scope/tests/workflow_overrides.bats b/scheduled_task/scope/tests/workflow_overrides.bats new file mode 100644 index 00000000..b4dfa3c7 --- /dev/null +++ b/scheduled_task/scope/tests/workflow_overrides.bats @@ -0,0 +1,54 @@ +#!/usr/bin/env bats +# ============================================================================= +# Tests that verify scheduled_task scope workflows override base k8s steps +# that do not apply to this scope type (e.g. ALB capacity validation). +# +# Contract: +# - Overlay must mark the step with `action: skip`. +# - Base workflow must still declare the step under the same name, otherwise +# the skip is a no-op and the base step would not run anyway (or worse, a +# rename upstream would silently re-enable the step here). +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + export OVERLAY="$PROJECT_ROOT/scheduled_task/scope/workflows/create.yaml" + export BASE="$PROJECT_ROOT/k8s/scope/workflows/create.yaml" +} + +# ============================================================================= +# validate alb capacity +# ============================================================================= +@test "base k8s scope create workflow declares 'validate alb capacity' step" { + run grep -A 2 "name: validate alb capacity" "$BASE" + + assert_equal "$status" "0" + assert_contains "$output" "type: script" + assert_contains "$output" "validate_alb_capacity" +} + +@test "scheduled_task scope create overlay skips 'validate alb capacity'" { + run grep -A 1 "name: validate alb capacity" "$OVERLAY" + + assert_equal "$status" "0" + assert_contains "$output" "action: skip" +} + +# ============================================================================= +# networking (scheduled_task has no public traffic, so the whole block is skipped) +# ============================================================================= +@test "base k8s scope create workflow declares 'networking' step" { + run grep -A 1 "name: networking" "$BASE" + + assert_equal "$status" "0" + assert_contains "$output" "type: workflow" +} + +@test "scheduled_task scope create overlay skips 'networking'" { + run grep -A 1 "name: networking" "$OVERLAY" + + assert_equal "$status" "0" + assert_contains "$output" "action: skip" +} diff --git a/scheduled_task/scope/workflows/create.yaml b/scheduled_task/scope/workflows/create.yaml index 47156c7d..26b7820f 100644 --- a/scheduled_task/scope/workflows/create.yaml +++ b/scheduled_task/scope/workflows/create.yaml @@ -2,4 +2,6 @@ include: - "$SERVICE_PATH/values.yaml" steps: - name: networking + action: skip + - name: validate alb capacity action: skip \ No newline at end of file diff --git a/service/deployment/entrypoint b/service/deployment/entrypoint index 7d7f9d97..9a1ee9b1 
100755 --- a/service/deployment/entrypoint +++ b/service/deployment/entrypoint @@ -37,8 +37,19 @@ esac WORKFLOW_PATH="$SERVICE_PATH/deployment/workflows/$ACTION_TO_EXECUTE.yaml" +NEEDS_PARAMS=true +case "$SERVICE_ACTION" in + "switch-traffic"|"kill-instances"|"diagnose-deployment") + NEEDS_PARAMS=false + ;; +esac + CMD="np service workflow exec --workflow $WORKFLOW_PATH --build-context --include-secrets" +if [ "$NEEDS_PARAMS" = "false" ] && np service workflow exec --help 2>&1 | grep -q "\-\-no-params"; then + CMD="$CMD --no-params" +fi + IFS=',' read -ra OVERRIDE_PATHS <<< "$OVERRIDES_PATH" for path in "${OVERRIDE_PATHS[@]}"; do # Trim whitespace diff --git a/service/deployment/tests/entrypoint.bats b/service/deployment/tests/entrypoint.bats new file mode 100644 index 00000000..fac10a8f --- /dev/null +++ b/service/deployment/tests/entrypoint.bats @@ -0,0 +1,185 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for service/deployment/entrypoint - --no-params flag behavior +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." 
&& pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + export SCOPE_ID="scope-123" + export DEPLOYMENT_ID="deploy-456" + export SERVICE_PATH="/tmp/test-service-path" + export OVERRIDES_PATH="" + + mkdir -p "$SERVICE_PATH/deployment/workflows" + touch "$SERVICE_PATH/deployment/workflows/initial.yaml" + touch "$SERVICE_PATH/deployment/workflows/blue_green.yaml" + touch "$SERVICE_PATH/deployment/workflows/switch_traffic.yaml" + touch "$SERVICE_PATH/deployment/workflows/rollback.yaml" + touch "$SERVICE_PATH/deployment/workflows/finalize.yaml" + touch "$SERVICE_PATH/deployment/workflows/delete.yaml" + touch "$SERVICE_PATH/deployment/workflows/diagnose.yaml" + touch "$SERVICE_PATH/deployment/workflows/kill_instances.yaml" + + export NP_EXECUTED_CMD="" + export NP_HELP_SUPPORTS_NO_PARAMS="true" +} + +teardown() { + rm -rf "$SERVICE_PATH" + unset -f np +} + +mock_np() { + np() { + if [[ "$*" == *"--help"* ]]; then + if [ "$NP_HELP_SUPPORTS_NO_PARAMS" = "true" ]; then + echo " --no-params Skip parameter fetching" + fi + return 0 + fi + export NP_EXECUTED_CMD="np $*" + return 0 + } + export -f np +} + +# ============================================================================= +# Actions that SHOULD include --no-params (when CLI supports it) +# ============================================================================= + +@test "deployment entrypoint: switch-traffic includes --no-params when CLI supports it" { + mock_np + export SERVICE_ACTION="switch-traffic" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "deployment entrypoint: kill-instances includes --no-params when CLI supports it" { + mock_np + export SERVICE_ACTION="kill-instances" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "deployment entrypoint: diagnose-deployment includes --no-params when CLI supports it" { + mock_np + export 
SERVICE_ACTION="diagnose-deployment" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +# ============================================================================= +# Actions that SHOULD NOT include --no-params +# ============================================================================= + +@test "deployment entrypoint: start-initial does NOT include --no-params" { + mock_np + export SERVICE_ACTION="start-initial" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +@test "deployment entrypoint: start-blue-green does NOT include --no-params" { + mock_np + export SERVICE_ACTION="start-blue-green" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +@test "deployment entrypoint: rollback-deployment does NOT include --no-params" { + mock_np + export SERVICE_ACTION="rollback-deployment" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +@test "deployment entrypoint: finalize-blue-green does NOT include --no-params" { + mock_np + export SERVICE_ACTION="finalize-blue-green" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +@test "deployment entrypoint: delete-deployment does NOT include --no-params" { + mock_np + export SERVICE_ACTION="delete-deployment" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +# ============================================================================= +# Backward compatibility - old CLI without --no-params support +# ============================================================================= + +@test "deployment entrypoint: switch-traffic omits --no-params when CLI does not support it" { + export NP_HELP_SUPPORTS_NO_PARAMS="false" + mock_np + export 
SERVICE_ACTION="switch-traffic" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +@test "deployment entrypoint: kill-instances omits --no-params when CLI does not support it" { + export NP_HELP_SUPPORTS_NO_PARAMS="false" + mock_np + export SERVICE_ACTION="kill-instances" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +# ============================================================================= +# Edge cases +# ============================================================================= + +@test "deployment entrypoint: unknown action fails" { + mock_np + export SERVICE_ACTION="unknown-action" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 1 ] + assert_contains "$output" "Unknown action" +} + +@test "deployment entrypoint: --build-context and --include-secrets always present" { + mock_np + export SERVICE_ACTION="switch-traffic" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--build-context" + assert_contains "$output" "--include-secrets" +} diff --git a/service/scope/entrypoint b/service/scope/entrypoint index 721b1188..12b215a1 100755 --- a/service/scope/entrypoint +++ b/service/scope/entrypoint @@ -47,6 +47,10 @@ fi CMD="np service workflow exec --workflow $WORKFLOW_PATH --build-context --include-secrets" +if np service workflow exec --help 2>&1 | grep -q "\-\-no-params"; then + CMD="$CMD --no-params" +fi + IFS=',' read -ra OVERRIDE_PATHS <<< "$OVERRIDES_PATH" for path in "${OVERRIDE_PATHS[@]}"; do # Trim whitespace diff --git a/service/scope/tests/entrypoint.bats b/service/scope/tests/entrypoint.bats new file mode 100644 index 00000000..afcc5148 --- /dev/null +++ b/service/scope/tests/entrypoint.bats @@ -0,0 +1,135 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for 
service/scope/entrypoint - --no-params flag behavior +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + + export SCOPE_ID="scope-123" + export SERVICE_PATH="/tmp/test-service-path" + export OVERRIDES_PATH="" + + mkdir -p "$SERVICE_PATH/scope/workflows" + touch "$SERVICE_PATH/scope/workflows/create.yaml" + touch "$SERVICE_PATH/scope/workflows/update.yaml" + touch "$SERVICE_PATH/scope/workflows/delete.yaml" + touch "$SERVICE_PATH/scope/workflows/diagnose.yaml" + touch "$SERVICE_PATH/scope/workflows/restart-pods.yaml" + touch "$SERVICE_PATH/scope/workflows/pause-autoscaling.yaml" + touch "$SERVICE_PATH/scope/workflows/resume-autoscaling.yaml" + touch "$SERVICE_PATH/scope/workflows/set-desired-instance-count.yaml" + + export NP_HELP_SUPPORTS_NO_PARAMS="true" +} + +teardown() { + rm -rf "$SERVICE_PATH" + unset -f np +} + +mock_np() { + np() { + if [[ "$*" == *"--help"* ]]; then + if [ "$NP_HELP_SUPPORTS_NO_PARAMS" = "true" ]; then + echo " --no-params Skip parameter fetching" + fi + return 0 + fi + export NP_EXECUTED_CMD="np $*" + return 0 + } + export -f np +} + +# ============================================================================= +# All scope actions SHOULD include --no-params (when CLI supports it) +# ============================================================================= + +@test "scope entrypoint: create includes --no-params" { + mock_np + export SERVICE_ACTION="create-scope" + export SERVICE_ACTION_TYPE="create" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "scope entrypoint: update includes --no-params" { + mock_np + export SERVICE_ACTION="update-scope" + export SERVICE_ACTION_TYPE="update" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + 
+@test "scope entrypoint: delete includes --no-params" { + mock_np + export SERVICE_ACTION="delete-scope" + export SERVICE_ACTION_TYPE="custom" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "scope entrypoint: diagnose includes --no-params" { + mock_np + export SERVICE_ACTION="diagnose-scope" + export SERVICE_ACTION_TYPE="custom" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +@test "scope entrypoint: restart-pods includes --no-params" { + mock_np + export SERVICE_ACTION="restart-pods" + export SERVICE_ACTION_TYPE="custom" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--no-params" +} + +# ============================================================================= +# Backward compatibility - old CLI without --no-params support +# ============================================================================= + +@test "scope entrypoint: omits --no-params when CLI does not support it" { + export NP_HELP_SUPPORTS_NO_PARAMS="false" + mock_np + export SERVICE_ACTION="create-scope" + export SERVICE_ACTION_TYPE="create" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + [[ "$output" != *"--no-params"* ]] +} + +# ============================================================================= +# Core flags always present +# ============================================================================= + +@test "scope entrypoint: --build-context and --include-secrets always present" { + mock_np + export SERVICE_ACTION="create-scope" + export SERVICE_ACTION_TYPE="create" + + run bash "$BATS_TEST_DIRNAME/../entrypoint" + + [ "$status" -eq 0 ] + assert_contains "$output" "--build-context" + assert_contains "$output" "--include-secrets" +}