diff --git a/codebundles/k8s-applog-health/.runwhen/generation-rules/k8s-applog-health.yaml b/codebundles/k8s-applog-health/.runwhen/generation-rules/k8s-applog-health.yaml new file mode 100644 index 000000000..250026612 --- /dev/null +++ b/codebundles/k8s-applog-health/.runwhen/generation-rules/k8s-applog-health.yaml @@ -0,0 +1,21 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + generationRules: + - resourceTypes: + - deployment + matchRules: + - type: pattern + pattern: ".+" + properties: [name] + mode: substring + slxs: + - baseName: applog-health + levelOfDetail: detailed + qualifiers: ["resource", "namespace", "cluster"] + baseTemplateName: k8s-applog-health + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: k8s-applog-health-taskset.yaml diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml new file mode 100755 index 000000000..34fa0d3f5 --- /dev/null +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml @@ -0,0 +1,65 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} + runwhen.com/sli: "true" +spec: + displayUnitsLong: OK + displayUnitsShort: ok + locations: + - {{ default_location }} + codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-applog-health/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 600 + description: Measures the health of the application logs for the {{match_resource.resource.metadata.name}} {{match_resource.kind | lower}}. 
+ configProvided: + - name: NAMESPACE + value: {{match_resource.resource.metadata.namespace}} + - name: CONTEXT + value: {{context}} + - name: KUBERNETES_DISTRIBUTION_BINARY + value: {{custom.kubernetes_distribution_binary | default("kubectl")}} + - name: WORKLOAD_NAME + value: {{match_resource.resource.metadata.name}} + - name: WORKLOAD_TYPE + value: {{match_resource.kind | lower}} + - name: CONTAINER_RESTART_AGE + value: "10m" + - name: CONTAINER_RESTART_THRESHOLD + value: "2" + - name: EVENT_AGE + value: "10m" + - name: EVENT_THRESHOLD + value: "2" + - name: CHECK_SERVICE_ENDPOINTS + value: "true" + - name: MAX_LOG_LINES + value: "1000" + - name: MAX_LOG_BYTES + value: "2097152" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{custom.kubeconfig_secret_name}} + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m \ No newline at end of file diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml new file mode 100644 index 000000000..a9ca8968c --- /dev/null +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml @@ -0,0 +1,25 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/deploy.svg + alias: {{match_resource.resource.metadata.name}} {{match_resource.kind}} Application Log Health + asMeasuredBy: The presence of application-level errors/issues/stacktraces in the application logs indicating runtime errors or exceptions in {{match_resource.resource.metadata.name}}. 
+ configProvided: + - name: OBJECT_NAME + value: {{match_resource.resource.metadata.name}} + owners: + - {{workspace.owner_email}} + statement: Application logs for {{match_resource.resource.metadata.name}} {{match_resource.kind | lower}} should be free of critical errors/issues/stacktraces indicating runtime errors or exceptions. + additionalContext: + {% include "kubernetes-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + tags: + {% include "kubernetes-tags.yaml" ignore missing %} + - name: access + value: read-only \ No newline at end of file diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml new file mode 100644 index 000000000..e67d520e4 --- /dev/null +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml @@ -0,0 +1,50 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-applog-health/runbook.robot + configProvided: + - name: NAMESPACE + value: {{match_resource.resource.metadata.namespace}} + - name: CONTEXT + value: {{context}} + - name: KUBERNETES_DISTRIBUTION_BINARY + value: {{custom.kubernetes_distribution_binary}} + - name: WORKLOAD_NAME + value: {{match_resource.resource.metadata.name}} + - name: WORKLOAD_TYPE + value: {{match_resource.kind | lower}} + - name: CONTAINER_RESTART_AGE + value: "30m" + - name: CONTAINER_RESTART_THRESHOLD + value: "4" + - name: LOG_AGE + value: "10m" + - name: LOG_SIZE + value: "2097152" + - name: LOG_LINES + 
value: "1000" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{custom.kubeconfig_secret_name}} + {% endif %} \ No newline at end of file diff --git a/codebundles/k8s-applog-health/README.md b/codebundles/k8s-applog-health/README.md new file mode 100644 index 000000000..5bbe68a13 --- /dev/null +++ b/codebundles/k8s-applog-health/README.md @@ -0,0 +1,56 @@ +# Kubernetes Application Log Health + +This codebundle provides tasks for triaging application log health of Kubernetes workloads (deployments, statefulsets, or daemonsets). It fetches pod logs, scans for error patterns, and reports issues with severity and next steps. + +## Tasks + +**Runbook** +- `Analyze Application Log Patterns for ${WORKLOAD_TYPE} ${WORKLOAD_NAME} in Namespace ${NAMESPACE}` — Fetches workload logs, scans for configurable error/exception patterns, creates issues for matches above the severity threshold, and reports a log health score and summary. +- `Fetch Workload Logs for ${WORKLOAD_TYPE} ${WORKLOAD_NAME} in Namespace ${NAMESPACE}` — Fetches and attaches workload logs to the report for manual review (no issue creation). + +**SLI** +- `Get Critical Log Errors and Score for ${WORKLOAD_TYPE} ${WORKLOAD_NAME}` — Fetches logs and scores health based on critical error patterns (e.g. GenericError, AppFailure) and container restarts; pushes a metric for SLI scoring. +- `Generate Application Health Score for ${WORKLOAD_TYPE} ${WORKLOAD_NAME}` — Computes the final applog health score and report details (e.g. scaled-to-zero vs healthy vs issues). + +### Log pattern categories + +Analysis uses pattern categories (configurable via `runbook_patterns.json` or `sli_critical_patterns.json`). 
Examples: + +- **GenericError** — exception, fatal, panic, crash, failed, failure (severity 1) +- **AppFailure** — application failed, service unavailable, connection refused, timeout, OOM, disk full, auth failures (severity 1) +- **StackTrace** — stack trace, exception in thread, java.lang., traceback, panic (severity 1) +- **Connection** — connection reset/timeout, network unreachable, socket error, DNS resolution failed (severity 2) +- **Timeout** — request/operation timeout, deadline exceeded, read/write timeout (severity 2) +- **Auth** — unauthorized, authentication error, invalid credentials, forbidden, token expired (severity 2) +- **Exceptions** — NullPointerException, IllegalArgumentException, SQLException, IOException, etc. (severity 2) +- **Resource** — resource exhausted, memory leak, CPU throttled, quota/rate limit exceeded (severity 2) +- **HealthyRecovery** — recovered from error, connection restored, retry successful (severity 4, informational) + +Exclude patterns (e.g. INFO/DEBUG/TRACE, health checks, heartbeats) reduce false positives. + +## Configuration + +The TaskSet/SLI requires initialization with secrets and user variables. Key variables: + +- `kubeconfig` — Secret containing cluster access (kubeconfig YAML). +- `KUBERNETES_DISTRIBUTION_BINARY` — CLI binary for Kubernetes (`kubectl` or `oc`). Default: `kubectl`. +- `CONTEXT` — Kubernetes context to use. +- `NAMESPACE` — Namespace of the workload. Leave blank to search all namespaces. +- `WORKLOAD_NAME` — Name of the deployment, statefulset, or daemonset to analyze. +- `WORKLOAD_TYPE` — Type of workload: `deployment`, `statefulset`, or `daemonset`. Default: `deployment`. +- `LOG_AGE` — Age of logs to fetch (e.g. `10m`). Default: `10m`. +- `LOG_LINES` / `LOG_SIZE` — Max lines or bytes per container for runbook log fetch. Defaults: 1000 lines, 2MB. +- `LOG_SEVERITY_THRESHOLD` — Minimum severity to create issues (1=critical … 5=info). Default: 3. 
+- `LOG_PATTERN_CATEGORIES` — Comma-separated categories to scan (e.g. `GenericError,AppFailure,Connection`). Default includes GenericError, AppFailure, Connection, Timeout, Auth, Exceptions, Resource, HealthyRecovery. +- `LOGS_EXCLUDE_PATTERN` — Regex to exclude lines from analysis (e.g. INFO/DEBUG, health checks). +- `EXCLUDED_CONTAINER_NAMES` — Comma-separated container names to skip (e.g. `linkerd-proxy,istio-proxy`). Default: `linkerd-proxy,istio-proxy,vault-agent`. +- `CONTAINER_RESTART_AGE` / `CONTAINER_RESTART_THRESHOLD` — Time window and threshold for container restarts (SLI). Defaults: e.g. `10m`, `1`. +- `LOG_SCAN_TIMEOUT` — Timeout in seconds for log scanning. Default: 300. + +## Requirements + +- A kubeconfig with RBAC permissions to list pods and read logs for the target workload and namespace. + +## TODO + +- [ ] Add additional documentation. diff --git a/codebundles/k8s-applog-health/runbook.robot b/codebundles/k8s-applog-health/runbook.robot new file mode 100755 index 000000000..dfd340c09 --- /dev/null +++ b/codebundles/k8s-applog-health/runbook.robot @@ -0,0 +1,476 @@ +*** Settings *** +Documentation Triages issues related to a deployment and its replicas. +Metadata Author akshayrw25 +Metadata Display Name Kubernetes AppLog Analysis +Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift + +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform +Library RW.NextSteps +Library RW.K8sHelper +Library RW.K8sLog +Library RW.LogAnalysis.ExtractTraceback + +Library OperatingSystem +Library String +Library Collections +Library DateTime + +Suite Setup Suite Initialization + + +*** Keywords *** +Suite Initialization + ${kubeconfig}= RW.Core.Import Secret + ... kubeconfig + ... type=string + ... description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). + ... pattern=\w* + ... 
example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ + ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY + ... type=string + ... description=Which binary to use for Kubernetes CLI commands. + ... pattern=\w* + ... enum=[kubectl,oc] + ... example=kubectl + ... default=kubectl + ${CONTEXT}= RW.Core.Import User Variable CONTEXT + ... type=string + ... description=Which Kubernetes context to operate within. + ... pattern=\w* + ... example=my-main-cluster + ${NAMESPACE}= RW.Core.Import User Variable NAMESPACE + ... type=string + ... description=The name of the Kubernetes namespace to scope actions and searching to. + ... pattern=\w* + ... example=otel-demo + ${WORKLOAD_NAME}= RW.Core.Import User Variable WORKLOAD_NAME + ... type=string + ... description=The name of the workload (deployment, statefulset, or daemonset) to analyze for application logs. + ... pattern=\w* + ... example=otel-demo-frontend + ${WORKLOAD_TYPE}= RW.Core.Import User Variable WORKLOAD_TYPE + ... type=string + ... description=The type of Kubernetes workload to analyze. + ... pattern=\w* + ... enum=[deployment,statefulset,daemonset] + ... example=deployment + ... default=deployment + ${LOG_LINES}= RW.Core.Import User Variable LOG_LINES + ... type=string + ... description=The number of log lines to fetch from the pods when inspecting logs. + ... pattern=\d+ + ... example=100 + ... default=1000 + ${LOG_AGE}= RW.Core.Import User Variable LOG_AGE + ... type=string + ... description=The age of logs to fetch from pods, used for log analysis tasks. + ... pattern=\w* + ... example=10m + ... default=10m + ${LOG_SIZE}= RW.Core.Import User Variable LOG_SIZE + ... type=string + ... description=The maximum size of logs in bytes to fetch from pods, used for log analysis tasks. Defaults to 2MB. + ... pattern=\d* + ... example=1024 + ... 
default=2097152 + + ${LOG_ANALYSIS_DEPTH}= RW.Core.Import User Variable LOG_ANALYSIS_DEPTH + ... type=string + ... description=The depth of log analysis to perform - basic, standard, or comprehensive. + ... pattern=\w* + ... enum=[basic,standard,comprehensive] + ... example=standard + ... default=standard + ${LOG_SEVERITY_THRESHOLD}= RW.Core.Import User Variable LOG_SEVERITY_THRESHOLD + ... type=string + ... description=The minimum severity level for creating issues (1=critical, 2=high, 3=medium, 4=low, 5=info). + ... pattern=\d+ + ... example=3 + ... default=3 + ${LOG_PATTERN_CATEGORIES_STR}= RW.Core.Import User Variable LOG_PATTERN_CATEGORIES + ... type=string + ... description=Comma-separated list of log pattern categories to scan for. + ... pattern=.* + ... example=GenericError,AppFailure,Connection + ... default=GenericError,AppFailure,Connection,Timeout,Auth,Exceptions,Resource,HealthyRecovery + ${ANOMALY_THRESHOLD}= RW.Core.Import User Variable ANOMALY_THRESHOLD + ... type=string + ... description=The threshold for detecting event anomalies based on events per minute. + ... pattern=\d+ + ... example=5 + ... default=5 + ${LOGS_ERROR_PATTERN}= RW.Core.Import User Variable LOGS_ERROR_PATTERN + ... type=string + ... description=The error pattern to use when grep-ing logs. + ... pattern=\w* + ... example=(Error: 13|Error: 14) + ... default=error|ERROR + ${LOGS_EXCLUDE_PATTERN}= RW.Core.Import User Variable LOGS_EXCLUDE_PATTERN + ... type=string + ... description=Pattern used to exclude entries from log analysis when searching for errors. Use regex patterns to filter out false positives like JSON structures. + ... pattern=.* + ... example="errors":\\s*\\[\\]|"warnings":\\s*\\[\\] + ... default="errors":\\\\s*\\\\[\\\\]|\\\\bINFO\\\\b|\\\\bDEBUG\\\\b|\\\\bTRACE\\\\b|\\\\bSTART\\\\s*-\\\\s*|\\\\bSTART\\\\s*method\\\\b + ${LOG_SCAN_TIMEOUT}= RW.Core.Import User Variable LOG_SCAN_TIMEOUT + ... type=string + ... 
description=Timeout in seconds for log scanning operations. Increase this value if log scanning times out on large log files. + ... pattern=\d+ + ... example=300 + ... default=300 + ${EXCLUDED_CONTAINER_NAMES}= RW.Core.Import User Variable EXCLUDED_CONTAINER_NAMES + ... type=string + ... description=Comma-separated list of container names to exclude from log analysis (e.g., linkerd-proxy, istio-proxy, vault-agent). + ... pattern=.* + ... example=linkerd-proxy,istio-proxy,vault-agent + ... default=linkerd-proxy,istio-proxy,vault-agent + + ${CONTAINER_RESTART_AGE}= RW.Core.Import User Variable CONTAINER_RESTART_AGE + ... type=string + ... description=The time window (in (h) hours or (m) minutes) to search for container restarts. Only containers that restarted within this time period will be reported. + ... pattern=\w* + ... example=10m + ... default=10m + ${CONTAINER_RESTART_THRESHOLD}= RW.Core.Import User Variable CONTAINER_RESTART_THRESHOLD + ... type=string + ... description=The minimum number of restarts required to trigger an issue. Containers with restart counts below this threshold will be ignored. + ... pattern=\d+ + ... example=1 + ... 
default=1 + # Convert comma-separated strings to lists + @{LOG_PATTERN_CATEGORIES}= Split String ${LOG_PATTERN_CATEGORIES_STR} , + @{EXCLUDED_CONTAINERS_RAW}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + @{EXCLUDED_CONTAINERS}= Create List + FOR ${container} IN @{EXCLUDED_CONTAINERS_RAW} + ${trimmed_container}= Strip String ${container} + Append To List ${EXCLUDED_CONTAINERS} ${trimmed_container} + END + + Set Suite Variable ${kubeconfig} ${kubeconfig} + Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} + Set Suite Variable ${CONTEXT} + Set Suite Variable ${NAMESPACE} + Set Suite Variable ${WORKLOAD_NAME} + Set Suite Variable ${WORKLOAD_TYPE} + Set Suite Variable ${LOG_LINES} + Set Suite Variable ${LOG_AGE} + Set Suite Variable ${LOG_SIZE} + + Set Suite Variable ${LOG_ANALYSIS_DEPTH} + Set Suite Variable ${LOG_SEVERITY_THRESHOLD} + Set Suite Variable ${LOG_PATTERN_CATEGORIES_STR} + Set Suite Variable @{LOG_PATTERN_CATEGORIES} + Set Suite Variable ${ANOMALY_THRESHOLD} + Set Suite Variable ${LOGS_ERROR_PATTERN} + Set Suite Variable ${LOGS_EXCLUDE_PATTERN} + Set Suite Variable ${LOG_SCAN_TIMEOUT} + Set Suite Variable ${EXCLUDED_CONTAINER_NAMES} + Set Suite Variable @{EXCLUDED_CONTAINERS} + + Set Suite Variable ${CONTAINER_RESTART_AGE} + Set Suite Variable ${CONTAINER_RESTART_THRESHOLD} + # Construct environment dictionary safely to handle special characters in regex patterns + &{env_dict}= Create Dictionary + ... KUBECONFIG=${kubeconfig.key} + ... KUBERNETES_DISTRIBUTION_BINARY=${KUBERNETES_DISTRIBUTION_BINARY} + ... CONTEXT=${CONTEXT} + ... NAMESPACE=${NAMESPACE} + ... LOGS_ERROR_PATTERN=${LOGS_ERROR_PATTERN} + ... LOGS_EXCLUDE_PATTERN=${LOGS_EXCLUDE_PATTERN} + ... ANOMALY_THRESHOLD=${ANOMALY_THRESHOLD} + ... WORKLOAD_NAME=${WORKLOAD_NAME} + ... WORKLOAD_TYPE=${WORKLOAD_TYPE} + ... CONTAINER_RESTART_AGE=${CONTAINER_RESTART_AGE} + ... CONTAINER_RESTART_THRESHOLD=${CONTAINER_RESTART_THRESHOLD} + ... 
LOG_SCAN_TIMEOUT=${LOG_SCAN_TIMEOUT} + Set Suite Variable ${env} ${env_dict} + + # Check if workload is scaled to 0 and handle appropriately + # Different workload types have different field structures + + IF '${WORKLOAD_TYPE}' == 'daemonset' + # DaemonSets don't scale to 0 in the traditional sense, so skip scale-down logic for them + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is a DaemonSet - proceeding with log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + ELSE + IF '${WORKLOAD_TYPE}' == 'statefulset' + # StatefulSet: use current/updated replicas in addition to spec/ready + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), current_replicas: (.status.currentReplicas // 0), updated_replicas: (.status.updatedReplicas // 0)}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=30 + ELSE + # For deployments + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... 
timeout_seconds=30 + END + + TRY + ${scale_status}= Evaluate json.loads(r'''${scale_check.stdout}''') if r'''${scale_check.stdout}'''.strip() else {} json + ${spec_replicas}= Evaluate $scale_status.get('spec_replicas', 1) + + # Try to determine when deployment was scaled down by checking recent events and replica set history + ${scale_down_info}= Get Deployment Scale Down Timestamp ${spec_replicas} + + IF ${spec_replicas} == 0 + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is scaled to 0 replicas - returning special health score + Log Scale down detected at: ${scale_down_info} + + # For scaled-down workloads, return a score of 1.0 to indicate "intentionally down" vs "broken" + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${True} + Set Suite Variable ${SCALED_DOWN_INFO} ${scale_down_info} + ELSE + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} has ${spec_replicas} desired replicas - proceeding with log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + END + + EXCEPT + Log Warning: Failed to check workload scale, continuing with normal log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + END + END + + +*** Tasks *** + + +Analyze Application Log Patterns for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` + [Documentation] Fetches and analyzes logs from the deployment pods for errors, connection issues, and other patterns that indicate application health problems. Note: Warning messages about missing log files for excluded containers (like linkerd-proxy, istio-proxy) are expected and harmless. + [Tags] + ... logs + ... application + ... errors + ... patterns + ... health + ... ${WORKLOAD_TYPE} + ... access:read-only + # Skip pod-related checks if deployment is scaled to 0 + IF not ${SKIP_HEALTH_CHECKS} + # Temporarily suppress log warnings for excluded containers (they're expected) + TRY + ${log_dir}= RW.K8sLog.Fetch Workload Logs + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} + ... namespace=${NAMESPACE} + ... 
context=${CONTEXT} + ... kubeconfig=${kubeconfig} + ... log_age=${LOG_AGE} + ... excluded_containers=${EXCLUDED_CONTAINERS} + EXCEPT AS ${log_error} + # If log fetching fails completely, log the error but continue + Log Warning: Log fetching encountered an error: ${log_error} + + # TODO: remove this after testing + RW.Core.Add Pre To Report **Log Fetching Error:** ${log_error} + # Set empty log directory to continue with other checks + ${log_dir}= Set Variable ${EMPTY} + END + + # Only scan logs if we have a valid log directory + IF '''${log_dir}''' != '''${EMPTY}''' + ${scan_results}= RW.K8sLog.Scan Logs For Issues + ... log_dir=${log_dir} + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} + ... namespace=${NAMESPACE} + ... categories=@{LOG_PATTERN_CATEGORIES} + ... custom_patterns_file=runbook_patterns.json + ... excluded_containers=${EXCLUDED_CONTAINERS} + ELSE + # Create empty scan results if no logs were fetched + ${scan_results}= Evaluate {"issues": [], "summary": ["No logs available for analysis"]} + END + + # Post-process results to filter out patterns matching LOGS_EXCLUDE_PATTERN + TRY + IF $LOGS_EXCLUDE_PATTERN != "" + ${filtered_issues}= Evaluate [issue for issue in $scan_results.get('issues', []) if not __import__('re').search('${LOGS_EXCLUDE_PATTERN}', issue.get('details', ''), __import__('re').IGNORECASE)] modules=re + ${filtered_results}= Evaluate {**$scan_results, 'issues': $filtered_issues} + Set Test Variable ${scan_results} ${filtered_results} + END + EXCEPT + Log Warning: Failed to apply LOGS_EXCLUDE_PATTERN filter, using unfiltered results + END + + ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} + + # Process each issue found in the logs + ${issues}= Evaluate $scan_results.get('issues', []) + FOR ${issue} IN @{issues} + ${severity}= Evaluate $issue.get('severity', ${LOG_SEVERITY_THRESHOLD}) + IF ${severity} <= ${LOG_SEVERITY_THRESHOLD} + # Convert issue details to string to avoid 
serialization issues + ${issue_details_raw}= Evaluate $issue.get("details", "") + ${issue_details_str}= Convert To String ${issue_details_raw} + ${summarized_details}= RW.K8sLog.Summarize Log Issues issue_details=${issue_details_str} + + # Safely extract title and next_steps as strings + ${issue_title_raw}= Evaluate $issue.get('title', 'Log pattern issue detected') + ${issue_title}= Convert To String ${issue_title_raw} + ${next_steps_raw}= Evaluate $issue.get('next_steps', 'Review application logs and resolve underlying issues') + ${next_steps}= Convert To String ${next_steps_raw} + + # Use timestamp from log scan results if available, otherwise extract from details + ${issue_timestamp}= Evaluate $issue.get('observed_at', '') + + RW.Core.Add Issue + ... severity=${severity} + ... expected=Application logs should be free of critical errors for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in namespace `${NAMESPACE}` + ... actual=${issue_title} in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in namespace `${NAMESPACE}` + ... title=${issue_title} in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` + ... reproduce_hint=Check application logs for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in namespace `${NAMESPACE}` + ... details=${summarized_details} + ... next_steps=${next_steps} + ... observed_at=${issue_timestamp} + ... 
next_action=analyzeApplog + END + END + + ${issues_count}= Get Length ${issues} + + # Convert scan_results to string to avoid serialization issues, then format for display + ${scan_results_str}= Evaluate json.dumps($scan_results, indent=2) json + ${formatted_results}= RW.K8sLog.Format Scan Results For Display scan_results=${scan_results_str} + + RW.Core.Add Pre To Report **Log Analysis Summary for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` (Last ${LOG_LINES} lines, ${LOG_AGE} age) **\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} + + RW.K8sLog.Cleanup Temp Files + END + +Fetch Workload Logs for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` + [Documentation] Fetches and displays workload logs in the report for manual review. Note: Issues are not created by this task - see "Analyze Application Log Patterns" for automated issue detection. + [Tags] + ... logs + ... collection + ... ${WORKLOAD_TYPE} + ... troubleshooting + ... access:read-only + # Skip pod-related checks if deployment is scaled to 0 + IF not ${SKIP_HEALTH_CHECKS} + # Fetch raw logs + ${workload_logs}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} logs ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} --tail=${LOG_LINES} --since=${LOG_AGE} + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... show_in_rwl_cheatsheet=true + ... render_in_commandlist=true + + IF ${workload_logs.returncode} == 0 + # Filter logs to remove repetitive health check messages and focus on meaningful content + ${filtered_logs}= RW.CLI.Run Cli + ... 
cmd=echo "${workload_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /|POST /health|probe|liveness|readiness)" | grep -E "(error|ERROR|warn|WARN|exception|Exception|fail|FAIL|fatal|FATAL|panic|stack|trace|timeout|connection.*refused|unable.*connect|authentication.*failed|denied|forbidden|unauthorized|500|502|503|504)" | tail -50 || echo "No significant errors or warnings found in recent logs" + ... env=${env} + ... include_in_history=false + + # Also get a sample of non-health-check logs for context + ${context_logs}= RW.CLI.Run Cli + ... cmd=echo "${workload_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /|POST /health|probe|liveness|readiness)" | head -20 | tail -10 + ... env=${env} + ... include_in_history=false + + ${history}= RW.CLI.Pop Shell History + + # Determine if logs are mostly health checks + ${total_lines}= RW.CLI.Run Cli + ... cmd=echo "${workload_logs.stdout}" | wc -l + ... env=${env} + ... include_in_history=false + + ${health_check_lines}= RW.CLI.Run Cli + ... cmd=echo "${workload_logs.stdout}" | grep -E "(Checking.*Health|Health.*Check|healthcheck|/health)" | wc -l + ... env=${env} + ... 
include_in_history=false + + # Handle empty output from wc -l by providing default values + ${total_lines_clean}= Set Variable If "${total_lines.stdout.strip()}" == "" 0 ${total_lines.stdout.strip()} + ${health_check_lines_clean}= Set Variable If "${health_check_lines.stdout.strip()}" == "" 0 ${health_check_lines.stdout.strip()} + + ${total_count}= Convert To Integer ${total_lines_clean} + ${health_count}= Convert To Integer ${health_check_lines_clean} + + # Create consolidated logs report + IF ${health_count} > ${total_count} * 0.8 + ${log_content}= Set Variable If "${context_logs.stdout.strip()}" != "" **🔍 Filtered Error/Warning Logs:**\n${filtered_logs.stdout}\n\n**📝 Sample Application Logs (Non-Health Check):**\n${context_logs.stdout} **🔍 Filtered Error/Warning Logs:**\n${filtered_logs.stdout} + RW.Core.Add Pre To Report **📋 Raw Workload Logs for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}`** (Last ${LOG_LINES} lines, ${LOG_AGE} age)\n**Total Log Lines:** ${total_count} | **Health Check Lines:** ${health_count}\n**ℹ️ Logs are mostly health check messages (${health_count}/${total_count} lines)**\n\n${log_content}\n\n**Commands Used:** ${history}\n\n**Note:** Automated issue detection is performed by the "Analyze Application Log Patterns" task. + ELSE + RW.Core.Add Pre To Report **📋 Raw Workload Logs for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}`** (Last ${LOG_LINES} lines, ${LOG_AGE} age)\n**Total Log Lines:** ${total_count} | **Health Check Lines:** ${health_count}\n\n**📝 Recent Application Logs:**\n${workload_logs.stdout}\n\n**Commands Used:** ${history}\n\n**Note:** Automated issue detection is performed by the "Analyze Application Log Patterns" task. 
+ END + ELSE + # Only add to report if fetch failed, don't create issue + ${history}= RW.CLI.Pop Shell History + RW.Core.Add Pre To Report **📋 Raw Logs for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}`**\n\n⚠️ Unable to fetch workload logs (exit code ${workload_logs.returncode}).\n\n**STDERR:** ${workload_logs.stderr}\n\n**Commands Used:** ${history} + END + END + + +Analyze Workload Stacktraces for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` + [Documentation] Collects and analyzes stacktraces/tracebacks from all pods in the workload for troubleshooting application issues. + [Tags] + ... logs + ... stacktraces + ... tracebacks + ... ${WORKLOAD_TYPE} + ... troubleshooting + ... errors + ... access:read-only + # Skip pod-related checks if workload is scaled to 0 + IF not ${SKIP_HEALTH_CHECKS} + # Convert comma-separated string to list for excluded containers + @{EXCLUDED_CONTAINERS}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + + # Fetch logs using RW.K8sLog library (same pattern as deployment healthcheck) + ${log_dir}= RW.K8sLog.Fetch Workload Logs + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} + ... namespace=${NAMESPACE} + ... context=${CONTEXT} + ... kubeconfig=${kubeconfig} + ... log_age=${LOG_AGE} + ... max_log_lines=${LOG_LINES} + ... max_log_bytes=${LOG_SIZE} + ... excluded_containers=${EXCLUDED_CONTAINERS} + + # Extract stacktraces from the log directory using the traceback library + ${tracebacks}= RW.LogAnalysis.ExtractTraceback.Extract Tracebacks + ... 
logs_dir=${log_dir} + + # Check total number of tracebacks extracted + ${total_tracebacks}= Get Length ${tracebacks} + + IF ${total_tracebacks} == 0 + # No tracebacks found + RW.Core.Add Pre To Report **📋 No Stacktraces Found for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`**\n**Log Analysis Period:** ${LOG_AGE}\n**Max Log Lines:** ${LOG_LINES}\n**Max Log Size:** ${LOG_SIZE} bytes\n**Excluded Containers:** ${EXCLUDED_CONTAINER_NAMES}\n\nLog analysis completed successfully with no stacktraces detected. + ELSE + # Stacktraces found - create issues for each one + ${delimiter}= Evaluate '-' * 80 + + FOR ${traceback} IN @{tracebacks} + ${stacktrace}= Set Variable ${traceback["stacktrace"]} + ${timestamp}= Set Variable ${traceback["timestamp"]} + RW.Core.Add Issue + ... severity=2 + ... expected=No stacktraces should be present in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` logs in namespace `${NAMESPACE}` + ... actual=Stacktrace detected in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` logs in namespace `${NAMESPACE}` + ... title=Stacktrace Detected in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` + ... reproduce_hint=Check application logs for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in namespace `${NAMESPACE}` + ... details=${delimiter}\n${stacktrace}\n${delimiter} + ... next_steps=Review application logs for the root cause of the stacktrace\nCheck application configuration and resource limits\nInvestigate the specific error conditions that led to this stacktrace\nConsider scaling or restarting the ${WORKLOAD_TYPE} if issues persist\nMonitor application health and performance metrics + ... next_action=analyseStacktrace + ... 
observed_at=${timestamp} + END + + # Create consolidated report showing all stacktraces + ${stacktrace_strings}= Evaluate [tb["stacktrace"] for tb in ${tracebacks}] + ${agg_tracebacks}= Evaluate "\\n" + "\\n${delimiter}\\n".join(${stacktrace_strings}) + RW.Core.Add Pre To Report **🔍 Stacktraces Found for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`**\n**Total Stacktraces:** ${total_tracebacks}\n**Log Analysis Period:** ${LOG_AGE}\n**Max Log Lines:** ${LOG_LINES}\n**Max Log Size:** ${LOG_SIZE} bytes\n**Excluded Containers:** ${EXCLUDED_CONTAINER_NAMES}\n\n${agg_tracebacks} + END + + # Clean up temporary log files + RW.K8sLog.Cleanup Temp Files + END diff --git a/codebundles/k8s-applog-health/runbook_patterns.json b/codebundles/k8s-applog-health/runbook_patterns.json new file mode 100644 index 000000000..12c9ac4d4 --- /dev/null +++ b/codebundles/k8s-applog-health/runbook_patterns.json @@ -0,0 +1,153 @@ +{ + "critical_patterns": { + "GenericError": { + "description": "Generic error patterns that indicate application failures", + "patterns": [ + "exception|Exception|EXCEPTION", + "fatal|Fatal|FATAL", + "panic|Panic|PANIC", + "crash|Crash|CRASH", + "failed|Failed|FAILED", + "failure|Failure|FAILURE" + ], + "severity": 1 + }, + "AppFailure": { + "description": "Application-specific failure patterns", + "patterns": [ + "application.*failed", + "service.*unavailable", + "connection.*refused", + "timeout.*error", + "out.*of.*memory", + "disk.*full", + "permission.*denied", + "authentication.*failed", + "authorization.*failed" + ], + "severity": 1 + }, + "StackTrace": { + "description": "Stack trace patterns indicating application crashes", + "patterns": [ + "stack.*trace", + "at\\s+\\w+\\.\\w+", + "Exception.*in thread", + "java\\.lang\\.", + "python.*traceback", + "goroutine.*panic", + "panic:", + "fatal.*error" + ], + "severity": 1 + }, + "Connection": { + "description": "Connection and network related issues", + "patterns": [ + "connection.*reset", + 
"connection.*timeout", + "network.*unreachable", + "host.*unreachable", + "connection.*dropped", + "socket.*error", + "dns.*resolution.*failed" + ], + "severity": 2 + }, + "Timeout": { + "description": "Timeout related issues", + "patterns": [ + "request.*timeout", + "operation.*timeout", + "deadline.*exceeded", + "context.*timeout", + "read.*timeout", + "write.*timeout" + ], + "severity": 2 + }, + "Auth": { + "description": "Authentication and authorization issues", + "patterns": [ + "unauthorized", + "authentication.*error", + "invalid.*credentials", + "access.*denied", + "forbidden", + "token.*expired", + "certificate.*error" + ], + "severity": 2 + }, + "Exceptions": { + "description": "Various exception patterns", + "patterns": [ + "NullPointerException", + "IndexOutOfBoundsException", + "IllegalArgumentException", + "RuntimeException", + "SQLException", + "IOException" + ], + "severity": 2 + }, + "Resource": { + "description": "Resource constraint issues", + "patterns": [ + "resource.*exhausted", + "memory.*leak", + "cpu.*throttled", + "disk.*space.*low", + "quota.*exceeded", + "rate.*limit.*exceeded" + ], + "severity": 2 + }, + "HealthyRecovery": { + "description": "Recovery and healthy state patterns (lower severity)", + "patterns": [ + "recovered.*from.*error", + "connection.*restored", + "service.*back.*online", + "retry.*successful", + "health.*check.*passed" + ], + "severity": 4 + } + }, + "exclude_patterns": [ + "\\bINFO\\b", + "\\bDEBUG\\b", + "\\bTRACE\\b", + "health.*check", + "heartbeat", + "metrics", + "monitoring", + "\\],INFO\\s*,", + "INFO\\s*,c\\.", + "START\\s*-\\s*.*Impl\\.", + "BusinessService.*\\(\\)", + "RestrictionsApiDelegateImpl", + "ReadBlockedResourceListBusinessServiceImpl", + "LocationReplicaStrategy", + "\\bSTART\\s*method\\b", + "Calling\\s*BusinessService", + "BusinessServiceImpl", + "ApiDelegateImpl", + "successful.*startup", + "application.*started", + "server.*started", + "listening.*on.*port", + "configuration.*loaded", + 
"database.*connection.*established", + "linkerd.*INFO.*Connection closed.*error=read header from client timeout", + "linkerd_app_core::serve.*Connection closed.*error=read header from client timeout", + "TelemetryPipeline.*In the last \\d+ minutes.*operation has failed.*Sending telemetry to the ingestion service", + "adjustmentReason=\\w*EXCEPTION\\w*" + ], + "config": { + "max_matches_per_pattern": 10, + "case_sensitive": false, + "timeout_seconds": 30 + } +} \ No newline at end of file diff --git a/codebundles/k8s-applog-health/sli.robot b/codebundles/k8s-applog-health/sli.robot new file mode 100755 index 000000000..ce65ddeac --- /dev/null +++ b/codebundles/k8s-applog-health/sli.robot @@ -0,0 +1,371 @@ +*** Settings *** +Metadata Author akshayrw25 +Documentation This SLI uses kubectl to score application log health. Produces a value between 0 (completely failing the test) and 1 (fully passing the test). Looks for container restarts, critical log errors, pods not ready, deployment status, stacktraces and other recent events. +Metadata Display Name Kubernetes Application Log Healthcheck +Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift +Suite Setup Suite Initialization +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform +Library RW.K8sLog +Library RW.LogAnalysis.ExtractTraceback + +Library OperatingSystem +Library String +Library Collections + +*** Keywords *** +Suite Initialization + ${kubeconfig}= RW.Core.Import Secret kubeconfig + ... type=string + ... description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). + ... pattern=\w* + ... example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ + ${NAMESPACE}= RW.Core.Import User Variable NAMESPACE + ... type=string + ... description=The name of the Kubernetes namespace to scope actions and searching to. + ... pattern=\w* + ... 
example=my-namespace + ${CONTEXT}= RW.Core.Import User Variable CONTEXT + ... type=string + ... description=Which Kubernetes context to operate within. + ... pattern=\w* + ... example=my-main-cluster + ${WORKLOAD_TYPE}= RW.Core.Import User Variable WORKLOAD_TYPE + ... type=string + ... description=The type of Kubernetes workload to analyze. + ... pattern=\w* + ... enum=[deployment,statefulset,daemonset] + ... example=deployment + ... default=deployment + ${WORKLOAD_NAME}= RW.Core.Import User Variable WORKLOAD_NAME + ... type=string + ... description=The name of the Kubernetes workload to check. + ... pattern=\w* + ... example=my-workload + ${CONTAINER_RESTART_AGE}= RW.Core.Import User Variable CONTAINER_RESTART_AGE + ... type=string + ... description=The time window in minutes to search for container restarts. + ... pattern=((\d+?)m)? + ... example=10m + ... default=10m + ${CONTAINER_RESTART_THRESHOLD}= RW.Core.Import User Variable CONTAINER_RESTART_THRESHOLD + ... type=string + ... description=The maximum total container restarts to be still considered healthy. + ... pattern=^\d+$ + ... example=1 + ... default=1 + ${RW_LOOKBACK_WINDOW}= RW.Core.Import Platform Variable RW_LOOKBACK_WINDOW + ${RW_LOOKBACK_WINDOW}= RW.Core.Normalize Lookback Window ${RW_LOOKBACK_WINDOW} 2 + ${MAX_LOG_LINES}= RW.Core.Import User Variable MAX_LOG_LINES + ... type=string + ... description=Maximum number of log lines to fetch per container to prevent API overload. + ... pattern=^\d+$ + ... example=100 + ... default=1000 + ${MAX_LOG_BYTES}= RW.Core.Import User Variable MAX_LOG_BYTES + ... type=string + ... description=Maximum log size in bytes to fetch per container to prevent API overload. + ... pattern=^\d+$ + ... example=256000 + ... default=256000 + ${LOGS_EXCLUDE_PATTERN}= RW.Core.Import User Variable LOGS_EXCLUDE_PATTERN + ... type=string + ... description=Pattern used to exclude entries from log analysis when searching for errors. 
Use regex patterns to filter out false positives like JSON structures. + ... pattern=.* + ... example="errors":\\s*\\[\\]|"warnings":\\s*\\[\\] + ... default="errors":\\\\s*\\\\[\\\\]|\\\\bINFO\\\\b|\\\\bDEBUG\\\\b|\\\\bTRACE\\\\b|\\\\bSTART\\\\s*-\\\\s*|\\\\bSTART\\\\s*method\\\\b + ${EXCLUDED_CONTAINER_NAMES}= RW.Core.Import User Variable EXCLUDED_CONTAINER_NAMES + ... type=string + ... description=Comma-separated list of container names to exclude from log analysis (e.g., linkerd-proxy, istio-proxy, vault-agent). + ... pattern=.* + ... example=linkerd-proxy,istio-proxy,vault-agent + ... default=linkerd-proxy,istio-proxy,vault-agent + + ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY + ... type=string + ... description=Which binary to use for Kubernetes CLI commands. + ... enum=[kubectl,oc] + ... example=kubectl + ... default=kubectl + Set Suite Variable ${kubeconfig} ${kubeconfig} + Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} + Set Suite Variable ${CONTAINER_RESTART_AGE} ${CONTAINER_RESTART_AGE} + Set Suite Variable ${CONTAINER_RESTART_THRESHOLD} ${CONTAINER_RESTART_THRESHOLD} + Set Suite Variable ${RW_LOOKBACK_WINDOW} ${RW_LOOKBACK_WINDOW} + Set Suite Variable ${MAX_LOG_LINES} ${MAX_LOG_LINES} + Set Suite Variable ${MAX_LOG_BYTES} ${MAX_LOG_BYTES} + Set Suite Variable ${LOGS_EXCLUDE_PATTERN} ${LOGS_EXCLUDE_PATTERN} + Set Suite Variable ${EXCLUDED_CONTAINER_NAMES} ${EXCLUDED_CONTAINER_NAMES} + + # Convert comma-separated string to list + @{EXCLUDED_CONTAINERS_RAW}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + @{EXCLUDED_CONTAINERS}= Create List + FOR ${container} IN @{EXCLUDED_CONTAINERS_RAW} + ${trimmed_container}= Strip String ${container} + Append To List ${EXCLUDED_CONTAINERS} ${trimmed_container} + END + Set Suite Variable @{EXCLUDED_CONTAINERS} + + Set Suite Variable ${CONTEXT} ${CONTEXT} + Set Suite 
Variable ${NAMESPACE} ${NAMESPACE} + Set Suite Variable ${WORKLOAD_NAME} ${WORKLOAD_NAME} + Set Suite Variable ${WORKLOAD_TYPE} ${WORKLOAD_TYPE} + Set Suite Variable ${env} {"KUBECONFIG":"./${kubeconfig.key}"} + + # Initialize score variables + Set Suite Variable ${container_restart_score} 0 + Set Suite Variable ${log_health_score} 0 + Set Suite Variable ${pods_notready_score} 0 + Set Suite Variable ${replica_score} 0 + Set Suite Variable ${events_score} 0 + + + # Check if workload is scaled to 0 and handle appropriately + # Different workload types have different field structures + + IF '${WORKLOAD_TYPE}' == 'daemonset' + # DaemonSets don't scale to 0 in the traditional sense, so skip scale-down logic for them + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is a DaemonSet - proceeding with log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + ELSE + IF '${WORKLOAD_TYPE}' == 'statefulset' + # StatefulSet: use current/updated replicas in addition to spec/ready + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), current_replicas: (.status.currentReplicas // 0), updated_replicas: (.status.updatedReplicas // 0)}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=30 + ELSE + # For deployments + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... 
timeout_seconds=30 + END + + TRY + ${scale_status}= Evaluate json.loads(r'''${scale_check.stdout}''') if r'''${scale_check.stdout}'''.strip() else {} json + ${spec_replicas}= Evaluate $scale_status.get('spec_replicas', 1) + + # Try to determine when deployment was scaled down by checking recent events and replica set history + ${scale_down_info}= Get Deployment Scale Down Timestamp ${spec_replicas} + + IF ${spec_replicas} == 0 + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is scaled to 0 replicas - returning special health score + Log Scale down detected at: ${scale_down_info} + + # For scaled-down workloads, return a score of 1.0 to indicate "intentionally down" vs "broken" + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${True} + Set Suite Variable ${SCALED_DOWN_INFO} ${scale_down_info} + ELSE + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} has ${spec_replicas} desired replicas - proceeding with log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + END + + EXCEPT + Log Warning: Failed to check workload scale, continuing with normal log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + END + END + +Get Deployment Scale Down Timestamp + [Arguments] ${spec_replicas} + [Documentation] Attempts to determine when a deployment was scaled down by examining recent events + ${scale_down_info}= Set Variable Unknown + + IF ${spec_replicas} == 0 + IF '${WORKLOAD_TYPE}' == 'deployment' + TRY + # Check recent scaling events to find when it was scaled to 0 + ${scaling_events}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --context ${CONTEXT} -n ${NAMESPACE} --sort-by='.lastTimestamp' -o json | jq -r '.items[] | select(.reason == "ScalingReplicaSet" and (.message | contains("${WORKLOAD_NAME}")) and (.message | contains("to 0"))) | {timestamp: .lastTimestamp, message: .message}' | jq -s 'sort_by(.timestamp) | reverse | .[0] // empty' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... 
timeout_seconds=15 + + IF '''${scaling_events.stdout}''' != '' + ${event_data}= Evaluate json.loads(r'''${scaling_events.stdout}''') if r'''${scaling_events.stdout}'''.strip() else {} json + ${timestamp}= Evaluate $event_data.get('timestamp', 'Unknown') + ${message}= Evaluate $event_data.get('message', 'Unknown') + ${scale_down_info}= Set Variable ${timestamp} (${message}) + Log Found scale-down event: ${scale_down_info} + ELSE + # Try checking replicaset history as fallback + ${rs_history}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get replicasets --context ${CONTEXT} -n ${NAMESPACE} -l app=${WORKLOAD_NAME} -o json | jq -r '.items[] | select(.spec.replicas == 0) | {creation_time: .metadata.creationTimestamp, name: .metadata.name}' | jq -s 'sort_by(.creation_time) | reverse | .[0] // empty' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=15 + + IF '''${rs_history.stdout}''' != '' + ${rs_data}= Evaluate json.loads(r'''${rs_history.stdout}''') if r'''${rs_history.stdout}'''.strip() else {} json + ${rs_time}= Evaluate $rs_data.get('creation_time', 'Unknown') + ${scale_down_info}= Set Variable Likely around ${rs_time} (based on ReplicaSet history) + Log Estimated scale-down time from ReplicaSet: ${scale_down_info} + ELSE + ${scale_down_info}= Set Variable Unable to determine - no recent scaling events found + Log Could not determine when ${WORKLOAD_TYPE} ${WORKLOAD_NAME} was scaled down + END + END + EXCEPT + Log Warning: Failed to determine scale-down timestamp + ${scale_down_info}= Set Variable Failed to determine scale-down time + END + ELSE IF '${WORKLOAD_TYPE}' == 'statefulset' + TRY + # StatefulSet: find scale-to-0 event via involvedObject + ${scaling_events}= RW.CLI.Run Cli + ... 
cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --context ${CONTEXT} -n ${NAMESPACE} --sort-by='.lastTimestamp' -o json | jq -r '.items[] | select(.involvedObject.kind == "StatefulSet" and .involvedObject.name == "${WORKLOAD_NAME}" and (.message | contains("to 0") or (contains("delete Pod") and contains("successful")))) | {timestamp: .lastTimestamp, message: .message}' | jq -s 'sort_by(.timestamp) | reverse | .[0] // empty' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=15 + + IF '''${scaling_events.stdout}''' != '' + ${event_data}= Evaluate json.loads(r'''${scaling_events.stdout}''') if r'''${scaling_events.stdout}'''.strip() else {} json + ${timestamp}= Evaluate $event_data.get('timestamp', 'Unknown') + ${message}= Evaluate $event_data.get('message', 'Unknown') + ${scale_down_info}= Set Variable ${timestamp} (${message}) + Log Found scale-down event: ${scale_down_info} + ELSE + ${scale_down_info}= Set Variable Unable to determine - no recent scaling events found for StatefulSet + Log Could not determine when ${WORKLOAD_TYPE} ${WORKLOAD_NAME} was scaled down + END + EXCEPT + Log Warning: Failed to determine scale-down timestamp for StatefulSet + ${scale_down_info}= Set Variable Failed to determine scale-down time + END + END + END + + RETURN ${scale_down_info} + +*** Tasks *** +Get Critical Log Errors and Score for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` + [Documentation] Fetches logs and checks for critical error patterns that indicate application failures. + [Tags] logs errors critical patterns + + # Skip if deployment is scaled down + IF ${SKIP_HEALTH_CHECKS} + Log Skipping log analysis - ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is scaled to 0 replicas + ${log_health_score}= Set Variable 1 # Perfect score for scaled deployment + Set Suite Variable ${log_health_score} + RW.Core.Push Metric ${log_health_score} sub_name=log_errors + ELSE + ${log_dir}= RW.K8sLog.Fetch Workload Logs + ... workload_type=${WORKLOAD_TYPE} + ... 
workload_name=${WORKLOAD_NAME} + ... namespace=${NAMESPACE} + ... context=${CONTEXT} + ... kubeconfig=${kubeconfig} + ... log_age=${RW_LOOKBACK_WINDOW} + ... max_log_lines=${MAX_LOG_LINES} + ... max_log_bytes=${MAX_LOG_BYTES} + ... excluded_containers=${EXCLUDED_CONTAINERS} + + # Use only critical error patterns for fast SLI checks + @{critical_categories}= Create List GenericError AppFailure + + ${scan_results}= RW.K8sLog.Scan Logs For Issues + ... log_dir=${log_dir} + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} + ... namespace=${NAMESPACE} + ... categories=${critical_categories} + ... custom_patterns_file=sli_critical_patterns.json + ... excluded_containers=${EXCLUDED_CONTAINERS} + + # Post-process results to filter out patterns matching LOGS_EXCLUDE_PATTERN + TRY + IF $LOGS_EXCLUDE_PATTERN != "" + ${filtered_issues}= Evaluate [issue for issue in $scan_results.get('issues', []) if not __import__('re').search('${LOGS_EXCLUDE_PATTERN}', issue.get('details', ''), __import__('re').IGNORECASE)] modules=re + ${filtered_results}= Evaluate {**$scan_results, 'issues': $filtered_issues} + Set Test Variable ${scan_results} ${filtered_results} + END + EXCEPT + Log Warning: Failed to apply LOGS_EXCLUDE_PATTERN filter, using unfiltered results + END + + ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} + + # Store details for final score calculation logging + TRY + ${issues}= Evaluate $scan_results.get('issues', []) + ${issue_count}= Get Length ${issues} + Set Suite Variable ${log_health_details} ${issue_count} issues found + EXCEPT + Set Suite Variable ${log_health_details} analysis completed + END + + Set Suite Variable ${log_health_score} + RW.K8sLog.Cleanup Temp Files + RW.Core.Push Metric ${log_health_score} sub_name=log_errors + END + +Get Stacktrace Health Score for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` + [Documentation] Checks for recent stacktraces/tracebacks related to the workload within a short time 
window, with filtering to reduce noise. + [Tags] stacktraces tracebacks errors recent fast + IF ${SKIP_HEALTH_CHECKS} + # For scaled-down deployments, return perfect score to indicate "intentionally down" vs "broken" + ${stacktrace_score}= Set Variable 1.0 + Set Suite Variable ${stacktrace_details} ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is intentionally scaled to 0 replicas - Score: ${stacktrace_score} + ELSE + # Fetch logs using RW.K8sLog library (same pattern as deployment healthcheck) + ${log_dir}= RW.K8sLog.Fetch Workload Logs + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} + ... namespace=${NAMESPACE} + ... context=${CONTEXT} + ... kubeconfig=${kubeconfig} + ... log_age=${RW_LOOKBACK_WINDOW} + ... max_log_lines=${MAX_LOG_LINES} + ... max_log_bytes=${MAX_LOG_BYTES} + ... excluded_containers=${EXCLUDED_CONTAINERS} + + # Extract stacktraces from the log directory + ${recentmost_stacktrace}= RW.LogAnalysis.ExtractTraceback.Extract Tracebacks + ... logs_dir=${log_dir} + ... fast_exit=${True} + + ${stacktrace_length}= Get Length ${recentmost_stacktrace} + + IF ${stacktrace_length} != 0 + # Stacktrace found - set score to 0 + ${stacktrace_score}= Set Variable 0 + ${delimiter}= Evaluate '-' * 150 + Set Suite Variable ${stacktrace_details} **Stacktrace(s) identified**:\n${delimiter}\n${recentmost_stacktrace}\n${delimiter} + ELSE + # No stacktraces found - set score to 1 + ${stacktrace_score}= Set Variable 1.0 + Set Suite Variable ${stacktrace_details} **No Stacktraces identified.**\n\nLog analysis completed successfully. 
+ END + + # Clean up temporary log files + RW.K8sLog.Cleanup Temp Files + END + + Set Suite Variable ${stacktrace_score} + RW.Core.Push Metric ${stacktrace_score} sub_name=stacktrace_score + +Generate Application Health Score for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}` + [Documentation] Generates the final applog health score and report details + [Tags] score health applog + + IF ${SKIP_HEALTH_CHECKS} + # For scaled-down deployments, return perfect score to indicate "intentionally down" vs "broken" + # We distinguish scaled-down vs broken deployments through the log message and report details + ${health_score}= Set Variable 1.0 + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is intentionally scaled to 0 replicas (${SCALED_DOWN_INFO}) - Score: ${health_score} + ELSE + # Final health score is the minimum of the log-health and stacktrace scores. + # Evaluate (not Set Variable) is required so min() is actually computed. + ${health_score}= Evaluate min(${log_health_score}, ${stacktrace_score}) + + IF ${health_score} == 1.0 + RW.Core.Add to Report Applog Health Score: ${health_score} - No applog issues or stacktraces detected in workload logs + ELSE + RW.Core.Add to Report Applog Health Score: ${health_score} - Applog issue(s) or stacktrace(s) detected in workload logs: ${log_health_details} + END + END + RW.Core.Push Metric ${health_score} \ No newline at end of file diff --git a/codebundles/k8s-applog-health/sli_critical_patterns.json b/codebundles/k8s-applog-health/sli_critical_patterns.json new file mode 100755 index 000000000..19848bb80 --- /dev/null +++ b/codebundles/k8s-applog-health/sli_critical_patterns.json @@ -0,0 +1,74 @@ +{ + "critical_patterns": { + "GenericError": { + "description": "Generic error patterns that indicate application failures", + "patterns": [ + "exception|Exception|EXCEPTION", + "fatal|Fatal|FATAL", + "panic|Panic|PANIC", + "crash|Crash|CRASH", + "failed|Failed|FAILED", + "failure|Failure|FAILURE" + ], + "severity": 1 + }, + "AppFailure": { + "description": "Application-specific failure patterns", + "patterns": [ + "application.*failed", + 
"service.*unavailable", + "connection.*refused", + "timeout.*error", + "out.*of.*memory", + "disk.*full", + "permission.*denied", + "authentication.*failed", + "authorization.*failed" + ], + "severity": 1 + }, + "StackTrace": { + "description": "Stack trace patterns indicating application crashes", + "patterns": [ + "stack.*trace", + "at\\s+\\w+\\.\\w+", + "Exception.*in thread", + "java\\.lang\\.", + "python.*traceback", + "goroutine.*panic", + "panic:", + "fatal.*error" + ], + "severity": 1 + } + }, + "exclude_patterns": [ + "\\bINFO\\b", + "\\bDEBUG\\b", + "\\bTRACE\\b", + "health.*check", + "heartbeat", + "metrics", + "monitoring", + "\\],INFO\\s*,", + "INFO\\s*,c\\.", + "START\\s*-\\s*.*Impl\\.", + "BusinessService.*\\(\\)", + "RestrictionsApiDelegateImpl", + "ReadBlockedResourceListBusinessServiceImpl", + "LocationReplicaStrategy", + "\\bSTART\\s*method\\b", + "Calling\\s*BusinessService", + "BusinessServiceImpl", + "ApiDelegateImpl", + "linkerd.*INFO.*Connection closed.*error=read header from client timeout", + "linkerd_app_core::serve.*Connection closed.*error=read header from client timeout", + "TelemetryPipeline.*In the last \\d+ minutes.*operation has failed.*Sending telemetry to the ingestion service", + "adjustmentReason=\\w*EXCEPTION\\w*" + ], + "config": { + "max_matches_per_pattern": 10, + "case_sensitive": false, + "timeout_seconds": 30 + } +} \ No newline at end of file diff --git a/codebundles/k8s-daemonset-healthcheck/runbook.robot b/codebundles/k8s-daemonset-healthcheck/runbook.robot index 8233cb0a9..e2ea9a654 100644 --- a/codebundles/k8s-daemonset-healthcheck/runbook.robot +++ b/codebundles/k8s-daemonset-healthcheck/runbook.robot @@ -20,114 +20,6 @@ Suite Setup Suite Initialization *** Tasks *** -Analyze Application Log Patterns for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Fetches and analyzes logs from the DaemonSet pods for errors, stack traces, connection issues, and other patterns that indicate 
application health problems. - [Tags] - ... logs - ... application - ... errors - ... patterns - ... health - ... daemonset - ... stacktrace - ... access:read-only - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=daemonset - ... workload_name=${DAEMONSET_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${LOG_AGE} - - ${scan_results}= RW.K8sLog.Scan Logs For Issues - ... log_dir=${log_dir} - ... workload_type=daemonset - ... workload_name=${DAEMONSET_NAME} - ... namespace=${NAMESPACE} - ... categories=@{LOG_PATTERN_CATEGORIES} - - ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} - - # Process each issue found in the logs - ${issues}= Evaluate $scan_results.get('issues', []) - FOR ${issue} IN @{issues} - ${severity}= Evaluate $issue.get('severity', ${LOG_SEVERITY_THRESHOLD}) - IF ${severity} <= ${LOG_SEVERITY_THRESHOLD} - # Use the full issue details directly without summarization to preserve all log content - ${issue_details_raw}= Evaluate $issue.get("details", "") - ${issue_details_str}= Convert To String ${issue_details_raw} - ${issue_timestamp}= Evaluate $issue.get('observed_at', '') - - RW.Core.Add Issue - ... severity=${severity} - ... expected=Application logs should be free of critical errors for daemonset `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` - ... actual=${issue.get('title', 'Log pattern issue detected')} in daemonset `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` - ... title=${issue.get('title', 'Log Pattern Issue')} in DaemonSet `${DAEMONSET_NAME}` - ... reproduce_hint=Check application logs for daemonset `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` - ... details=${issue_details_str} - ... next_steps=${issue.get('next_steps', 'Review application logs and resolve underlying issues')} - ... 
observed_at=${issue_timestamp} - END - END - - ${issues_count}= Get Length ${issues} - - # Format scan results for better display - ${formatted_results}= RW.K8sLog.Format Scan Results For Display scan_results=${scan_results} - - RW.Core.Add Pre To Report **Log Analysis Summary for DaemonSet `${DAEMONSET_NAME}`**\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} - - RW.K8sLog.Cleanup Temp Files - -Detect Log Anomalies for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Analyzes logs for repeating patterns, anomalous behavior, and unusual log volume that may indicate underlying issues. - [Tags] - ... logs - ... anomalies - ... patterns - ... volume - ... daemonset - ... ${DAEMONSET_NAME} - ... access:read-only - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=daemonset - ... workload_name=${DAEMONSET_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${LOG_AGE} - - ${anomaly_results}= RW.K8sLog.Analyze Log Anomalies - ... log_dir=${log_dir} - ... workload_type=daemonset - ... workload_name=${DAEMONSET_NAME} - ... namespace=${NAMESPACE} - - # Process anomaly issues - ${anomaly_issues}= Evaluate $anomaly_results.get('issues', []) - IF len($anomaly_issues) > 0 - FOR ${issue} IN @{anomaly_issues} - ${summarized_details}= RW.K8sLog.Summarize Log Issues issue_details=${issue["details"]} - ${next_steps_text}= Catenate SEPARATOR=\n @{issue["next_steps"]} - ${issue_timestamp}= Evaluate $issue.get('observed_at', '') - - RW.Core.Add Issue - ... severity=${issue["severity"]} - ... expected=No log anomalies should be present in DaemonSet `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` - ... actual=Log anomalies detected in DaemonSet `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` - ... title=${issue["title"]} - ... 
reproduce_hint=Use RW.K8sLog.Analyze Log Anomalies keyword to reproduce this analysis - ... details=${summarized_details} - ... next_steps=${next_steps_text} - ... observed_at=${issue_timestamp} - END - END - - # Add summary to report - ${anomaly_summary}= Catenate SEPARATOR=\n @{anomaly_results["summary"]} - RW.Core.Add Pre To Report Log Anomaly Analysis for DaemonSet ${DAEMONSET_NAME}:\n${anomaly_summary} - - RW.K8sLog.Cleanup Temp Files Identify Recent Configuration Changes for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` [Documentation] Identifies recent configuration changes from ControllerRevision analysis that might be related to current issues. diff --git a/codebundles/k8s-deployment-healthcheck/runbook.robot b/codebundles/k8s-deployment-healthcheck/runbook.robot index cde66123b..f825ea8fb 100755 --- a/codebundles/k8s-deployment-healthcheck/runbook.robot +++ b/codebundles/k8s-deployment-healthcheck/runbook.robot @@ -215,105 +215,6 @@ Suite Initialization *** Tasks *** -Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Fetches and analyzes logs from the deployment pods for errors, connection issues, and other patterns that indicate application health problems. Note: Warning messages about missing log files for excluded containers (like linkerd-proxy, istio-proxy) are expected and harmless. - [Tags] - ... logs - ... application - ... errors - ... patterns - ... health - ... deployment - ... access:read-only - # Skip pod-related checks if deployment is scaled to 0 - IF not ${SKIP_POD_CHECKS} - # Temporarily suppress log warnings for excluded containers (they're expected) - TRY - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${LOG_AGE} - ... 
excluded_containers=${EXCLUDED_CONTAINERS} - EXCEPT AS ${log_error} - # If log fetching fails completely, log the error but continue - Log Warning: Log fetching encountered an error: ${log_error} - # Set empty log directory to continue with other checks - ${log_dir}= Set Variable ${EMPTY} - END - - # Only scan logs if we have a valid log directory - IF '''${log_dir}''' != '''${EMPTY}''' - ${scan_results}= RW.K8sLog.Scan Logs For Issues - ... log_dir=${log_dir} - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} - ... namespace=${NAMESPACE} - ... categories=@{LOG_PATTERN_CATEGORIES} - ... custom_patterns_file=runbook_patterns.json - ... excluded_containers=${EXCLUDED_CONTAINERS} - ELSE - # Create empty scan results if no logs were fetched - ${scan_results}= Evaluate {"issues": [], "summary": ["No logs available for analysis"]} - END - - # Post-process results to filter out patterns matching LOGS_EXCLUDE_PATTERN - TRY - IF $LOGS_EXCLUDE_PATTERN != "" - ${filtered_issues}= Evaluate [issue for issue in $scan_results.get('issues', []) if not __import__('re').search('${LOGS_EXCLUDE_PATTERN}', issue.get('details', ''), __import__('re').IGNORECASE)] modules=re - ${filtered_results}= Evaluate {**$scan_results, 'issues': $filtered_issues} - Set Test Variable ${scan_results} ${filtered_results} - END - EXCEPT - Log Warning: Failed to apply LOGS_EXCLUDE_PATTERN filter, using unfiltered results - END - - ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} - - # Process each issue found in the logs - ${issues}= Evaluate $scan_results.get('issues', []) - FOR ${issue} IN @{issues} - ${severity}= Evaluate $issue.get('severity', ${LOG_SEVERITY_THRESHOLD}) - IF ${severity} <= ${LOG_SEVERITY_THRESHOLD} - # Convert issue details to string to avoid serialization issues - ${issue_details_raw}= Evaluate $issue.get("details", "") - ${issue_details_str}= Convert To String ${issue_details_raw} - ${summarized_details}= RW.K8sLog.Summarize 
Log Issues issue_details=${issue_details_str} - - # Safely extract title and next_steps as strings - ${issue_title_raw}= Evaluate $issue.get('title', 'Log pattern issue detected') - ${issue_title}= Convert To String ${issue_title_raw} - ${next_steps_raw}= Evaluate $issue.get('next_steps', 'Review application logs and resolve underlying issues') - ${next_steps}= Convert To String ${next_steps_raw} - - # Use timestamp from log scan results if available, otherwise extract from details - ${issue_timestamp}= Evaluate $issue.get('observed_at', '') - - RW.Core.Add Issue - ... severity=${severity} - ... expected=Application logs should be free of critical errors for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... actual=${issue_title} in deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... title=${issue_title} in Deployment `${DEPLOYMENT_NAME}` - ... reproduce_hint=Check application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... details=${summarized_details} - ... next_steps=${next_steps} - ... 
observed_at=${issue_timestamp} - END - END - - ${issues_count}= Get Length ${issues} - - # Convert scan_results to string to avoid serialization issues, then format for display - ${scan_results_str}= Evaluate json.dumps($scan_results, indent=2) json - ${formatted_results}= RW.K8sLog.Format Scan Results For Display scan_results=${scan_results_str} - - RW.Core.Add Pre To Report **Log Analysis Summary for Deployment `${DEPLOYMENT_NAME}`**\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} - - RW.K8sLog.Cleanup Temp Files - END - Detect Event Anomalies for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` [Documentation] Analyzes Kubernetes event patterns to identify anomalies such as sudden spikes in event rates, unusual patterns, or recurring issues that might indicate underlying problems with controllers, resources, or deployments. [Tags] @@ -388,71 +289,6 @@ Detect Event Anomalies for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMES END END -Fetch Deployment Logs for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Fetches and displays deployment logs in the report for manual review. Note: Issues are not created by this task - see "Analyze Application Log Patterns" for automated issue detection. - [Tags] - ... logs - ... collection - ... deployment - ... troubleshooting - ... access:read-only - # Skip pod-related checks if deployment is scaled to 0 - IF not ${SKIP_POD_CHECKS} - # Fetch raw logs - ${deployment_logs}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} logs deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} --tail=${LOG_LINES} --since=${LOG_AGE} - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... show_in_rwl_cheatsheet=true - ... 
render_in_commandlist=true - - IF ${deployment_logs.returncode} == 0 - # Filter logs to remove repetitive health check messages and focus on meaningful content - ${filtered_logs}= RW.CLI.Run Cli - ... cmd=echo "${deployment_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /|POST /health|probe|liveness|readiness)" | grep -E "(error|ERROR|warn|WARN|exception|Exception|fail|FAIL|fatal|FATAL|panic|stack|trace|timeout|connection.*refused|unable.*connect|authentication.*failed|denied|forbidden|unauthorized|500|502|503|504)" | tail -50 || echo "No significant errors or warnings found in recent logs" - ... env=${env} - ... include_in_history=false - - # Also get a sample of non-health-check logs for context - ${context_logs}= RW.CLI.Run Cli - ... cmd=echo "${deployment_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /|POST /health|probe|liveness|readiness)" | head -20 | tail -10 - ... env=${env} - ... include_in_history=false - - ${history}= RW.CLI.Pop Shell History - - # Determine if logs are mostly health checks - ${total_lines}= RW.CLI.Run Cli - ... cmd=echo "${deployment_logs.stdout}" | wc -l - ... env=${env} - ... include_in_history=false - - ${health_check_lines}= RW.CLI.Run Cli - ... cmd=echo "${deployment_logs.stdout}" | grep -E "(Checking.*Health|Health.*Check|healthcheck|/health)" | wc -l - ... env=${env} - ... 
include_in_history=false - - # Handle empty output from wc -l by providing default values - ${total_lines_clean}= Set Variable If "${total_lines.stdout.strip()}" == "" 0 ${total_lines.stdout.strip()} - ${health_check_lines_clean}= Set Variable If "${health_check_lines.stdout.strip()}" == "" 0 ${health_check_lines.stdout.strip()} - - ${total_count}= Convert To Integer ${total_lines_clean} - ${health_count}= Convert To Integer ${health_check_lines_clean} - - # Create consolidated logs report - IF ${health_count} > ${total_count} * 0.8 - ${log_content}= Set Variable If "${context_logs.stdout.strip()}" != "" **🔍 Filtered Error/Warning Logs:**\n${filtered_logs.stdout}\n\n**📝 Sample Application Logs (Non-Health Check):**\n${context_logs.stdout} **🔍 Filtered Error/Warning Logs:**\n${filtered_logs.stdout} - RW.Core.Add Pre To Report **📋 Raw Deployment Logs for `${DEPLOYMENT_NAME}`** (Last ${LOG_LINES} lines, ${LOG_AGE} age)\n**Total Log Lines:** ${total_count} | **Health Check Lines:** ${health_count}\n**ℹ️ Logs are mostly health check messages (${health_count}/${total_count} lines)**\n\n${log_content}\n\n**Commands Used:** ${history}\n\n**Note:** Automated issue detection is performed by the "Analyze Application Log Patterns" task. - ELSE - RW.Core.Add Pre To Report **📋 Raw Deployment Logs for `${DEPLOYMENT_NAME}`** (Last ${LOG_LINES} lines, ${LOG_AGE} age)\n**Total Log Lines:** ${total_count} | **Health Check Lines:** ${health_count}\n\n**📝 Recent Application Logs:**\n${deployment_logs.stdout}\n\n**Commands Used:** ${history}\n\n**Note:** Automated issue detection is performed by the "Analyze Application Log Patterns" task. 
- END - ELSE - # Only add to report if fetch failed, don't create issue - ${history}= RW.CLI.Pop Shell History - RW.Core.Add Pre To Report **📋 Raw Deployment Logs for `${DEPLOYMENT_NAME}`**\n\n⚠️ Unable to fetch deployment logs (exit code ${deployment_logs.returncode}).\n\n**STDERR:** ${deployment_logs.stderr}\n\n**Commands Used:** ${history} - END - END - Check Liveness Probe Configuration for Deployment `${DEPLOYMENT_NAME}` [Documentation] Validates if a Liveness probe has possible misconfigurations [Tags] diff --git a/codebundles/k8s-deployment-healthcheck/sli.robot b/codebundles/k8s-deployment-healthcheck/sli.robot index 7571c55e0..ef4f08444 100755 --- a/codebundles/k8s-deployment-healthcheck/sli.robot +++ b/codebundles/k8s-deployment-healthcheck/sli.robot @@ -129,7 +129,6 @@ Suite Initialization # Initialize score variables Set Suite Variable ${container_restart_score} 0 - Set Suite Variable ${log_health_score} 0 Set Suite Variable ${pods_notready_score} 0 Set Suite Variable ${replica_score} 0 Set Suite Variable ${events_score} 0 @@ -246,67 +245,6 @@ Get Container Restarts and Score for Deployment `${DEPLOYMENT_NAME}` RW.Core.Push Metric ${container_restart_score} sub_name=container_restarts END -Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}` - [Documentation] Fetches logs and checks for critical error patterns that indicate application failures. - [Tags] logs errors critical patterns - - # Skip if deployment is scaled down - IF ${SKIP_HEALTH_CHECKS} - Log Skipping log analysis - deployment is scaled to 0 replicas - ${log_health_score}= Set Variable 1 # Perfect score for scaled deployment - Set Suite Variable ${log_health_score} - RW.Core.Push Metric ${log_health_score} sub_name=log_errors - ELSE - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${RW_LOOKBACK_WINDOW} - ... 
max_log_lines=${MAX_LOG_LINES} - ... max_log_bytes=${MAX_LOG_BYTES} - ... excluded_containers=${EXCLUDED_CONTAINERS} - - # Use only critical error patterns for fast SLI checks - @{critical_categories}= Create List GenericError AppFailure - - ${scan_results}= RW.K8sLog.Scan Logs For Issues - ... log_dir=${log_dir} - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} - ... namespace=${NAMESPACE} - ... categories=${critical_categories} - ... custom_patterns_file=sli_critical_patterns.json - ... excluded_containers=${EXCLUDED_CONTAINERS} - - # Post-process results to filter out patterns matching LOGS_EXCLUDE_PATTERN - TRY - IF $LOGS_EXCLUDE_PATTERN != "" - ${filtered_issues}= Evaluate [issue for issue in $scan_results.get('issues', []) if not __import__('re').search('${LOGS_EXCLUDE_PATTERN}', issue.get('details', ''), __import__('re').IGNORECASE)] modules=re - ${filtered_results}= Evaluate {**$scan_results, 'issues': $filtered_issues} - Set Test Variable ${scan_results} ${filtered_results} - END - EXCEPT - Log Warning: Failed to apply LOGS_EXCLUDE_PATTERN filter, using unfiltered results - END - - ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} - - # Store details for final score calculation logging - TRY - ${issues}= Evaluate $scan_results.get('issues', []) - ${issue_count}= Get Length ${issues} - Set Suite Variable ${log_health_details} ${issue_count} issues found - EXCEPT - Set Suite Variable ${log_health_details} analysis completed - END - - Set Suite Variable ${log_health_score} - RW.K8sLog.Cleanup Temp Files - RW.Core.Push Metric ${log_health_score} sub_name=log_errors - END - Get NotReady Pods Score for Deployment `${DEPLOYMENT_NAME}` [Documentation] Fetches a count of unready pods for the specific deployment. 
[Tags] access:read-only Pods Status Phase Ready Unready Running @@ -411,13 +349,12 @@ Generate Deployment Health Score for `${DEPLOYMENT_NAME}` Log Deployment ${DEPLOYMENT_NAME} is intentionally scaled to 0 replicas (${SCALED_DOWN_INFO}) - Score: ${health_score} ELSE # Calculate the normal health score - ${active_checks}= Set Variable 5 - ${deployment_health_score}= Evaluate (${container_restart_score} + ${log_health_score} + ${pods_notready_score} + ${replica_score} + ${events_score}) / ${active_checks} + ${active_checks}= Set Variable 4 + ${deployment_health_score}= Evaluate (${container_restart_score} + ${pods_notready_score} + ${replica_score} + ${events_score}) / ${active_checks} ${health_score}= Convert to Number ${deployment_health_score} 2 # Create a single line showing unhealthy components IF ${container_restart_score} < 1 Append To List ${unhealthy_components} Container Restarts (${container_restart_details}) - IF ${log_health_score} < 0.8 Append To List ${unhealthy_components} Log Health (${log_health_details}) IF ${pods_notready_score} < 1 Append To List ${unhealthy_components} Pod Readiness (${pod_readiness_details}) IF ${replica_score} < 1 Append To List ${unhealthy_components} Replica Status (${replica_details}) IF ${events_score} < 1 Append To List ${unhealthy_components} Warning Events (${events_details}) diff --git a/codebundles/k8s-statefulset-healthcheck/runbook.robot b/codebundles/k8s-statefulset-healthcheck/runbook.robot index b5394e3fe..3a8fa9cde 100644 --- a/codebundles/k8s-statefulset-healthcheck/runbook.robot +++ b/codebundles/k8s-statefulset-healthcheck/runbook.robot @@ -20,114 +20,6 @@ Suite Setup Suite Initialization *** Tasks *** -Analyze Application Log Patterns for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Fetches and analyzes logs from the StatefulSet pods for errors, stack traces, connection issues, and other patterns that indicate application health problems. - [Tags] - ... logs - ... 
application - ... errors - ... patterns - ... health - ... statefulset - ... stacktrace - ... access:read-only - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=statefulset - ... workload_name=${STATEFULSET_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${LOG_AGE} - - ${scan_results}= RW.K8sLog.Scan Logs For Issues - ... log_dir=${log_dir} - ... workload_type=statefulset - ... workload_name=${STATEFULSET_NAME} - ... namespace=${NAMESPACE} - ... categories=@{LOG_PATTERN_CATEGORIES} - - ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} - - # Process each issue found in the logs - ${issues}= Evaluate $scan_results.get('issues', []) - FOR ${issue} IN @{issues} - ${severity}= Evaluate $issue.get('severity', ${LOG_SEVERITY_THRESHOLD}) - IF ${severity} <= ${LOG_SEVERITY_THRESHOLD} - # Use the full issue details directly without summarization to preserve all log content - ${issue_details_raw}= Evaluate $issue.get("details", "") - ${issue_details_str}= Convert To String ${issue_details_raw} - # Use timestamp from log scan results if available, otherwise extract from details - ${issue_timestamp}= Evaluate $issue.get('observed_at', '') - - RW.Core.Add Issue - ... severity=${severity} - ... expected=Application logs should be free of critical errors for statefulset `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` - ... actual=${issue.get('title', 'Log pattern issue detected')} in statefulset `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` - ... title=${issue.get('title', 'Log Pattern Issue')} in StatefulSet `${STATEFULSET_NAME}` - ... reproduce_hint=Check application logs for statefulset `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` - ... details=${issue_details_str} - ... next_steps=${issue.get('next_steps', 'Review application logs and resolve underlying issues')} - ... 
observed_at=${issue_timestamp} - END - END - - ${issues_count}= Get Length ${issues} - - # Format scan results for better display - ${formatted_results}= RW.K8sLog.Format Scan Results For Display scan_results=${scan_results} - - RW.Core.Add Pre To Report **Log Analysis Summary for StatefulSet `${STATEFULSET_NAME}`**\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} - - RW.K8sLog.Cleanup Temp Files - -Detect Log Anomalies for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Analyzes logs for repeating patterns, anomalous behavior, and unusual log volume that may indicate underlying issues. - [Tags] - ... logs - ... anomalies - ... patterns - ... volume - ... statefulset - ... ${STATEFULSET_NAME} - ... access:read-only - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=statefulset - ... workload_name=${STATEFULSET_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${LOG_AGE} - - ${anomaly_results}= RW.K8sLog.Analyze Log Anomalies - ... log_dir=${log_dir} - ... workload_type=statefulset - ... workload_name=${STATEFULSET_NAME} - ... namespace=${NAMESPACE} - - # Process anomaly issues - ${anomaly_issues}= Evaluate $anomaly_results.get('issues', []) - IF len($anomaly_issues) > 0 - FOR ${issue} IN @{anomaly_issues} - ${summarized_details}= RW.K8sLog.Summarize Log Issues issue_details=${issue["details"]} - ${next_steps_text}= Catenate SEPARATOR=\n @{issue["next_steps"]} - - RW.Core.Add Issue - ... severity=${issue["severity"]} - ... expected=No log anomalies should be present in StatefulSet `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` - ... actual=Log anomalies detected in StatefulSet `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` - ... title=${issue["title"]} - ... 
reproduce_hint=Use RW.K8sLog.Analyze Log Anomalies keyword to reproduce this analysis - ... details=${summarized_details} - ... next_steps=${next_steps_text} - ... observed_at=${issue["observed_at"]} - END - END - - # Add summary to report - ${anomaly_summary}= Catenate SEPARATOR=\n @{anomaly_results["summary"]} - RW.Core.Add Pre To Report Log Anomaly Analysis for StatefulSet ${STATEFULSET_NAME}:\n${anomaly_summary} - - RW.K8sLog.Cleanup Temp Files Check Liveness Probe Configuration for StatefulSet `${STATEFULSET_NAME}` [Documentation] Validates if a Liveness probe has possible misconfigurations