From 97ffeb385243a086d197d046d9a724e93e03bab0 Mon Sep 17 00:00:00 2001 From: Akshay Prabhakant Date: Wed, 14 Jan 2026 13:37:28 +0530 Subject: [PATCH 01/10] RWENGG-1350: initial writeup of the applog health codebundle --- .../generation-rules/k8s-applog-health.yaml | 21 ++ .../templates/k8s-applog-health-sli.yaml | 59 ++++ .../templates/k8s-applog-health-slx.yaml | 25 ++ .../templates/k8s-applog-health-taskset.yaml | 44 +++ codebundles/k8s-applog-health/README.md | 0 codebundles/k8s-applog-health/runbook.robot | 308 ++++++++++++++++++ .../k8s-applog-health/runbook_patterns.json | 153 +++++++++ codebundles/k8s-applog-health/sli.robot | 296 +++++++++++++++++ .../sli_critical_patterns.json | 74 +++++ 9 files changed, 980 insertions(+) create mode 100644 codebundles/k8s-applog-health/.runwhen/generation-rules/k8s-applog-health.yaml create mode 100755 codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml create mode 100644 codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml create mode 100644 codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml create mode 100644 codebundles/k8s-applog-health/README.md create mode 100755 codebundles/k8s-applog-health/runbook.robot create mode 100644 codebundles/k8s-applog-health/runbook_patterns.json create mode 100755 codebundles/k8s-applog-health/sli.robot create mode 100755 codebundles/k8s-applog-health/sli_critical_patterns.json diff --git a/codebundles/k8s-applog-health/.runwhen/generation-rules/k8s-applog-health.yaml b/codebundles/k8s-applog-health/.runwhen/generation-rules/k8s-applog-health.yaml new file mode 100644 index 000000000..250026612 --- /dev/null +++ b/codebundles/k8s-applog-health/.runwhen/generation-rules/k8s-applog-health.yaml @@ -0,0 +1,21 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + generationRules: + - resourceTypes: + - deployment + matchRules: + - type: pattern + pattern: ".+" + properties: [name] + mode: substring + slxs: 
+ - baseName: applog-health + levelOfDetail: detailed + qualifiers: ["resource", "namespace", "cluster"] + baseTemplateName: k8s-applog-health + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: k8s-applog-health-taskset.yaml diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml new file mode 100755 index 000000000..8576f2825 --- /dev/null +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml @@ -0,0 +1,59 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} + runwhen.com/sli: "true" +spec: + displayUnitsLong: OK + displayUnitsShort: ok + locations: + - {{ default_location }} + codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-applog-health/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 600 + description: Measures the health of the application logs for the {{match_resource.resource.metadata.name}} deployment. 
+ configProvided: + - name: NAMESPACE + value: {{match_resource.resource.metadata.namespace}} + - name: CONTEXT + value: {{context}} + - name: KUBERNETES_DISTRIBUTION_BINARY + value: {{custom.kubernetes_distribution_binary | default("kubectl")}} + - name: DEPLOYMENT_NAME + value: {{match_resource.resource.metadata.name}} + - name: CONTAINER_RESTART_AGE + value: "10m" + - name: CONTAINER_RESTART_THRESHOLD + value: "2" + - name: EVENT_AGE + value: "10m" + - name: EVENT_THRESHOLD + value: "2" + - name: CHECK_SERVICE_ENDPOINTS + value: "true" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{custom.kubeconfig_secret_name}} + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m \ No newline at end of file diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml new file mode 100644 index 000000000..7ad748284 --- /dev/null +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml @@ -0,0 +1,25 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/deploy.svg + alias: {{match_resource.resource.metadata.name}} Application Log Health + asMeasuredBy: The presence of application-level errors/issues/stacktraces in the application logs indicating runtime errors or exceptions in {{match_resource.resource.metadata.name}}. 
+ configProvided: + - name: OBJECT_NAME + value: {{match_resource.resource.metadata.name}} + owners: + - {{workspace.owner_email}} + statement: Application logs for {{match_resource.resource.metadata.name}} should be free of critical errors/issues/stacktraces indicating runtime errors or exceptions. + additionalContext: + {% include "kubernetes-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + tags: + {% include "kubernetes-tags.yaml" ignore missing %} + - name: access + value: read-only \ No newline at end of file diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml new file mode 100644 index 000000000..7f1d9b79e --- /dev/null +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml @@ -0,0 +1,44 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-applog-health/runbook.robot + configProvided: + - name: NAMESPACE + value: {{match_resource.resource.metadata.namespace}} + - name: CONTEXT + value: {{context}} + - name: KUBERNETES_DISTRIBUTION_BINARY + value: {{custom.kubernetes_distribution_binary}} + - name: DEPLOYMENT_NAME + value: {{match_resource.resource.metadata.name}} + - name: CONTAINER_RESTART_AGE + value: "30m" + - name: CONTAINER_RESTART_THRESHOLD + value: "4" + - name: LOG_AGE + value: "10m" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: 
{{custom.kubeconfig_secret_name}} + {% endif %} \ No newline at end of file diff --git a/codebundles/k8s-applog-health/README.md b/codebundles/k8s-applog-health/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/codebundles/k8s-applog-health/runbook.robot b/codebundles/k8s-applog-health/runbook.robot new file mode 100755 index 000000000..88a57a05e --- /dev/null +++ b/codebundles/k8s-applog-health/runbook.robot @@ -0,0 +1,308 @@ +*** Settings *** +Documentation Triages issues related to a deployment and its replicas. +Metadata Author stewartshea +Metadata Display Name Kubernetes Deployment Triage +Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift + +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform +Library RW.NextSteps +Library RW.K8sHelper +Library RW.K8sLog + +Library OperatingSystem +Library String +Library Collections +Library DateTime + +Suite Setup Suite Initialization + + +*** Keywords *** +Suite Initialization + ${kubeconfig}= RW.Core.Import Secret + ... kubeconfig + ... type=string + ... description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). + ... pattern=\w* + ... example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ + ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY + ... type=string + ... description=Which binary to use for Kubernetes CLI commands. + ... pattern=\w* + ... enum=[kubectl,oc] + ... example=kubectl + ... default=kubectl + ${CONTEXT}= RW.Core.Import User Variable CONTEXT + ... type=string + ... description=Which Kubernetes context to operate within. + ... pattern=\w* + ... example=my-main-cluster + ${NAMESPACE}= RW.Core.Import User Variable NAMESPACE + ... type=string + ... description=The name of the Kubernetes namespace to scope actions and searching to. + ... pattern=\w* + ... 
example=otel-demo + ${DEPLOYMENT_NAME}= RW.Core.Import User Variable DEPLOYMENT_NAME + ... type=string + ... description=The name of the deployment to triage. + ... pattern=\w* + ... example=otel-demo-frontend + ${LOG_LINES}= RW.Core.Import User Variable LOG_LINES + ... type=string + ... description=The number of log lines to fetch from the pods when inspecting logs. + ... pattern=\d+ + ... example=100 + ... default=100 + ${LOG_AGE}= RW.Core.Import User Variable LOG_AGE + ... type=string + ... description=The age of logs to fetch from pods, used for log analysis tasks. + ... pattern=\w* + ... example=10m + ... default=10m + + ${LOG_ANALYSIS_DEPTH}= RW.Core.Import User Variable LOG_ANALYSIS_DEPTH + ... type=string + ... description=The depth of log analysis to perform - basic, standard, or comprehensive. + ... pattern=\w* + ... enum=[basic,standard,comprehensive] + ... example=standard + ... default=standard + ${LOG_SEVERITY_THRESHOLD}= RW.Core.Import User Variable LOG_SEVERITY_THRESHOLD + ... type=string + ... description=The minimum severity level for creating issues (1=critical, 2=high, 3=medium, 4=low, 5=info). + ... pattern=\d+ + ... example=3 + ... default=3 + ${LOG_PATTERN_CATEGORIES_STR}= RW.Core.Import User Variable LOG_PATTERN_CATEGORIES + ... type=string + ... description=Comma-separated list of log pattern categories to scan for. + ... pattern=.* + ... example=GenericError,AppFailure,Connection + ... default=GenericError,AppFailure,Connection,Timeout,Auth,Exceptions,Resource,HealthyRecovery + ${ANOMALY_THRESHOLD}= RW.Core.Import User Variable ANOMALY_THRESHOLD + ... type=string + ... description=The threshold for detecting event anomalies based on events per minute. + ... pattern=\d+ + ... example=5 + ... default=5 + ${LOGS_ERROR_PATTERN}= RW.Core.Import User Variable LOGS_ERROR_PATTERN + ... type=string + ... description=The error pattern to use when grep-ing logs. + ... pattern=\w* + ... example=(Error: 13|Error: 14) + ... 
default=error|ERROR + ${LOGS_EXCLUDE_PATTERN}= RW.Core.Import User Variable LOGS_EXCLUDE_PATTERN + ... type=string + ... description=Pattern used to exclude entries from log analysis when searching for errors. Use regex patterns to filter out false positives like JSON structures. + ... pattern=.* + ... example="errors":\s*\[\]|"warnings":\s*\[\] + ... default="errors":\\s*\\[\\]|\\bINFO\\b|\\bDEBUG\\b|\\bTRACE\\b|\\bSTART\\s*-\\s*|\\bSTART\\s*method\\b + ${LOG_SCAN_TIMEOUT}= RW.Core.Import User Variable LOG_SCAN_TIMEOUT + ... type=string + ... description=Timeout in seconds for log scanning operations. Increase this value if log scanning times out on large log files. + ... pattern=\d+ + ... example=300 + ... default=300 + ${EXCLUDED_CONTAINER_NAMES}= RW.Core.Import User Variable EXCLUDED_CONTAINER_NAMES + ... type=string + ... description=Comma-separated list of container names to exclude from log analysis (e.g., linkerd-proxy, istio-proxy, vault-agent). + ... pattern=.* + ... example=linkerd-proxy,istio-proxy,vault-agent + ... default=linkerd-proxy,istio-proxy,vault-agent + + ${CONTAINER_RESTART_AGE}= RW.Core.Import User Variable CONTAINER_RESTART_AGE + ... type=string + ... description=The time window (in (h) hours or (m) minutes) to search for container restarts. Only containers that restarted within this time period will be reported. + ... pattern=\w* + ... example=10m + ... default=10m + ${CONTAINER_RESTART_THRESHOLD}= RW.Core.Import User Variable CONTAINER_RESTART_THRESHOLD + ... type=string + ... description=The minimum number of restarts required to trigger an issue. Containers with restart counts below this threshold will be ignored. + ... pattern=\d+ + ... example=1 + ... 
default=1 + # Convert comma-separated strings to lists + @{LOG_PATTERN_CATEGORIES}= Split String ${LOG_PATTERN_CATEGORIES_STR} , + @{EXCLUDED_CONTAINERS_RAW}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + @{EXCLUDED_CONTAINERS}= Create List + FOR ${container} IN @{EXCLUDED_CONTAINERS_RAW} + ${trimmed_container}= Strip String ${container} + Append To List ${EXCLUDED_CONTAINERS} ${trimmed_container} + END + + Set Suite Variable ${kubeconfig} ${kubeconfig} + Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} + Set Suite Variable ${CONTEXT} + Set Suite Variable ${NAMESPACE} + Set Suite Variable ${DEPLOYMENT_NAME} + Set Suite Variable ${LOG_LINES} + Set Suite Variable ${LOG_AGE} + + Set Suite Variable ${LOG_ANALYSIS_DEPTH} + Set Suite Variable ${LOG_SEVERITY_THRESHOLD} + Set Suite Variable ${LOG_PATTERN_CATEGORIES_STR} + Set Suite Variable @{LOG_PATTERN_CATEGORIES} + Set Suite Variable ${ANOMALY_THRESHOLD} + Set Suite Variable ${LOGS_ERROR_PATTERN} + Set Suite Variable ${LOGS_EXCLUDE_PATTERN} + Set Suite Variable ${LOG_SCAN_TIMEOUT} + Set Suite Variable ${EXCLUDED_CONTAINER_NAMES} + Set Suite Variable @{EXCLUDED_CONTAINERS} + + Set Suite Variable ${CONTAINER_RESTART_AGE} + Set Suite Variable ${CONTAINER_RESTART_THRESHOLD} + # Construct environment dictionary safely to handle special characters in regex patterns + &{env_dict}= Create Dictionary + ... KUBECONFIG=${kubeconfig.key} + ... KUBERNETES_DISTRIBUTION_BINARY=${KUBERNETES_DISTRIBUTION_BINARY} + ... CONTEXT=${CONTEXT} + ... NAMESPACE=${NAMESPACE} + ... LOGS_ERROR_PATTERN=${LOGS_ERROR_PATTERN} + ... LOGS_EXCLUDE_PATTERN=${LOGS_EXCLUDE_PATTERN} + ... ANOMALY_THRESHOLD=${ANOMALY_THRESHOLD} + ... DEPLOYMENT_NAME=${DEPLOYMENT_NAME} + ... CONTAINER_RESTART_AGE=${CONTAINER_RESTART_AGE} + ... CONTAINER_RESTART_THRESHOLD=${CONTAINER_RESTART_THRESHOLD} + ... 
LOG_SCAN_TIMEOUT=${LOG_SCAN_TIMEOUT} + Set Suite Variable ${env} ${env_dict} + + # Check if deployment is scaled to 0 and handle appropriately + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=30 + + TRY + ${scale_status}= Evaluate json.loads(r'''${scale_check.stdout}''') if r'''${scale_check.stdout}'''.strip() else {} json + ${spec_replicas}= Evaluate $scale_status.get('spec_replicas', 1) + + IF ${spec_replicas} == 0 + ${issue_timestamp}= DateTime.Get Current Date + RW.Core.Add Issue + ... severity=4 + ... expected=Deployment `${DEPLOYMENT_NAME}` operational status documented + ... actual=Deployment `${DEPLOYMENT_NAME}` is intentionally scaled to zero replicas + ... title=Deployment `${DEPLOYMENT_NAME}` is Scaled Down (Informational) + ... reproduce_hint=kubectl get deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o yaml + ... details=Deployment `${DEPLOYMENT_NAME}` is currently scaled to 0 replicas (spec.replicas=0). This is an intentional configuration and not an error. All pod-related healthchecks have been skipped for efficiency. If the deployment should be running, scale it up using:\nkubectl scale deployment/${DEPLOYMENT_NAME} --replicas= --context ${CONTEXT} -n ${NAMESPACE} + ... next_steps=This is informational only. If the deployment should be running, scale it up. + ... 
observed_at=${issue_timestamp} + + RW.Core.Add Pre To Report **ā„¹ļø Deployment `${DEPLOYMENT_NAME}` is scaled to 0 replicas - Skipping pod-related checks**\n**Available Condition:** ${scale_status.get('available_condition', 'Unknown')} + + Set Suite Variable ${SKIP_POD_CHECKS} ${True} + ELSE + Set Suite Variable ${SKIP_POD_CHECKS} ${False} + END + + EXCEPT + Log Warning: Failed to check deployment scale, continuing with normal checks + Set Suite Variable ${SKIP_POD_CHECKS} ${False} + END + + +*** Tasks *** + +Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + [Documentation] Fetches and analyzes logs from the deployment pods for errors, connection issues, and other patterns that indicate application health problems. Note: Warning messages about missing log files for excluded containers (like linkerd-proxy, istio-proxy) are expected and harmless. + [Tags] + ... logs + ... application + ... errors + ... stacktrace + ... patterns + ... health + ... deployment + ... access:read-only + # Skip pod-related checks if deployment is scaled to 0 + IF not ${SKIP_POD_CHECKS} + # Temporarily suppress log warnings for excluded containers (they're expected) + TRY + ${log_dir}= RW.K8sLog.Fetch Workload Logs + ... workload_type=deployment + ... workload_name=${DEPLOYMENT_NAME} + ... namespace=${NAMESPACE} + ... context=${CONTEXT} + ... kubeconfig=${kubeconfig} + ... log_age=${LOG_AGE} + ... excluded_containers=${EXCLUDED_CONTAINERS} + EXCEPT AS ${log_error} + # If log fetching fails completely, log the error but continue + Log Warning: Log fetching encountered an error: ${log_error} + # Set empty log directory to continue with other checks + ${log_dir}= Set Variable ${EMPTY} + END + + # Only scan logs if we have a valid log directory + IF '''${log_dir}''' != '''${EMPTY}''' + ${scan_results}= RW.K8sLog.Scan Logs For Issues + ... log_dir=${log_dir} + ... workload_type=deployment + ... workload_name=${DEPLOYMENT_NAME} + ... 
namespace=${NAMESPACE} + ... categories=@{LOG_PATTERN_CATEGORIES} + ... custom_patterns_file=runbook_patterns.json + ... excluded_containers=${EXCLUDED_CONTAINERS} + ELSE + # Create empty scan results if no logs were fetched + ${scan_results}= Evaluate {"issues": [], "summary": ["No logs available for analysis"]} + END + + # Post-process results to filter out patterns matching LOGS_EXCLUDE_PATTERN + TRY + IF $LOGS_EXCLUDE_PATTERN != "" + ${filtered_issues}= Evaluate [issue for issue in $scan_results.get('issues', []) if not __import__('re').search('${LOGS_EXCLUDE_PATTERN}', issue.get('details', ''), __import__('re').IGNORECASE)] modules=re + ${filtered_results}= Evaluate {**$scan_results, 'issues': $filtered_issues} + Set Test Variable ${scan_results} ${filtered_results} + END + EXCEPT + Log Warning: Failed to apply LOGS_EXCLUDE_PATTERN filter, using unfiltered results + END + + ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} + + # Process each issue found in the logs + ${issues}= Evaluate $scan_results.get('issues', []) + ${issues_count}= Get Length ${issues} + + # print the contents from log_dir into the report + RW.Core.Add Pre To Report **Log Contents:**\n${log_dir} + + IF ${issues_count} == 0 + # create a dummy issue with a keyword argument set to a value depicting no issues found + RW.Core.Add Pre To Report **No issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`** + + # create a dummy issue with a keyword argument set to a value depicting no issues found + RW.Core.Add Issue + ... severity=4 + ... expected=Application logs should be free of critical errors for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` + ... actual=No issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` + ... title=No issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` + ... 
reproduce_hint=Check application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` + ... details=No issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` + ... next_steps=No processing required + ... observed_at=${issue_timestamp} + ... next_action=noIssuesFound + ELSE + # set issue_timestamp to the observed_at value from the first issue + ${issue_timestamp}= Evaluate $issues[0].get('observed_at', '') + + # create a dummy issue with a keyword argument set to a value depicting issues found + RW.Core.Add Issue + ... severity=4 + ... expected=Application logs should be free of critical errors for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` + ... actual=Issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` + ... title=Issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` + ... reproduce_hint=Check application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` + ... details=Issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` + ... next_steps=Process the issues found in the application logs + ... observed_at=${issue_timestamp} + ... 
next_action=processApplogIssues + END + # RW.K8sLog.Cleanup Temp Files + END \ No newline at end of file diff --git a/codebundles/k8s-applog-health/runbook_patterns.json b/codebundles/k8s-applog-health/runbook_patterns.json new file mode 100644 index 000000000..12c9ac4d4 --- /dev/null +++ b/codebundles/k8s-applog-health/runbook_patterns.json @@ -0,0 +1,153 @@ +{ + "critical_patterns": { + "GenericError": { + "description": "Generic error patterns that indicate application failures", + "patterns": [ + "exception|Exception|EXCEPTION", + "fatal|Fatal|FATAL", + "panic|Panic|PANIC", + "crash|Crash|CRASH", + "failed|Failed|FAILED", + "failure|Failure|FAILURE" + ], + "severity": 1 + }, + "AppFailure": { + "description": "Application-specific failure patterns", + "patterns": [ + "application.*failed", + "service.*unavailable", + "connection.*refused", + "timeout.*error", + "out.*of.*memory", + "disk.*full", + "permission.*denied", + "authentication.*failed", + "authorization.*failed" + ], + "severity": 1 + }, + "StackTrace": { + "description": "Stack trace patterns indicating application crashes", + "patterns": [ + "stack.*trace", + "at\\s+\\w+\\.\\w+", + "Exception.*in thread", + "java\\.lang\\.", + "python.*traceback", + "goroutine.*panic", + "panic:", + "fatal.*error" + ], + "severity": 1 + }, + "Connection": { + "description": "Connection and network related issues", + "patterns": [ + "connection.*reset", + "connection.*timeout", + "network.*unreachable", + "host.*unreachable", + "connection.*dropped", + "socket.*error", + "dns.*resolution.*failed" + ], + "severity": 2 + }, + "Timeout": { + "description": "Timeout related issues", + "patterns": [ + "request.*timeout", + "operation.*timeout", + "deadline.*exceeded", + "context.*timeout", + "read.*timeout", + "write.*timeout" + ], + "severity": 2 + }, + "Auth": { + "description": "Authentication and authorization issues", + "patterns": [ + "unauthorized", + "authentication.*error", + "invalid.*credentials", + 
"access.*denied", + "forbidden", + "token.*expired", + "certificate.*error" + ], + "severity": 2 + }, + "Exceptions": { + "description": "Various exception patterns", + "patterns": [ + "NullPointerException", + "IndexOutOfBoundsException", + "IllegalArgumentException", + "RuntimeException", + "SQLException", + "IOException" + ], + "severity": 2 + }, + "Resource": { + "description": "Resource constraint issues", + "patterns": [ + "resource.*exhausted", + "memory.*leak", + "cpu.*throttled", + "disk.*space.*low", + "quota.*exceeded", + "rate.*limit.*exceeded" + ], + "severity": 2 + }, + "HealthyRecovery": { + "description": "Recovery and healthy state patterns (lower severity)", + "patterns": [ + "recovered.*from.*error", + "connection.*restored", + "service.*back.*online", + "retry.*successful", + "health.*check.*passed" + ], + "severity": 4 + } + }, + "exclude_patterns": [ + "\\bINFO\\b", + "\\bDEBUG\\b", + "\\bTRACE\\b", + "health.*check", + "heartbeat", + "metrics", + "monitoring", + "\\],INFO\\s*,", + "INFO\\s*,c\\.", + "START\\s*-\\s*.*Impl\\.", + "BusinessService.*\\(\\)", + "RestrictionsApiDelegateImpl", + "ReadBlockedResourceListBusinessServiceImpl", + "LocationReplicaStrategy", + "\\bSTART\\s*method\\b", + "Calling\\s*BusinessService", + "BusinessServiceImpl", + "ApiDelegateImpl", + "successful.*startup", + "application.*started", + "server.*started", + "listening.*on.*port", + "configuration.*loaded", + "database.*connection.*established", + "linkerd.*INFO.*Connection closed.*error=read header from client timeout", + "linkerd_app_core::serve.*Connection closed.*error=read header from client timeout", + "TelemetryPipeline.*In the last \\d+ minutes.*operation has failed.*Sending telemetry to the ingestion service", + "adjustmentReason=\\w*EXCEPTION\\w*" + ], + "config": { + "max_matches_per_pattern": 10, + "case_sensitive": false, + "timeout_seconds": 30 + } +} \ No newline at end of file diff --git a/codebundles/k8s-applog-health/sli.robot 
b/codebundles/k8s-applog-health/sli.robot new file mode 100755 index 000000000..496903cd8 --- /dev/null +++ b/codebundles/k8s-applog-health/sli.robot @@ -0,0 +1,296 @@ +*** Settings *** +Metadata Author stewartshea +Documentation This SLI uses kubectl to score deployment health. Produces a value between 0 (completely failing the test) and 1 (fully passing the test). Looks for container restarts, critical log errors, pods not ready, deployment status, and recent events. +Metadata Display Name Kubernetes Deployment Healthcheck +Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift +Suite Setup Suite Initialization +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform +Library RW.K8sLog + +Library OperatingSystem +Library String +Library Collections + +*** Keywords *** +Suite Initialization + ${kubeconfig}= RW.Core.Import Secret kubeconfig + ... type=string + ... description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). + ... pattern=\w* + ... example=For examples, start here https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ + ${NAMESPACE}= RW.Core.Import User Variable NAMESPACE + ... type=string + ... description=The name of the Kubernetes namespace to scope actions and searching to. + ... pattern=\w* + ... example=my-namespace + ${CONTEXT}= RW.Core.Import User Variable CONTEXT + ... type=string + ... description=Which Kubernetes context to operate within. + ... pattern=\w* + ... example=my-main-cluster + ${DEPLOYMENT_NAME}= RW.Core.Import User Variable DEPLOYMENT_NAME + ... type=string + ... description=The name of the Kubernetes deployment to check. + ... pattern=\w* + ... example=my-deployment + ${CONTAINER_RESTART_AGE}= RW.Core.Import User Variable CONTAINER_RESTART_AGE + ... type=string + ... description=The time window in minutes to search for container restarts. + ... pattern=((\d+?)m)? + ... example=10m + ... 
default=10m + ${CONTAINER_RESTART_THRESHOLD}= RW.Core.Import User Variable CONTAINER_RESTART_THRESHOLD + ... type=string + ... description=The maximum total container restarts to be still considered healthy. + ... pattern=^\d+$ + ... example=1 + ... default=1 + ${RW_LOOKBACK_WINDOW}= RW.Core.Import Platform Variable RW_LOOKBACK_WINDOW + ${RW_LOOKBACK_WINDOW}= RW.Core.Normalize Lookback Window ${RW_LOOKBACK_WINDOW} 2 + ${MAX_LOG_LINES}= RW.Core.Import User Variable MAX_LOG_LINES + ... type=string + ... description=Maximum number of log lines to fetch per container to prevent API overload. + ... pattern=^\d+$ + ... example=100 + ... default=100 + ${MAX_LOG_BYTES}= RW.Core.Import User Variable MAX_LOG_BYTES + ... type=string + ... description=Maximum log size in bytes to fetch per container to prevent API overload. + ... pattern=^\d+$ + ... example=256000 + ... default=256000 + ${EVENT_AGE}= RW.Core.Import User Variable EVENT_AGE + ... type=string + ... description=The time window to check for recent warning events. + ... pattern=((\d+?)m)? + ... example=10m + ... default=10m + ${EVENT_THRESHOLD}= RW.Core.Import User Variable EVENT_THRESHOLD + ... type=string + ... description=The maximum number of critical warning events allowed before scoring is reduced. + ... pattern=^\d+$ + ... example=2 + ... default=2 + ${CHECK_SERVICE_ENDPOINTS}= RW.Core.Import User Variable CHECK_SERVICE_ENDPOINTS + ... type=string + ... description=Whether to check service endpoint health. Set to 'false' if deployment doesn't have associated services. + ... enum=[true,false] + ... example=true + ... default=true + ${LOGS_EXCLUDE_PATTERN}= RW.Core.Import User Variable LOGS_EXCLUDE_PATTERN + ... type=string + ... description=Pattern used to exclude entries from log analysis when searching for errors. Use regex patterns to filter out false positives like JSON structures. + ... pattern=.* + ... example="errors":\s*\[\]|"warnings":\s*\[\] + ... 
default="errors":\s*\[\]|\\bINFO\\b|\\bDEBUG\\b|\\bTRACE\\b|\\bSTART\\s*-\\s*|\\bSTART\\s*method\\b + ${EXCLUDED_CONTAINER_NAMES}= RW.Core.Import User Variable EXCLUDED_CONTAINER_NAMES + ... type=string + ... description=Comma-separated list of container names to exclude from log analysis (e.g., linkerd-proxy, istio-proxy, vault-agent). + ... pattern=.* + ... example=linkerd-proxy,istio-proxy,vault-agent + ... default=linkerd-proxy,istio-proxy,vault-agent + + ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY + ... type=string + ... description=Which binary to use for Kubernetes CLI commands. + ... enum=[kubectl,oc] + ... example=kubectl + ... default=kubectl + Set Suite Variable ${kubeconfig} ${kubeconfig} + Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} + Set Suite Variable ${CONTAINER_RESTART_AGE} ${CONTAINER_RESTART_AGE} + Set Suite Variable ${CONTAINER_RESTART_THRESHOLD} ${CONTAINER_RESTART_THRESHOLD} + Set Suite Variable ${RW_LOOKBACK_WINDOW} ${RW_LOOKBACK_WINDOW} + Set Suite Variable ${MAX_LOG_LINES} ${MAX_LOG_LINES} + Set Suite Variable ${MAX_LOG_BYTES} ${MAX_LOG_BYTES} + Set Suite Variable ${EVENT_AGE} ${EVENT_AGE} + Set Suite Variable ${EVENT_THRESHOLD} ${EVENT_THRESHOLD} + Set Suite Variable ${CHECK_SERVICE_ENDPOINTS} ${CHECK_SERVICE_ENDPOINTS} + Set Suite Variable ${LOGS_EXCLUDE_PATTERN} ${LOGS_EXCLUDE_PATTERN} + Set Suite Variable ${EXCLUDED_CONTAINER_NAMES} ${EXCLUDED_CONTAINER_NAMES} + + # Convert comma-separated string to list + @{EXCLUDED_CONTAINERS_RAW}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + @{EXCLUDED_CONTAINERS}= Create List + FOR ${container} IN @{EXCLUDED_CONTAINERS_RAW} + ${trimmed_container}= Strip String ${container} + Append To List ${EXCLUDED_CONTAINERS} ${trimmed_container} + END + Set Suite Variable @{EXCLUDED_CONTAINERS} + + Set Suite Variable ${CONTEXT} ${CONTEXT} + Set Suite 
Variable ${NAMESPACE} ${NAMESPACE} + Set Suite Variable ${DEPLOYMENT_NAME} ${DEPLOYMENT_NAME} + Set Suite Variable ${env} {"KUBECONFIG":"./${kubeconfig.key}"} + + # Initialize score variables + Set Suite Variable ${container_restart_score} 0 + Set Suite Variable ${log_health_score} 0 + Set Suite Variable ${pods_notready_score} 0 + Set Suite Variable ${replica_score} 0 + Set Suite Variable ${events_score} 0 + + + # Check if deployment is scaled to 0 and handle appropriately + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown"), last_scale_time: (.metadata.annotations."deployment.kubernetes.io/last-applied-configuration" // "N/A")}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=30 + + TRY + ${scale_status}= Evaluate json.loads(r'''${scale_check.stdout}''') if r'''${scale_check.stdout}'''.strip() else {} json + ${spec_replicas}= Evaluate $scale_status.get('spec_replicas', 1) + + # Try to determine when deployment was scaled down by checking recent events and replica set history + ${scale_down_info}= Get Deployment Scale Down Timestamp ${spec_replicas} + + IF ${spec_replicas} == 0 + Log Deployment ${DEPLOYMENT_NAME} is scaled to 0 replicas - returning special health score + Log Scale down detected at: ${scale_down_info} + + # For scaled-down deployments, return a score of 0.5 to indicate "intentionally down" vs "broken" + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${True} + Set Suite Variable ${SCALED_DOWN_INFO} ${scale_down_info} + ELSE + Log Deployment ${DEPLOYMENT_NAME} has ${spec_replicas} desired replicas - proceeding with health checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + END + + EXCEPT + Log Warning: Failed to check deployment 
scale, continuing with normal health checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + END + +Get Deployment Scale Down Timestamp + [Arguments] ${spec_replicas} + [Documentation] Attempts to determine when a deployment was scaled down by examining recent events + ${scale_down_info}= Set Variable Unknown + + IF ${spec_replicas} == 0 + TRY + # Check recent scaling events to find when it was scaled to 0 + ${scaling_events}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --context ${CONTEXT} -n ${NAMESPACE} --sort-by='.lastTimestamp' -o json | jq -r '.items[] | select(.reason == "ScalingReplicaSet" and (.message | contains("${DEPLOYMENT_NAME}")) and (.message | contains("to 0"))) | {timestamp: .lastTimestamp, message: .message}' | jq -s 'sort_by(.timestamp) | reverse | .[0] // empty' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=15 + + IF '''${scaling_events.stdout}''' != '' + ${event_data}= Evaluate json.loads(r'''${scaling_events.stdout}''') if r'''${scaling_events.stdout}'''.strip() else {} json + ${timestamp}= Evaluate $event_data.get('timestamp', 'Unknown') + ${message}= Evaluate $event_data.get('message', 'Unknown') + ${scale_down_info}= Set Variable ${timestamp} (${message}) + Log Found scale-down event: ${scale_down_info} + ELSE + # Try checking replicaset history as fallback + ${rs_history}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get replicasets --context ${CONTEXT} -n ${NAMESPACE} -l app=${DEPLOYMENT_NAME} -o json | jq -r '.items[] | select(.spec.replicas == 0) | {creation_time: .metadata.creationTimestamp, name: .metadata.name}' | jq -s 'sort_by(.creation_time) | reverse | .[0] // empty' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... 
timeout_seconds=15 + + IF '''${rs_history.stdout}''' != '' + ${rs_data}= Evaluate json.loads(r'''${rs_history.stdout}''') if r'''${rs_history.stdout}'''.strip() else {} json + ${rs_time}= Evaluate $rs_data.get('creation_time', 'Unknown') + ${scale_down_info}= Set Variable Likely around ${rs_time} (based on ReplicaSet history) + Log Estimated scale-down time from ReplicaSet: ${scale_down_info} + ELSE + ${scale_down_info}= Set Variable Unable to determine - no recent scaling events found + Log Could not determine when deployment was scaled down + END + END + EXCEPT + Log Warning: Failed to determine scale-down timestamp + ${scale_down_info}= Set Variable Failed to determine scale-down time + END + END + + RETURN ${scale_down_info} + +*** Tasks *** +Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}` + [Documentation] Fetches logs and checks for critical error patterns that indicate application failures. + [Tags] logs errors critical patterns + + # Skip if deployment is scaled down + IF ${SKIP_HEALTH_CHECKS} + Log Skipping log analysis - deployment is scaled to 0 replicas + ${log_health_score}= Set Variable 1 # Perfect score for scaled deployment + Set Suite Variable ${log_health_score} + RW.Core.Push Metric ${log_health_score} sub_name=log_errors + ELSE + ${log_dir}= RW.K8sLog.Fetch Workload Logs + ... workload_type=deployment + ... workload_name=${DEPLOYMENT_NAME} + ... namespace=${NAMESPACE} + ... context=${CONTEXT} + ... kubeconfig=${kubeconfig} + ... log_age=${RW_LOOKBACK_WINDOW} + ... max_log_lines=${MAX_LOG_LINES} + ... max_log_bytes=${MAX_LOG_BYTES} + ... excluded_containers=${EXCLUDED_CONTAINERS} + + # Use only critical error patterns for fast SLI checks + @{critical_categories}= Create List GenericError AppFailure + + ${scan_results}= RW.K8sLog.Scan Logs For Issues + ... log_dir=${log_dir} + ... workload_type=deployment + ... workload_name=${DEPLOYMENT_NAME} + ... namespace=${NAMESPACE} + ... categories=${critical_categories} + ... 
custom_patterns_file=sli_critical_patterns.json + ... excluded_containers=${EXCLUDED_CONTAINERS} + + # Post-process results to filter out patterns matching LOGS_EXCLUDE_PATTERN + TRY + IF $LOGS_EXCLUDE_PATTERN != "" + ${filtered_issues}= Evaluate [issue for issue in $scan_results.get('issues', []) if not __import__('re').search('${LOGS_EXCLUDE_PATTERN}', issue.get('details', ''), __import__('re').IGNORECASE)] modules=re + ${filtered_results}= Evaluate {**$scan_results, 'issues': $filtered_issues} + Set Test Variable ${scan_results} ${filtered_results} + END + EXCEPT + Log Warning: Failed to apply LOGS_EXCLUDE_PATTERN filter, using unfiltered results + END + + ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} + + # Store details for final score calculation logging + TRY + ${issues}= Evaluate $scan_results.get('issues', []) + ${issue_count}= Get Length ${issues} + Set Suite Variable ${log_health_details} ${issue_count} issues found + EXCEPT + Set Suite Variable ${log_health_details} analysis completed + END + + Set Suite Variable ${log_health_score} + RW.K8sLog.Cleanup Temp Files + RW.Core.Push Metric ${log_health_score} sub_name=log_errors + END + +Generate Deployment Health Score for `${DEPLOYMENT_NAME}` + [Documentation] Generates the final applog health score and report details + [Tags] score health applog + + IF ${SKIP_HEALTH_CHECKS} + # For scaled-down deployments, return perfect score to indicate "intentionally down" vs "broken" + # We distinguish scaled-down vs broken deployments through the log message and report details + ${health_score}= Set Variable 1.0 + Log Deployment ${DEPLOYMENT_NAME} is intentionally scaled to 0 replicas (${SCALED_DOWN_INFO}) - Score: ${health_score} + RW.Core.Add to Report Applog Health Score: ${health_score} - Deployment intentionally scaled to 0 replicas + ELSE + # Use the log health score as the final health score + ${health_score}= Set Variable ${log_health_score} + + IF ${health_score} == 
1.0 + RW.Core.Add to Report Applog Health Score: ${health_score} - No applog issues detected in workload logs + ELSE + RW.Core.Add to Report Applog Health Score: ${health_score} - Applog issue(s) detected in workload logs: ${log_health_details} + END + END + RW.Core.Push Metric ${health_score} \ No newline at end of file diff --git a/codebundles/k8s-applog-health/sli_critical_patterns.json b/codebundles/k8s-applog-health/sli_critical_patterns.json new file mode 100755 index 000000000..19848bb80 --- /dev/null +++ b/codebundles/k8s-applog-health/sli_critical_patterns.json @@ -0,0 +1,74 @@ +{ + "critical_patterns": { + "GenericError": { + "description": "Generic error patterns that indicate application failures", + "patterns": [ + "exception|Exception|EXCEPTION", + "fatal|Fatal|FATAL", + "panic|Panic|PANIC", + "crash|Crash|CRASH", + "failed|Failed|FAILED", + "failure|Failure|FAILURE" + ], + "severity": 1 + }, + "AppFailure": { + "description": "Application-specific failure patterns", + "patterns": [ + "application.*failed", + "service.*unavailable", + "connection.*refused", + "timeout.*error", + "out.*of.*memory", + "disk.*full", + "permission.*denied", + "authentication.*failed", + "authorization.*failed" + ], + "severity": 1 + }, + "StackTrace": { + "description": "Stack trace patterns indicating application crashes", + "patterns": [ + "stack.*trace", + "at\\s+\\w+\\.\\w+", + "Exception.*in thread", + "java\\.lang\\.", + "python.*traceback", + "goroutine.*panic", + "panic:", + "fatal.*error" + ], + "severity": 1 + } + }, + "exclude_patterns": [ + "\\bINFO\\b", + "\\bDEBUG\\b", + "\\bTRACE\\b", + "health.*check", + "heartbeat", + "metrics", + "monitoring", + "\\],INFO\\s*,", + "INFO\\s*,c\\.", + "START\\s*-\\s*.*Impl\\.", + "BusinessService.*\\(\\)", + "RestrictionsApiDelegateImpl", + "ReadBlockedResourceListBusinessServiceImpl", + "LocationReplicaStrategy", + "\\bSTART\\s*method\\b", + "Calling\\s*BusinessService", + "BusinessServiceImpl", + "ApiDelegateImpl", + 
"linkerd.*INFO.*Connection closed.*error=read header from client timeout", + "linkerd_app_core::serve.*Connection closed.*error=read header from client timeout", + "TelemetryPipeline.*In the last \\d+ minutes.*operation has failed.*Sending telemetry to the ingestion service", + "adjustmentReason=\\w*EXCEPTION\\w*" + ], + "config": { + "max_matches_per_pattern": 10, + "case_sensitive": false, + "timeout_seconds": 30 + } +} \ No newline at end of file From abf88d177b5830be54c1c8a5d712512c7d3a11d6 Mon Sep 17 00:00:00 2001 From: Akshay Prabhakant Date: Wed, 14 Jan 2026 16:24:33 +0530 Subject: [PATCH 02/10] Improve log analysis task naming and report formatting - Rename task to "Scan Application Logs for Errors and Stacktraces" for clarity - Add timestamp tracking for log extraction to support accurate issue reporting - Enhance log contents display with structured format showing last N lines per file - Add LOG_START/LOG_END markers for better log parsing - Use recorded timestamp when no issues are found - Re-enable log cleanup after analysis --- codebundles/k8s-applog-health/runbook.robot | 32 ++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/codebundles/k8s-applog-health/runbook.robot b/codebundles/k8s-applog-health/runbook.robot index 88a57a05e..a3aa2301b 100755 --- a/codebundles/k8s-applog-health/runbook.robot +++ b/codebundles/k8s-applog-health/runbook.robot @@ -208,8 +208,8 @@ Suite Initialization *** Tasks *** -Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Fetches and analyzes logs from the deployment pods for errors, connection issues, and other patterns that indicate application health problems. Note: Warning messages about missing log files for excluded containers (like linkerd-proxy, istio-proxy) are expected and harmless. 
+Scan Application Logs for Errors and Stacktraces for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + [Documentation] Fetches and analyzes logs from the deployment pods for stacktraces, errors, connection issues, and other patterns that indicate application health problems. Note: Warning messages about missing log files for excluded containers (like linkerd-proxy, istio-proxy) are expected and harmless. [Tags] ... logs ... application @@ -221,6 +221,9 @@ Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespac ... access:read-only # Skip pod-related checks if deployment is scaled to 0 IF not ${SKIP_POD_CHECKS} + # record current time, and use if no issues found + ${log_extraction_timestamp}= DateTime.Get Current Date + # Temporarily suppress log warnings for excluded containers (they're expected) TRY ${log_dir}= RW.K8sLog.Fetch Workload Logs @@ -271,9 +274,30 @@ Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespac ${issues_count}= Get Length ${issues} # print the contents from log_dir into the report - RW.Core.Add Pre To Report **Log Contents:**\n${log_dir} + ${logs_subdir}= Set Variable ${log_dir}${/}deployment_${DEPLOYMENT_NAME}_logs + ${has_logs_dir}= Run Keyword And Return Status Directory Should Exist ${logs_subdir} + + IF ${has_logs_dir} + @{log_files}= List Files In Directory ${logs_subdir} pattern=*_logs.txt absolute=True + Sort List ${log_files} + + RW.Core.Add Pre To Report **Log Contents (showing last ${LOG_LINES} lines per file)** + + FOR ${log_file} IN @{log_files} + ${base}= Evaluate __import__('os').path.basename(r'''${log_file}''') + + # Efficient-ish tail in Python: keeps only last N lines + ${tail}= Evaluate ''.join(__import__('collections').deque(open(r'''${log_file}''', 'r', encoding='utf-8', errors='replace'), maxlen=int('${LOG_LINES}'))) + + RW.Core.Add Pre To Report [LOG_START: ${base}]\n${tail}\n[LOG_END: ${base}]\n + END + ELSE + RW.Core.Add Pre To Report **Log 
Contents:**\nNo log files directory found at: ${logs_subdir} + END IF ${issues_count} == 0 + ${issue_timestamp}= Set Variable ${log_extraction_timestamp} + # create a dummy issue with a keyword argument set to a value depicting no issues found RW.Core.Add Pre To Report **No issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`** @@ -304,5 +328,5 @@ Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespac ... observed_at=${issue_timestamp} ... next_action=processApplogIssues END - # RW.K8sLog.Cleanup Temp Files + RW.K8sLog.Cleanup Temp Files END \ No newline at end of file From 8f8d391f747db97588b2cb0ba2de8300ebdcf920 Mon Sep 17 00:00:00 2001 From: Akshay Prabhakant Date: Thu, 15 Jan 2026 14:02:00 +0000 Subject: [PATCH 03/10] Update display name from Kubernetes Deployment Triage to Kubernetes AppLog Analysis --- codebundles/k8s-applog-health/runbook.robot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codebundles/k8s-applog-health/runbook.robot b/codebundles/k8s-applog-health/runbook.robot index a3aa2301b..d98b0523d 100755 --- a/codebundles/k8s-applog-health/runbook.robot +++ b/codebundles/k8s-applog-health/runbook.robot @@ -1,7 +1,7 @@ *** Settings *** Documentation Triages issues related to a deployment and its replicas. 
Metadata Author stewartshea -Metadata Display Name Kubernetes Deployment Triage +Metadata Display Name Kubernetes AppLog Analysis Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift Library BuiltIn From 532ff8e1ebcb7f30f03746a1cd749d48c54cef40 Mon Sep 17 00:00:00 2001 From: Akshay Prabhakant Date: Fri, 16 Jan 2026 19:12:03 +0530 Subject: [PATCH 04/10] Add log size limits and clean up unused SLI variables - Add LOG_SIZE/MAX_LOG_SIZE variable to control max log bytes fetched (default 2MB) - Increase default LOG_LINES from 100 to 1000 for better log coverage - Pass max_log_lines and max_log_bytes to log fetching in runbook - Remove unused EVENT_AGE, EVENT_THRESHOLD, CHECK_SERVICE_ENDPOINTS from SLI - Remove unnecessary "no issues found" dummy issue creation - Rename final SLI task to "Generate Application Health Score" --- .../templates/k8s-applog-health-sli.yaml | 4 +++ .../templates/k8s-applog-health-taskset.yaml | 4 +++ codebundles/k8s-applog-health/runbook.robot | 23 +++++++-------- codebundles/k8s-applog-health/sli.robot | 29 +++---------------- 4 files changed, 22 insertions(+), 38 deletions(-) diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml index 8576f2825..c4e813650 100755 --- a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml @@ -46,6 +46,10 @@ spec: value: "2" - name: CHECK_SERVICE_ENDPOINTS value: "true" + - name: MAX_LOG_LINES + value: "1000" + - name: MAX_LOG_SIZE + value: "2097152" secretsProvided: {% if wb_version %} {% include "kubernetes-auth.yaml" ignore missing %} diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml index 7f1d9b79e..73229c8e6 100644 --- 
a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml @@ -35,6 +35,10 @@ spec: value: "4" - name: LOG_AGE value: "10m" + - name: LOG_SIZE + value: "2097152" + - name: LOG_LINES + value: "1000" secretsProvided: {% if wb_version %} {% include "kubernetes-auth.yaml" ignore missing %} diff --git a/codebundles/k8s-applog-health/runbook.robot b/codebundles/k8s-applog-health/runbook.robot index d98b0523d..91f6c3fdd 100755 --- a/codebundles/k8s-applog-health/runbook.robot +++ b/codebundles/k8s-applog-health/runbook.robot @@ -55,13 +55,19 @@ Suite Initialization ... description=The number of log lines to fetch from the pods when inspecting logs. ... pattern=\d+ ... example=100 - ... default=100 + ... default=1000 ${LOG_AGE}= RW.Core.Import User Variable LOG_AGE ... type=string ... description=The age of logs to fetch from pods, used for log analysis tasks. ... pattern=\w* ... example=10m ... default=10m + ${LOG_SIZE}= RW.Core.Import User Variable LOG_SIZE + ... type=string + ... description=The maximum size of logs in bytes to fetch from pods, used for log analysis tasks. Defaults to 2MB. + ... pattern=\d* + ... example=1024 + ... default=2097152 ${LOG_ANALYSIS_DEPTH}= RW.Core.Import User Variable LOG_ANALYSIS_DEPTH ... type=string @@ -141,6 +147,7 @@ Suite Initialization Set Suite Variable ${DEPLOYMENT_NAME} Set Suite Variable ${LOG_LINES} Set Suite Variable ${LOG_AGE} + Set Suite Variable ${LOG_SIZE} Set Suite Variable ${LOG_ANALYSIS_DEPTH} Set Suite Variable ${LOG_SEVERITY_THRESHOLD} @@ -233,6 +240,8 @@ Scan Application Logs for Errors and Stacktraces for Deployment `${DEPLOYMENT_NA ... context=${CONTEXT} ... kubeconfig=${kubeconfig} ... log_age=${LOG_AGE} + ... max_log_lines=${LOG_LINES} + ... max_log_bytes=${LOG_SIZE} ... 
excluded_containers=${EXCLUDED_CONTAINERS} EXCEPT AS ${log_error} # If log fetching fails completely, log the error but continue @@ -300,18 +309,6 @@ Scan Application Logs for Errors and Stacktraces for Deployment `${DEPLOYMENT_NA # create a dummy issue with a keyword argument set to a value depicting no issues found RW.Core.Add Pre To Report **No issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`** - - # create a dummy issue with a keyword argument set to a value depicting no issues found - RW.Core.Add Issue - ... severity=4 - ... expected=Application logs should be free of critical errors for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... actual=No issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... title=No issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... reproduce_hint=Check application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... details=No issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... next_steps=No processing required - ... observed_at=${issue_timestamp} - ... next_action=noIssuesFound ELSE # set issue_timestamp to the observed_at value from the first issue ${issue_timestamp}= Evaluate $issues[0].get('observed_at', '') diff --git a/codebundles/k8s-applog-health/sli.robot b/codebundles/k8s-applog-health/sli.robot index 496903cd8..3c105a100 100755 --- a/codebundles/k8s-applog-health/sli.robot +++ b/codebundles/k8s-applog-health/sli.robot @@ -1,7 +1,7 @@ *** Settings *** Metadata Author stewartshea -Documentation This SLI uses kubectl to score deployment health. Produces a value between 0 (completely failing the test) and 1 (fully passing the test). Looks for container restarts, critical log errors, pods not ready, deployment status, and recent events. 
-Metadata Display Name Kubernetes Deployment Healthcheck +Documentation This SLI uses kubectl to score application log health. Produces a value between 0 (completely failing the test) and 1 (fully passing the test). Looks for container restarts, critical log errors, pods not ready, deployment status, stacktraces and other recent events. +Metadata Display Name Kubernetes Application Log Healthcheck Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift Suite Setup Suite Initialization Library BuiltIn @@ -55,31 +55,13 @@ Suite Initialization ... description=Maximum number of log lines to fetch per container to prevent API overload. ... pattern=^\d+$ ... example=100 - ... default=100 + ... default=1000 ${MAX_LOG_BYTES}= RW.Core.Import User Variable MAX_LOG_BYTES ... type=string ... description=Maximum log size in bytes to fetch per container to prevent API overload. ... pattern=^\d+$ ... example=256000 ... default=256000 - ${EVENT_AGE}= RW.Core.Import User Variable EVENT_AGE - ... type=string - ... description=The time window to check for recent warning events. - ... pattern=((\d+?)m)? - ... example=10m - ... default=10m - ${EVENT_THRESHOLD}= RW.Core.Import User Variable EVENT_THRESHOLD - ... type=string - ... description=The maximum number of critical warning events allowed before scoring is reduced. - ... pattern=^\d+$ - ... example=2 - ... default=2 - ${CHECK_SERVICE_ENDPOINTS}= RW.Core.Import User Variable CHECK_SERVICE_ENDPOINTS - ... type=string - ... description=Whether to check service endpoint health. Set to 'false' if deployment doesn't have associated services. - ... enum=[true,false] - ... example=true - ... default=true ${LOGS_EXCLUDE_PATTERN}= RW.Core.Import User Variable LOGS_EXCLUDE_PATTERN ... type=string ... description=Pattern used to exclude entries from log analysis when searching for errors. Use regex patterns to filter out false positives like JSON structures. 
@@ -106,9 +88,6 @@ Suite Initialization Set Suite Variable ${RW_LOOKBACK_WINDOW} ${RW_LOOKBACK_WINDOW} Set Suite Variable ${MAX_LOG_LINES} ${MAX_LOG_LINES} Set Suite Variable ${MAX_LOG_BYTES} ${MAX_LOG_BYTES} - Set Suite Variable ${EVENT_AGE} ${EVENT_AGE} - Set Suite Variable ${EVENT_THRESHOLD} ${EVENT_THRESHOLD} - Set Suite Variable ${CHECK_SERVICE_ENDPOINTS} ${CHECK_SERVICE_ENDPOINTS} Set Suite Variable ${LOGS_EXCLUDE_PATTERN} ${LOGS_EXCLUDE_PATTERN} Set Suite Variable ${EXCLUDED_CONTAINER_NAMES} ${EXCLUDED_CONTAINER_NAMES} @@ -273,7 +252,7 @@ Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}` RW.Core.Push Metric ${log_health_score} sub_name=log_errors END -Generate Deployment Health Score for `${DEPLOYMENT_NAME}` +Generate Application Health Score for `${DEPLOYMENT_NAME}` [Documentation] Generates the final applog health score and report details [Tags] score health applog From 2591933601c5419b75164e4dcc63d2991e74b396 Mon Sep 17 00:00:00 2001 From: Akshay Prabhakant Date: Wed, 11 Feb 2026 09:56:12 +0000 Subject: [PATCH 05/10] shifted "Analyze applog " task from healthcheck to applog-health CB; shifted "Analyze workload stacktraces" task from stacktrace-CB to applog-health-CB --- codebundles/k8s-applog-health/runbook.robot | 215 ++++++++++++------ codebundles/k8s-applog-health/sli.robot | 109 +++++++-- .../k8s-daemonset-healthcheck/runbook.robot | 58 ----- .../k8s-deployment-healthcheck/runbook.robot | 99 -------- .../k8s-statefulset-healthcheck/runbook.robot | 59 ----- 5 files changed, 229 insertions(+), 311 deletions(-) diff --git a/codebundles/k8s-applog-health/runbook.robot b/codebundles/k8s-applog-health/runbook.robot index 91f6c3fdd..299757e56 100755 --- a/codebundles/k8s-applog-health/runbook.robot +++ b/codebundles/k8s-applog-health/runbook.robot @@ -1,6 +1,6 @@ *** Settings *** Documentation Triages issues related to a deployment and its replicas. 
-Metadata Author stewartshea +Metadata Author akshayrw25 Metadata Display Name Kubernetes AppLog Analysis Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift @@ -11,6 +11,7 @@ Library RW.platform Library RW.NextSteps Library RW.K8sHelper Library RW.K8sLog +Library RW.LogAnalysis.ExtractTraceback Library OperatingSystem Library String @@ -45,11 +46,18 @@ Suite Initialization ... description=The name of the Kubernetes namespace to scope actions and searching to. ... pattern=\w* ... example=otel-demo - ${DEPLOYMENT_NAME}= RW.Core.Import User Variable DEPLOYMENT_NAME + ${WORKLOAD_NAME}= RW.Core.Import User Variable WORKLOAD_NAME ... type=string - ... description=The name of the deployment to triage. + ... description=The name of the workload (deployment, statefulset, or daemonset) to analyze for stacktraces. ... pattern=\w* ... example=otel-demo-frontend + ${WORKLOAD_TYPE}= RW.Core.Import User Variable WORKLOAD_TYPE + ... type=string + ... description=The type of Kubernetes workload to analyze. + ... pattern=\w* + ... enum=[deployment,statefulset,daemonset] + ... example=deployment + ... default=deployment ${LOG_LINES}= RW.Core.Import User Variable LOG_LINES ... type=string ... description=The number of log lines to fetch from the pods when inspecting logs. @@ -104,8 +112,8 @@ Suite Initialization ... type=string ... description=Pattern used to exclude entries from log analysis when searching for errors. Use regex patterns to filter out false positives like JSON structures. ... pattern=.* - ... example="errors":\s*\[\]|"warnings":\s*\[\] - ... default="errors":\\s*\\[\\]|\\bINFO\\b|\\bDEBUG\\b|\\bTRACE\\b|\\bSTART\\s*-\\s*|\\bSTART\\s*method\\b + ... example="errors":\\s*\\[\\]|"warnings":\\s*\\[\\] + ... default="errors":\\\\s*\\\\[\\\\]|\\\\bINFO\\\\b|\\\\bDEBUG\\\\b|\\\\bTRACE\\\\b|\\\\bSTART\\\\s*-\\\\s*|\\\\bSTART\\\\s*method\\\\b ${LOG_SCAN_TIMEOUT}= RW.Core.Import User Variable LOG_SCAN_TIMEOUT ... type=string ... 
description=Timeout in seconds for log scanning operations. Increase this value if log scanning times out on large log files. @@ -144,7 +152,8 @@ Suite Initialization Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} Set Suite Variable ${CONTEXT} Set Suite Variable ${NAMESPACE} - Set Suite Variable ${DEPLOYMENT_NAME} + Set Suite Variable ${WORKLOAD_NAME} + Set Suite Variable ${WORKLOAD_TYPE} Set Suite Variable ${LOG_LINES} Set Suite Variable ${LOG_AGE} Set Suite Variable ${LOG_SIZE} @@ -171,15 +180,17 @@ Suite Initialization ... LOGS_ERROR_PATTERN=${LOGS_ERROR_PATTERN} ... LOGS_EXCLUDE_PATTERN=${LOGS_EXCLUDE_PATTERN} ... ANOMALY_THRESHOLD=${ANOMALY_THRESHOLD} - ... DEPLOYMENT_NAME=${DEPLOYMENT_NAME} + # ... DEPLOYMENT_NAME=${DEPLOYMENT_NAME} + ... WORKLOAD_NAME=${WORKLOAD_NAME} + ... WORKLOAD_TYPE=${WORKLOAD_TYPE} ... CONTAINER_RESTART_AGE=${CONTAINER_RESTART_AGE} ... CONTAINER_RESTART_THRESHOLD=${CONTAINER_RESTART_THRESHOLD} ... LOG_SCAN_TIMEOUT=${LOG_SCAN_TIMEOUT} Set Suite Variable ${env} ${env_dict} - # Check if deployment is scaled to 0 and handle appropriately + # Check if the workload is scaled to 0 and handle appropriately ${scale_check}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... 
timeout_seconds=30 @@ -188,19 +199,21 @@ Suite Initialization ${scale_status}= Evaluate json.loads(r'''${scale_check.stdout}''') if r'''${scale_check.stdout}'''.strip() else {} json ${spec_replicas}= Evaluate $scale_status.get('spec_replicas', 1) - IF ${spec_replicas} == 0 - ${issue_timestamp}= DateTime.Get Current Date + # DaemonSets don't scale to 0 in the traditional sense, so skip scale-down logic for them + IF '${WORKLOAD_TYPE}' == 'daemonset' + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is a DaemonSet - proceeding with log analysis + Set Suite Variable ${SKIP_POD_CHECKS} ${False} + ELSE IF ${spec_replicas} == 0 RW.Core.Add Issue ... severity=4 - ... expected=Deployment `${DEPLOYMENT_NAME}` operational status documented - ... actual=Deployment `${DEPLOYMENT_NAME}` is intentionally scaled to zero replicas - ... title=Deployment `${DEPLOYMENT_NAME}` is Scaled Down (Informational) - ... reproduce_hint=kubectl get deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o yaml - ... details=Deployment `${DEPLOYMENT_NAME}` is currently scaled to 0 replicas (spec.replicas=0). This is an intentional configuration and not an error. All pod-related healthchecks have been skipped for efficiency. If the deployment should be running, scale it up using:\nkubectl scale deployment/${DEPLOYMENT_NAME} --replicas= --context ${CONTEXT} -n ${NAMESPACE} - ... next_steps=This is informational only. If the deployment should be running, scale it up. - ... observed_at=${issue_timestamp} + ... expected=${WORKLOAD_TYPE} `${WORKLOAD_NAME}` operational status documented + ... actual=${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is intentionally scaled to zero replicas + ... title=${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is Scaled Down (Informational) + ... reproduce_hint=kubectl get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o yaml + ... details=${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is currently scaled to 0 replicas (spec.replicas=0). 
This is an intentional configuration and not an error. All pod-related healthchecks have been skipped for efficiency. If the workload should be running, scale it up using:\nkubectl scale ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --replicas= --context ${CONTEXT} -n ${NAMESPACE} + ... next_steps=This is informational only. If the workload should be running, scale it up. - RW.Core.Add Pre To Report **ā„¹ļø Deployment `${DEPLOYMENT_NAME}` is scaled to 0 replicas - Skipping pod-related checks**\n**Available Condition:** ${scale_status.get('available_condition', 'Unknown')} + RW.Core.Add Pre To Report **ā„¹ļø ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is scaled to 0 replicas - Skipping log analysis**\n**Available Condition:** ${scale_status.get('available_condition', 'Unknown')} Set Suite Variable ${SKIP_POD_CHECKS} ${True} ELSE @@ -208,54 +221,54 @@ Suite Initialization END EXCEPT - Log Warning: Failed to check deployment scale, continuing with normal checks + Log Warning: Failed to check workload scale, continuing with normal checks Set Suite Variable ${SKIP_POD_CHECKS} ${False} END *** Tasks *** -Scan Application Logs for Errors and Stacktraces for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Fetches and analyzes logs from the deployment pods for stacktraces, errors, connection issues, and other patterns that indicate application health problems. Note: Warning messages about missing log files for excluded containers (like linkerd-proxy, istio-proxy) are expected and harmless. + +Analyze Application Log Patterns for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` + [Documentation] Fetches and analyzes logs from the deployment pods for errors, connection issues, and other patterns that indicate application health problems. Note: Warning messages about missing log files for excluded containers (like linkerd-proxy, istio-proxy) are expected and harmless. [Tags] ... logs ... application ... errors - ... stacktrace ... patterns ... 
health - ... deployment + ... ${WORKLOAD_TYPE} ... access:read-only # Skip pod-related checks if deployment is scaled to 0 IF not ${SKIP_POD_CHECKS} - # record current time, and use if no issues found - ${log_extraction_timestamp}= DateTime.Get Current Date - # Temporarily suppress log warnings for excluded containers (they're expected) TRY ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} ... namespace=${NAMESPACE} ... context=${CONTEXT} ... kubeconfig=${kubeconfig} ... log_age=${LOG_AGE} - ... max_log_lines=${LOG_LINES} - ... max_log_bytes=${LOG_SIZE} ... excluded_containers=${EXCLUDED_CONTAINERS} EXCEPT AS ${log_error} # If log fetching fails completely, log the error but continue Log Warning: Log fetching encountered an error: ${log_error} + + # TODO: remove this after testing + RW.Core.Add Pre To Report **Log Fetching Error:** ${log_error} # Set empty log directory to continue with other checks ${log_dir}= Set Variable ${EMPTY} END + + RW.Core.Add Pre To Report **Log Directory:** ${log_dir} # Only scan logs if we have a valid log directory IF '''${log_dir}''' != '''${EMPTY}''' ${scan_results}= RW.K8sLog.Scan Logs For Issues ... log_dir=${log_dir} - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} ... namespace=${NAMESPACE} ... categories=@{LOG_PATTERN_CATEGORIES} ... 
custom_patterns_file=runbook_patterns.json @@ -280,50 +293,108 @@ Scan Application Logs for Errors and Stacktraces for Deployment `${DEPLOYMENT_NA # Process each issue found in the logs ${issues}= Evaluate $scan_results.get('issues', []) - ${issues_count}= Get Length ${issues} - - # print the contents from log_dir into the report - ${logs_subdir}= Set Variable ${log_dir}${/}deployment_${DEPLOYMENT_NAME}_logs - ${has_logs_dir}= Run Keyword And Return Status Directory Should Exist ${logs_subdir} + FOR ${issue} IN @{issues} + ${severity}= Evaluate $issue.get('severity', ${LOG_SEVERITY_THRESHOLD}) + IF ${severity} <= ${LOG_SEVERITY_THRESHOLD} + # Convert issue details to string to avoid serialization issues + ${issue_details_raw}= Evaluate $issue.get("details", "") + ${issue_details_str}= Convert To String ${issue_details_raw} + ${summarized_details}= RW.K8sLog.Summarize Log Issues issue_details=${issue_details_str} + + # Safely extract title and next_steps as strings + ${issue_title_raw}= Evaluate $issue.get('title', 'Log pattern issue detected') + ${issue_title}= Convert To String ${issue_title_raw} + ${next_steps_raw}= Evaluate $issue.get('next_steps', 'Review application logs and resolve underlying issues') + ${next_steps}= Convert To String ${next_steps_raw} + + # Use timestamp from log scan results if available, otherwise extract from details + ${issue_timestamp}= Evaluate $issue.get('observed_at', '') - IF ${has_logs_dir} - @{log_files}= List Files In Directory ${logs_subdir} pattern=*_logs.txt absolute=True - Sort List ${log_files} - - RW.Core.Add Pre To Report **Log Contents (showing last ${LOG_LINES} lines per file)** - - FOR ${log_file} IN @{log_files} - ${base}= Evaluate __import__('os').path.basename(r'''${log_file}''') - - # Efficient-ish tail in Python: keeps only last N lines - ${tail}= Evaluate ''.join(__import__('collections').deque(open(r'''${log_file}''', 'r', encoding='utf-8', errors='replace'), maxlen=int('${LOG_LINES}'))) - - RW.Core.Add Pre To 
Report [LOG_START: ${base}]\n${tail}\n[LOG_END: ${base}]\n + RW.Core.Add Issue + ... severity=${severity} + ... expected=Application logs should be free of critical errors for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in namespace `${NAMESPACE}` + ... actual=${issue_title} in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in namespace `${NAMESPACE}` + ... title=${issue_title} in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` + ... reproduce_hint=Check application logs for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in namespace `${NAMESPACE}` + ... details=${summarized_details} + ... next_steps=${next_steps} + ... observed_at=${issue_timestamp} END - ELSE - RW.Core.Add Pre To Report **Log Contents:**\nNo log files directory found at: ${logs_subdir} END - IF ${issues_count} == 0 - ${issue_timestamp}= Set Variable ${log_extraction_timestamp} + ${issues_count}= Get Length ${issues} + + # Convert scan_results to string to avoid serialization issues, then format for display + ${scan_results_str}= Evaluate json.dumps($scan_results, indent=2) json + ${formatted_results}= RW.K8sLog.Format Scan Results For Display scan_results=${scan_results_str} + + RW.Core.Add Pre To Report **Log Analysis Summary for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}`**\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} + + RW.K8sLog.Cleanup Temp Files + END + +Analyze Workload Stacktraces for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` + [Documentation] Collects and analyzes stacktraces/tracebacks from all pods in the workload for troubleshooting application issues. + [Tags] + ... logs + ... stacktraces + ... tracebacks + ... workload + ... troubleshooting + ... errors + ... 
access:read-only + # Skip pod-related checks if workload is scaled to 0 + IF not ${SKIP_STACKTRACE_CHECKS} + # Convert comma-separated string to list for excluded containers + @{EXCLUDED_CONTAINERS}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + + # Fetch logs using RW.K8sLog library (same pattern as deployment healthcheck) + ${log_dir}= RW.K8sLog.Fetch Workload Logs + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} + ... namespace=${NAMESPACE} + ... context=${CONTEXT} + ... kubeconfig=${kubeconfig} + ... log_age=${LOG_AGE} + ... max_log_lines=${LOG_LINES} + ... max_log_bytes=${LOG_SIZE} + ... excluded_containers=${EXCLUDED_CONTAINERS} + + # Extract stacktraces from the log directory using the traceback library + ${tracebacks}= RW.LogAnalysis.ExtractTraceback.Extract Tracebacks + ... logs_dir=${log_dir} + + # Check total number of tracebacks extracted + ${total_tracebacks}= Get Length ${tracebacks} + + IF ${total_tracebacks} == 0 + # No tracebacks found + RW.Core.Add Pre To Report **šŸ“‹ No Stacktraces Found for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`**\n**Log Analysis Period:** ${LOG_AGE}\n**Max Log Lines:** ${LOG_LINES}\n**Max Log Size:** ${LOG_SIZE} bytes\n**Excluded Containers:** ${EXCLUDED_CONTAINER_NAMES}\n\nLog analysis completed successfully with no stacktraces detected. 
+ ELSE + # Stacktraces found - create issues for each one + ${delimiter}= Evaluate '-' * 80 - # create a dummy issue with a keyword argument set to a value depicting no issues found - RW.Core.Add Pre To Report **No issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}`** - ELSE - # set issue_timestamp to the observed_at value from the first issue - ${issue_timestamp}= Evaluate $issues[0].get('observed_at', '') + FOR ${traceback} IN @{tracebacks} + ${stacktrace}= Set Variable ${traceback["stacktrace"]} + ${timestamp}= Set Variable ${traceback["timestamp"]} + RW.Core.Add Issue + ... severity=2 + ... expected=No stacktraces should be present in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` logs in namespace `${NAMESPACE}` + ... actual=Stacktrace detected in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` logs in namespace `${NAMESPACE}` + ... title=Stacktrace Detected in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` + ... reproduce_hint=Check application logs for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in namespace `${NAMESPACE}` + ... details=${delimiter}\n${stacktrace}\n${delimiter} + ... next_steps=Review application logs for the root cause of the stacktrace\nCheck application configuration and resource limits\nInvestigate the specific error conditions that led to this stacktrace\nConsider scaling or restarting the ${WORKLOAD_TYPE} if issues persist\nMonitor application health and performance metrics + ... next_action=analyseStacktrace + ... observed_at=${timestamp} + END - # create a dummy issue with a keyword argument set to a value depicting issues found - RW.Core.Add Issue - ... severity=4 - ... expected=Application logs should be free of critical errors for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... actual=Issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... title=Issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... 
reproduce_hint=Check application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... details=Issues found in application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... next_steps=Process the issues found in the application logs - ... observed_at=${issue_timestamp} - ... next_action=processApplogIssues - END + # Create consolidated report showing all stacktraces + ${stacktrace_strings}= Evaluate [tb["stacktrace"] for tb in ${tracebacks}] + ${agg_tracebacks}= Evaluate "\\n" + "\\n${delimiter}\\n".join(${stacktrace_strings}) + RW.Core.Add Pre To Report **šŸ” Stacktraces Found for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`**\n**Total Stacktraces:** ${total_tracebacks}\n**Log Analysis Period:** ${LOG_AGE}\n**Max Log Lines:** ${LOG_LINES}\n**Max Log Size:** ${LOG_SIZE} bytes\n**Excluded Containers:** ${EXCLUDED_CONTAINER_NAMES}\n\n${agg_tracebacks} + END + + # Clean up temporary log files RW.K8sLog.Cleanup Temp Files END \ No newline at end of file diff --git a/codebundles/k8s-applog-health/sli.robot b/codebundles/k8s-applog-health/sli.robot index 3c105a100..d9a4d1f37 100755 --- a/codebundles/k8s-applog-health/sli.robot +++ b/codebundles/k8s-applog-health/sli.robot @@ -1,5 +1,5 @@ *** Settings *** -Metadata Author stewartshea +Metadata Author akshayrw25 Documentation This SLI uses kubectl to score application log health. Produces a value between 0 (completely failing the test) and 1 (fully passing the test). Looks for container restarts, critical log errors, pods not ready, deployment status, stacktraces and other recent events. Metadata Display Name Kubernetes Application Log Healthcheck Metadata Supports Kubernetes,AKS,EKS,GKE,OpenShift @@ -8,6 +8,7 @@ Library BuiltIn Library RW.Core Library RW.CLI Library RW.platform +Library RW.LogAnalysis.ExtractTraceback Library RW.K8sLog Library OperatingSystem @@ -31,11 +32,18 @@ Suite Initialization ... 
description=Which Kubernetes context to operate within. ... pattern=\w* ... example=my-main-cluster - ${DEPLOYMENT_NAME}= RW.Core.Import User Variable DEPLOYMENT_NAME + ${WORKLOAD_TYPE}= RW.Core.Import User Variable WORKLOAD_TYPE ... type=string - ... description=The name of the Kubernetes deployment to check. + ... description=The type of Kubernetes workload to analyze. ... pattern=\w* - ... example=my-deployment + ... enum=[deployment,statefulset,daemonset] + ... example=deployment + ... default=deployment + ${WORKLOAD_NAME}= RW.Core.Import User Variable WORKLOAD_NAME + ... type=string + ... description=The name of the Kubernetes workload to check. + ... pattern=\w* + ... example=my-workload ${CONTAINER_RESTART_AGE}= RW.Core.Import User Variable CONTAINER_RESTART_AGE ... type=string ... description=The time window in minutes to search for container restarts. @@ -102,7 +110,8 @@ Suite Initialization Set Suite Variable ${CONTEXT} ${CONTEXT} Set Suite Variable ${NAMESPACE} ${NAMESPACE} - Set Suite Variable ${DEPLOYMENT_NAME} ${DEPLOYMENT_NAME} + Set Suite Variable ${WORKLOAD_NAME} ${WORKLOAD_NAME} + Set Suite Variable ${WORKLOAD_TYPE} ${WORKLOAD_TYPE} Set Suite Variable ${env} {"KUBECONFIG":"./${kubeconfig.key}"} # Initialize score variables @@ -113,34 +122,43 @@ Suite Initialization Set Suite Variable ${events_score} 0 - # Check if deployment is scaled to 0 and handle appropriately - ${scale_check}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown"), last_scale_time: (.metadata.annotations."deployment.kubernetes.io/last-applied-configuration" // "N/A")}' - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... 
timeout_seconds=30 + # Check if workload is scaled to 0 and handle appropriately + # Different workload types have different field structures + IF '${WORKLOAD_TYPE}' == 'daemonset' + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .status.desiredNumberScheduled, ready_replicas: (.status.numberReady // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=30 + ELSE + # For deployments and statefulsets + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... 
timeout_seconds=30 + END TRY ${scale_status}= Evaluate json.loads(r'''${scale_check.stdout}''') if r'''${scale_check.stdout}'''.strip() else {} json ${spec_replicas}= Evaluate $scale_status.get('spec_replicas', 1) - # Try to determine when deployment was scaled down by checking recent events and replica set history - ${scale_down_info}= Get Deployment Scale Down Timestamp ${spec_replicas} - - IF ${spec_replicas} == 0 - Log Deployment ${DEPLOYMENT_NAME} is scaled to 0 replicas - returning special health score - Log Scale down detected at: ${scale_down_info} + # DaemonSets don't scale to 0 in the traditional sense, so skip scale-down logic for them + IF '${WORKLOAD_TYPE}' == 'daemonset' + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is a DaemonSet - proceeding with stacktrace checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + ELSE IF ${spec_replicas} == 0 + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is scaled to 0 replicas - returning perfect health score - # For scaled-down deployments, return a score of 0.5 to indicate "intentionally down" vs "broken" + # For scaled-down workloads, return a score of 1.0 to indicate "intentionally down" vs "broken" Set Suite Variable ${SKIP_HEALTH_CHECKS} ${True} - Set Suite Variable ${SCALED_DOWN_INFO} ${scale_down_info} ELSE - Log Deployment ${DEPLOYMENT_NAME} has ${spec_replicas} desired replicas - proceeding with health checks + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} has ${spec_replicas} desired replicas - proceeding with stacktrace checks Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} END EXCEPT - Log Warning: Failed to check deployment scale, continuing with normal health checks + Log Warning: Failed to check workload scale, continuing with normal stacktrace checks Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} END @@ -252,6 +270,51 @@ Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}` RW.Core.Push Metric ${log_health_score} sub_name=log_errors END +Get Stacktrace Health Score for ${WORKLOAD_TYPE} 
`${WORKLOAD_NAME}` + [Documentation] Checks for recent stacktraces/tracebacks related to the workload within a short time window, with filtering to reduce noise. + [Tags] stacktraces tracebacks errors recent fast + IF ${SKIP_HEALTH_CHECKS} + # For scaled-down deployments, return perfect score to indicate "intentionally down" vs "broken" + ${stacktrace_score}= Set Variable 1.0 + Set Suite Variable ${stacktrace_details} ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is intentionally scaled to 0 replicas - Score: ${stacktrace_score} + ELSE + # Fetch logs using RW.K8sLog library (same pattern as deployment healthcheck) + ${log_dir}= RW.K8sLog.Fetch Workload Logs + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} + ... namespace=${NAMESPACE} + ... context=${CONTEXT} + ... kubeconfig=${kubeconfig} + ... log_age=${RW_LOOKBACK_WINDOW} + ... max_log_lines=${MAX_LOG_LINES} + ... max_log_bytes=${MAX_LOG_BYTES} + ... excluded_containers=${EXCLUDED_CONTAINERS} + + # Extract stacktraces from the log directory + ${recentmost_stacktrace}= RW.LogAnalysis.ExtractTraceback.Extract Tracebacks + ... logs_dir=${log_dir} + ... fast_exit=${True} + + ${stacktrace_length}= Get Length ${recentmost_stacktrace} + + IF ${stacktrace_length} != 0 + # Stacktrace found - set score to 0 + ${stacktrace_score}= Set Variable 0 + ${delimiter}= Evaluate '-' * 150 + Set Suite Variable ${stacktrace_details} **Stacktrace(s) identified**:\n${delimiter}\n${recentmost_stacktrace}\n${delimiter} + ELSE + # No stacktraces found - set score to 1 + ${stacktrace_score}= Set Variable 1.0 + Set Suite Variable ${stacktrace_details} **No Stacktraces identified.**\n\nLog analysis completed successfully. 
+ END + + # Clean up temporary log files + RW.K8sLog.Cleanup Temp Files + END + + Set Suite Variable ${stacktrace_score} + RW.Core.Push Metric ${stacktrace_score} sub_name=stacktrace_score + Generate Application Health Score for `${DEPLOYMENT_NAME}` [Documentation] Generates the final applog health score and report details [Tags] score health applog @@ -263,8 +326,8 @@ Generate Application Health Score for `${DEPLOYMENT_NAME}` Log Deployment ${DEPLOYMENT_NAME} is intentionally scaled to 0 replicas (${SCALED_DOWN_INFO}) - Score: ${health_score} RW.Core.Add to Report Applog Health Score: ${health_score} - Deployment intentionally scaled to 0 replicas ELSE - # Use the log health score as the final health score - ${health_score}= Set Variable ${log_health_score} + # Use the higher of log health score and stacktrace score as the final health score + ${health_score}= Evaluate max(${log_health_score}, ${stacktrace_score}) IF ${health_score} == 1.0 RW.Core.Add to Report Applog Health Score: ${health_score} - No applog issues detected in workload logs diff --git a/codebundles/k8s-daemonset-healthcheck/runbook.robot b/codebundles/k8s-daemonset-healthcheck/runbook.robot index 8233cb0a9..c6f0d3d29 100644 --- a/codebundles/k8s-daemonset-healthcheck/runbook.robot +++ b/codebundles/k8s-daemonset-healthcheck/runbook.robot @@ -20,64 +20,6 @@ Suite Setup Suite Initialization *** Tasks *** -Analyze Application Log Patterns for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Fetches and analyzes logs from the DaemonSet pods for errors, stack traces, connection issues, and other patterns that indicate application health problems. - [Tags] - ... logs - ... application - ... errors - ... patterns - ... health - ... daemonset - ... stacktrace - ... access:read-only - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=daemonset - ... workload_name=${DAEMONSET_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... 
kubeconfig=${kubeconfig} - ... log_age=${LOG_AGE} - - ${scan_results}= RW.K8sLog.Scan Logs For Issues - ... log_dir=${log_dir} - ... workload_type=daemonset - ... workload_name=${DAEMONSET_NAME} - ... namespace=${NAMESPACE} - ... categories=@{LOG_PATTERN_CATEGORIES} - - ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} - - # Process each issue found in the logs - ${issues}= Evaluate $scan_results.get('issues', []) - FOR ${issue} IN @{issues} - ${severity}= Evaluate $issue.get('severity', ${LOG_SEVERITY_THRESHOLD}) - IF ${severity} <= ${LOG_SEVERITY_THRESHOLD} - # Use the full issue details directly without summarization to preserve all log content - ${issue_details_raw}= Evaluate $issue.get("details", "") - ${issue_details_str}= Convert To String ${issue_details_raw} - ${issue_timestamp}= Evaluate $issue.get('observed_at', '') - - RW.Core.Add Issue - ... severity=${severity} - ... expected=Application logs should be free of critical errors for daemonset `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` - ... actual=${issue.get('title', 'Log pattern issue detected')} in daemonset `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` - ... title=${issue.get('title', 'Log Pattern Issue')} in DaemonSet `${DAEMONSET_NAME}` - ... reproduce_hint=Check application logs for daemonset `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` - ... details=${issue_details_str} - ... next_steps=${issue.get('next_steps', 'Review application logs and resolve underlying issues')} - ... 
observed_at=${issue_timestamp} - END - END - - ${issues_count}= Get Length ${issues} - - # Format scan results for better display - ${formatted_results}= RW.K8sLog.Format Scan Results For Display scan_results=${scan_results} - - RW.Core.Add Pre To Report **Log Analysis Summary for DaemonSet `${DAEMONSET_NAME}`**\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} - - RW.K8sLog.Cleanup Temp Files Detect Log Anomalies for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` [Documentation] Analyzes logs for repeating patterns, anomalous behavior, and unusual log volume that may indicate underlying issues. diff --git a/codebundles/k8s-deployment-healthcheck/runbook.robot b/codebundles/k8s-deployment-healthcheck/runbook.robot index cde66123b..cb47daaa7 100755 --- a/codebundles/k8s-deployment-healthcheck/runbook.robot +++ b/codebundles/k8s-deployment-healthcheck/runbook.robot @@ -215,105 +215,6 @@ Suite Initialization *** Tasks *** -Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Fetches and analyzes logs from the deployment pods for errors, connection issues, and other patterns that indicate application health problems. Note: Warning messages about missing log files for excluded containers (like linkerd-proxy, istio-proxy) are expected and harmless. - [Tags] - ... logs - ... application - ... errors - ... patterns - ... health - ... deployment - ... access:read-only - # Skip pod-related checks if deployment is scaled to 0 - IF not ${SKIP_POD_CHECKS} - # Temporarily suppress log warnings for excluded containers (they're expected) - TRY - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... 
log_age=${LOG_AGE} - ... excluded_containers=${EXCLUDED_CONTAINERS} - EXCEPT AS ${log_error} - # If log fetching fails completely, log the error but continue - Log Warning: Log fetching encountered an error: ${log_error} - # Set empty log directory to continue with other checks - ${log_dir}= Set Variable ${EMPTY} - END - - # Only scan logs if we have a valid log directory - IF '''${log_dir}''' != '''${EMPTY}''' - ${scan_results}= RW.K8sLog.Scan Logs For Issues - ... log_dir=${log_dir} - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} - ... namespace=${NAMESPACE} - ... categories=@{LOG_PATTERN_CATEGORIES} - ... custom_patterns_file=runbook_patterns.json - ... excluded_containers=${EXCLUDED_CONTAINERS} - ELSE - # Create empty scan results if no logs were fetched - ${scan_results}= Evaluate {"issues": [], "summary": ["No logs available for analysis"]} - END - - # Post-process results to filter out patterns matching LOGS_EXCLUDE_PATTERN - TRY - IF $LOGS_EXCLUDE_PATTERN != "" - ${filtered_issues}= Evaluate [issue for issue in $scan_results.get('issues', []) if not __import__('re').search('${LOGS_EXCLUDE_PATTERN}', issue.get('details', ''), __import__('re').IGNORECASE)] modules=re - ${filtered_results}= Evaluate {**$scan_results, 'issues': $filtered_issues} - Set Test Variable ${scan_results} ${filtered_results} - END - EXCEPT - Log Warning: Failed to apply LOGS_EXCLUDE_PATTERN filter, using unfiltered results - END - - ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} - - # Process each issue found in the logs - ${issues}= Evaluate $scan_results.get('issues', []) - FOR ${issue} IN @{issues} - ${severity}= Evaluate $issue.get('severity', ${LOG_SEVERITY_THRESHOLD}) - IF ${severity} <= ${LOG_SEVERITY_THRESHOLD} - # Convert issue details to string to avoid serialization issues - ${issue_details_raw}= Evaluate $issue.get("details", "") - ${issue_details_str}= Convert To String ${issue_details_raw} - 
${summarized_details}= RW.K8sLog.Summarize Log Issues issue_details=${issue_details_str} - - # Safely extract title and next_steps as strings - ${issue_title_raw}= Evaluate $issue.get('title', 'Log pattern issue detected') - ${issue_title}= Convert To String ${issue_title_raw} - ${next_steps_raw}= Evaluate $issue.get('next_steps', 'Review application logs and resolve underlying issues') - ${next_steps}= Convert To String ${next_steps_raw} - - # Use timestamp from log scan results if available, otherwise extract from details - ${issue_timestamp}= Evaluate $issue.get('observed_at', '') - - RW.Core.Add Issue - ... severity=${severity} - ... expected=Application logs should be free of critical errors for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... actual=${issue_title} in deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... title=${issue_title} in Deployment `${DEPLOYMENT_NAME}` - ... reproduce_hint=Check application logs for deployment `${DEPLOYMENT_NAME}` in namespace `${NAMESPACE}` - ... details=${summarized_details} - ... next_steps=${next_steps} - ... 
observed_at=${issue_timestamp} - END - END - - ${issues_count}= Get Length ${issues} - - # Convert scan_results to string to avoid serialization issues, then format for display - ${scan_results_str}= Evaluate json.dumps($scan_results, indent=2) json - ${formatted_results}= RW.K8sLog.Format Scan Results For Display scan_results=${scan_results_str} - - RW.Core.Add Pre To Report **Log Analysis Summary for Deployment `${DEPLOYMENT_NAME}`**\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} - - RW.K8sLog.Cleanup Temp Files - END - Detect Event Anomalies for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` [Documentation] Analyzes Kubernetes event patterns to identify anomalies such as sudden spikes in event rates, unusual patterns, or recurring issues that might indicate underlying problems with controllers, resources, or deployments. [Tags] diff --git a/codebundles/k8s-statefulset-healthcheck/runbook.robot b/codebundles/k8s-statefulset-healthcheck/runbook.robot index b5394e3fe..84738e855 100644 --- a/codebundles/k8s-statefulset-healthcheck/runbook.robot +++ b/codebundles/k8s-statefulset-healthcheck/runbook.robot @@ -20,65 +20,6 @@ Suite Setup Suite Initialization *** Tasks *** -Analyze Application Log Patterns for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Fetches and analyzes logs from the StatefulSet pods for errors, stack traces, connection issues, and other patterns that indicate application health problems. - [Tags] - ... logs - ... application - ... errors - ... patterns - ... health - ... statefulset - ... stacktrace - ... access:read-only - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=statefulset - ... workload_name=${STATEFULSET_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... 
log_age=${LOG_AGE} - - ${scan_results}= RW.K8sLog.Scan Logs For Issues - ... log_dir=${log_dir} - ... workload_type=statefulset - ... workload_name=${STATEFULSET_NAME} - ... namespace=${NAMESPACE} - ... categories=@{LOG_PATTERN_CATEGORIES} - - ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} - - # Process each issue found in the logs - ${issues}= Evaluate $scan_results.get('issues', []) - FOR ${issue} IN @{issues} - ${severity}= Evaluate $issue.get('severity', ${LOG_SEVERITY_THRESHOLD}) - IF ${severity} <= ${LOG_SEVERITY_THRESHOLD} - # Use the full issue details directly without summarization to preserve all log content - ${issue_details_raw}= Evaluate $issue.get("details", "") - ${issue_details_str}= Convert To String ${issue_details_raw} - # Use timestamp from log scan results if available, otherwise extract from details - ${issue_timestamp}= Evaluate $issue.get('observed_at', '') - - RW.Core.Add Issue - ... severity=${severity} - ... expected=Application logs should be free of critical errors for statefulset `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` - ... actual=${issue.get('title', 'Log pattern issue detected')} in statefulset `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` - ... title=${issue.get('title', 'Log Pattern Issue')} in StatefulSet `${STATEFULSET_NAME}` - ... reproduce_hint=Check application logs for statefulset `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` - ... details=${issue_details_str} - ... next_steps=${issue.get('next_steps', 'Review application logs and resolve underlying issues')} - ... 
observed_at=${issue_timestamp} - END - END - - ${issues_count}= Get Length ${issues} - - # Format scan results for better display - ${formatted_results}= RW.K8sLog.Format Scan Results For Display scan_results=${scan_results} - - RW.Core.Add Pre To Report **Log Analysis Summary for StatefulSet `${STATEFULSET_NAME}`**\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} - - RW.K8sLog.Cleanup Temp Files Detect Log Anomalies for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` [Documentation] Analyzes logs for repeating patterns, anomalous behavior, and unusual log volume that may indicate underlying issues. From 24f4d74167fdf6c38fe83dd4359feb476b32eaf2 Mon Sep 17 00:00:00 2001 From: Akshay Prabhakant Date: Wed, 11 Feb 2026 18:15:13 +0530 Subject: [PATCH 06/10] \k8s-applog-health: generalize to workload type, drop stacktrace SLI, shorten interval- Templates: use WORKLOAD_NAME + WORKLOAD_TYPE (from match_resource.kind) instead of DEPLOYMENT_NAME; - runbook.robot: remove 'Analyze Workload Stacktraces' task; rely on log pattern analysis only; cleanup temp files inside conditional- sli.robot: replace DEPLOYMENT_NAME with WORKLOAD_NAME/WORKLOAD_TYPE; remove 'Get Stacktrace Health Score' task; final health score from log_health_score only; scale-down timestamp logic only for deployment kind --- .../templates/k8s-applog-health-sli.yaml | 6 +- .../templates/k8s-applog-health-slx.yaml | 4 +- .../templates/k8s-applog-health-taskset.yaml | 4 +- codebundles/k8s-applog-health/runbook.robot | 71 +-------------- codebundles/k8s-applog-health/sli.robot | 88 +++++-------------- 5 files changed, 31 insertions(+), 142 deletions(-) diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml index c4e813650..aa3253be2 100755 --- 
a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml @@ -26,7 +26,7 @@ spec: pathToRobot: codebundles/k8s-applog-health/sli.robot intervalStrategy: intermezzo intervalSeconds: 600 - description: Measures the health of the application logs for the {{match_resource.resource.metadata.name}} deployment. + description: Measures the health of the application logs for the {{match_resource.resource.metadata.name}} {{match_resource.kind | lower}}. configProvided: - name: NAMESPACE value: {{match_resource.resource.metadata.namespace}} @@ -34,8 +34,10 @@ spec: value: {{context}} - name: KUBERNETES_DISTRIBUTION_BINARY value: {{custom.kubernetes_distribution_binary | default("kubectl")}} - - name: DEPLOYMENT_NAME + - name: WORKLOAD_NAME value: {{match_resource.resource.metadata.name}} + - name: WORKLOAD_TYPE + value: {{match_resource.kind | lower}} - name: CONTAINER_RESTART_AGE value: "10m" - name: CONTAINER_RESTART_THRESHOLD diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml index 7ad748284..a9ca8968c 100644 --- a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-slx.yaml @@ -8,14 +8,14 @@ metadata: {% include "common-annotations.yaml" %} spec: imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/deploy.svg - alias: {{match_resource.resource.metadata.name}} Application Log Health + alias: {{match_resource.resource.metadata.name}} {{match_resource.kind}} Application Log Health asMeasuredBy: The presence of application-level errors/issues/stacktraces in the application logs indicating runtime errors or exceptions in {{match_resource.resource.metadata.name}}. 
configProvided: - name: OBJECT_NAME value: {{match_resource.resource.metadata.name}} owners: - {{workspace.owner_email}} - statement: Application logs for {{match_resource.resource.metadata.name}} should be free of critical errors/issues/stacktraces indicating runtime errors or exceptions. + statement: Application logs for {{match_resource.resource.metadata.name}} {{match_resource.kind | lower}} should be free of critical errors/issues/stacktraces indicating runtime errors or exceptions. additionalContext: {% include "kubernetes-hierarchy.yaml" ignore missing %} qualified_name: "{{ match_resource.qualified_name }}" diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml index 73229c8e6..e67d520e4 100644 --- a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-taskset.yaml @@ -27,8 +27,10 @@ spec: value: {{context}} - name: KUBERNETES_DISTRIBUTION_BINARY value: {{custom.kubernetes_distribution_binary}} - - name: DEPLOYMENT_NAME + - name: WORKLOAD_NAME value: {{match_resource.resource.metadata.name}} + - name: WORKLOAD_TYPE + value: {{match_resource.kind | lower}} - name: CONTAINER_RESTART_AGE value: "30m" - name: CONTAINER_RESTART_THRESHOLD diff --git a/codebundles/k8s-applog-health/runbook.robot b/codebundles/k8s-applog-health/runbook.robot index 299757e56..aa11cb6c3 100755 --- a/codebundles/k8s-applog-health/runbook.robot +++ b/codebundles/k8s-applog-health/runbook.robot @@ -48,7 +48,7 @@ Suite Initialization ... example=otel-demo ${WORKLOAD_NAME}= RW.Core.Import User Variable WORKLOAD_NAME ... type=string - ... description=The name of the workload (deployment, statefulset, or daemonset) to analyze for stacktraces. + ... description=The name of the workload (deployment, statefulset, or daemonset) to analyze for application logs. ... 
pattern=\w* ... example=otel-demo-frontend ${WORKLOAD_TYPE}= RW.Core.Import User Variable WORKLOAD_TYPE @@ -180,7 +180,6 @@ Suite Initialization ... LOGS_ERROR_PATTERN=${LOGS_ERROR_PATTERN} ... LOGS_EXCLUDE_PATTERN=${LOGS_EXCLUDE_PATTERN} ... ANOMALY_THRESHOLD=${ANOMALY_THRESHOLD} - # ... DEPLOYMENT_NAME=${DEPLOYMENT_NAME} ... WORKLOAD_NAME=${WORKLOAD_NAME} ... WORKLOAD_TYPE=${WORKLOAD_TYPE} ... CONTAINER_RESTART_AGE=${CONTAINER_RESTART_AGE} @@ -260,8 +259,6 @@ Analyze Application Log Patterns for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Name # Set empty log directory to continue with other checks ${log_dir}= Set Variable ${EMPTY} END - - RW.Core.Add Pre To Report **Log Directory:** ${log_dir} # Only scan logs if we have a valid log directory IF '''${log_dir}''' != '''${EMPTY}''' @@ -330,71 +327,5 @@ Analyze Application Log Patterns for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Name RW.Core.Add Pre To Report **Log Analysis Summary for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}`**\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} - RW.K8sLog.Cleanup Temp Files - END - -Analyze Workload Stacktraces for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Collects and analyzes stacktraces/tracebacks from all pods in the workload for troubleshooting application issues. - [Tags] - ... logs - ... stacktraces - ... tracebacks - ... workload - ... troubleshooting - ... errors - ... 
access:read-only - # Skip pod-related checks if workload is scaled to 0 - IF not ${SKIP_STACKTRACE_CHECKS} - # Convert comma-separated string to list for excluded containers - @{EXCLUDED_CONTAINERS}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List - - # Fetch logs using RW.K8sLog library (same pattern as deployment healthcheck) - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=${WORKLOAD_TYPE} - ... workload_name=${WORKLOAD_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${LOG_AGE} - ... max_log_lines=${LOG_LINES} - ... max_log_bytes=${LOG_SIZE} - ... excluded_containers=${EXCLUDED_CONTAINERS} - - # Extract stacktraces from the log directory using the traceback library - ${tracebacks}= RW.LogAnalysis.ExtractTraceback.Extract Tracebacks - ... logs_dir=${log_dir} - - # Check total number of tracebacks extracted - ${total_tracebacks}= Get Length ${tracebacks} - - IF ${total_tracebacks} == 0 - # No tracebacks found - RW.Core.Add Pre To Report **šŸ“‹ No Stacktraces Found for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`**\n**Log Analysis Period:** ${LOG_AGE}\n**Max Log Lines:** ${LOG_LINES}\n**Max Log Size:** ${LOG_SIZE} bytes\n**Excluded Containers:** ${EXCLUDED_CONTAINER_NAMES}\n\nLog analysis completed successfully with no stacktraces detected. - ELSE - # Stacktraces found - create issues for each one - ${delimiter}= Evaluate '-' * 80 - - FOR ${traceback} IN @{tracebacks} - ${stacktrace}= Set Variable ${traceback["stacktrace"]} - ${timestamp}= Set Variable ${traceback["timestamp"]} - RW.Core.Add Issue - ... severity=2 - ... expected=No stacktraces should be present in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` logs in namespace `${NAMESPACE}` - ... actual=Stacktrace detected in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` logs in namespace `${NAMESPACE}` - ... title=Stacktrace Detected in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` - ... 
reproduce_hint=Check application logs for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in namespace `${NAMESPACE}` - ... details=${delimiter}\n${stacktrace}\n${delimiter} - ... next_steps=Review application logs for the root cause of the stacktrace\nCheck application configuration and resource limits\nInvestigate the specific error conditions that led to this stacktrace\nConsider scaling or restarting the ${WORKLOAD_TYPE} if issues persist\nMonitor application health and performance metrics - ... next_action=analyseStacktrace - ... observed_at=${timestamp} - END - - # Create consolidated report showing all stacktraces - ${stacktrace_strings}= Evaluate [tb["stacktrace"] for tb in ${tracebacks}] - ${agg_tracebacks}= Evaluate "\\n" + "\\n${delimiter}\\n".join(${stacktrace_strings}) - RW.Core.Add Pre To Report **šŸ” Stacktraces Found for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`**\n**Total Stacktraces:** ${total_tracebacks}\n**Log Analysis Period:** ${LOG_AGE}\n**Max Log Lines:** ${LOG_LINES}\n**Max Log Size:** ${LOG_SIZE} bytes\n**Excluded Containers:** ${EXCLUDED_CONTAINER_NAMES}\n\n${agg_tracebacks} - END - - # Clean up temporary log files RW.K8sLog.Cleanup Temp Files END \ No newline at end of file diff --git a/codebundles/k8s-applog-health/sli.robot b/codebundles/k8s-applog-health/sli.robot index d9a4d1f37..ab920e7f0 100755 --- a/codebundles/k8s-applog-health/sli.robot +++ b/codebundles/k8s-applog-health/sli.robot @@ -8,7 +8,6 @@ Library BuiltIn Library RW.Core Library RW.CLI Library RW.platform -Library RW.LogAnalysis.ExtractTraceback Library RW.K8sLog Library OperatingSystem @@ -74,8 +73,8 @@ Suite Initialization ... type=string ... description=Pattern used to exclude entries from log analysis when searching for errors. Use regex patterns to filter out false positives like JSON structures. ... pattern=.* - ... example="errors":\s*\[\]|"warnings":\s*\[\] - ... 
default="errors":\s*\[\]|\\bINFO\\b|\\bDEBUG\\b|\\bTRACE\\b|\\bSTART\\s*-\\s*|\\bSTART\\s*method\\b + ... example="errors":\\s*\\[\\]|"warnings":\\s*\\[\\] + ... default="errors":\\\\s*\\\\[\\\\]|\\\\bINFO\\\\b|\\\\bDEBUG\\\\b|\\\\bTRACE\\\\b|\\\\bSTART\\\\s*-\\\\s*|\\\\bSTART\\\\s*method\\\\b ${EXCLUDED_CONTAINER_NAMES}= RW.Core.Import User Variable EXCLUDED_CONTAINER_NAMES ... type=string ... description=Comma-separated list of container names to exclude from log analysis (e.g., linkerd-proxy, istio-proxy, vault-agent). @@ -145,7 +144,7 @@ Suite Initialization # DaemonSets don't scale to 0 in the traditional sense, so skip scale-down logic for them IF '${WORKLOAD_TYPE}' == 'daemonset' - Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is a DaemonSet - proceeding with stacktrace checks + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is a DaemonSet - proceeding with log checks Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} ELSE IF ${spec_replicas} == 0 Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is scaled to 0 replicas - returning perfect health score @@ -153,12 +152,12 @@ Suite Initialization # For scaled-down workloads, return a score of 1.0 to indicate "intentionally down" vs "broken" Set Suite Variable ${SKIP_HEALTH_CHECKS} ${True} ELSE - Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} has ${spec_replicas} desired replicas - proceeding with stacktrace checks + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} has ${spec_replicas} desired replicas - proceeding with log checks Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} END EXCEPT - Log Warning: Failed to check workload scale, continuing with normal stacktrace checks + Log Warning: Failed to check workload scale, continuing with normal log checks Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} END @@ -167,11 +166,11 @@ Get Deployment Scale Down Timestamp [Documentation] Attempts to determine when a deployment was scaled down by examining recent events ${scale_down_info}= Set Variable Unknown - IF ${spec_replicas} == 0 + IF ${spec_replicas} == 0 and 
'${WORKLOAD_TYPE}' == 'deployment' TRY # Check recent scaling events to find when it was scaled to 0 ${scaling_events}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --context ${CONTEXT} -n ${NAMESPACE} --sort-by='.lastTimestamp' -o json | jq -r '.items[] | select(.reason == "ScalingReplicaSet" and (.message | contains("${DEPLOYMENT_NAME}")) and (.message | contains("to 0"))) | {timestamp: .lastTimestamp, message: .message}' | jq -s 'sort_by(.timestamp) | reverse | .[0] // empty' + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --context ${CONTEXT} -n ${NAMESPACE} --sort-by='.lastTimestamp' -o json | jq -r '.items[] | select(.reason == "ScalingReplicaSet" and (.message | contains("${WORKLOAD_NAME}")) and (.message | contains("to 0"))) | {timestamp: .lastTimestamp, message: .message}' | jq -s 'sort_by(.timestamp) | reverse | .[0] // empty' ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... timeout_seconds=15 @@ -185,7 +184,7 @@ Get Deployment Scale Down Timestamp ELSE # Try checking replicaset history as fallback ${rs_history}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get replicasets --context ${CONTEXT} -n ${NAMESPACE} -l app=${DEPLOYMENT_NAME} -o json | jq -r '.items[] | select(.spec.replicas == 0) | {creation_time: .metadata.creationTimestamp, name: .metadata.name}' | jq -s 'sort_by(.creation_time) | reverse | .[0] // empty' + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get replicasets --context ${CONTEXT} -n ${NAMESPACE} -l app=${WORKLOAD_NAME} -o json | jq -r '.items[] | select(.spec.replicas == 0) | {creation_time: .metadata.creationTimestamp, name: .metadata.name}' | jq -s 'sort_by(.creation_time) | reverse | .[0] // empty' ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... 
timeout_seconds=15 @@ -197,7 +196,7 @@ Get Deployment Scale Down Timestamp Log Estimated scale-down time from ReplicaSet: ${scale_down_info} ELSE ${scale_down_info}= Set Variable Unable to determine - no recent scaling events found - Log Could not determine when deployment was scaled down + Log Could not determine when ${WORKLOAD_TYPE} ${WORKLOAD_NAME} was scaled down END END EXCEPT @@ -209,20 +208,20 @@ Get Deployment Scale Down Timestamp RETURN ${scale_down_info} *** Tasks *** -Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}` +Get Critical Log Errors and Score for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` [Documentation] Fetches logs and checks for critical error patterns that indicate application failures. [Tags] logs errors critical patterns # Skip if deployment is scaled down IF ${SKIP_HEALTH_CHECKS} - Log Skipping log analysis - deployment is scaled to 0 replicas + Log Skipping log analysis - ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is scaled to 0 replicas ${log_health_score}= Set Variable 1 # Perfect score for scaled deployment Set Suite Variable ${log_health_score} RW.Core.Push Metric ${log_health_score} sub_name=log_errors ELSE ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} ... namespace=${NAMESPACE} ... context=${CONTEXT} ... kubeconfig=${kubeconfig} @@ -230,14 +229,14 @@ Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}` ... max_log_lines=${MAX_LOG_LINES} ... max_log_bytes=${MAX_LOG_BYTES} ... excluded_containers=${EXCLUDED_CONTAINERS} - + # Use only critical error patterns for fast SLI checks @{critical_categories}= Create List GenericError AppFailure ${scan_results}= RW.K8sLog.Scan Logs For Issues ... log_dir=${log_dir} - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} ... 
namespace=${NAMESPACE} ... categories=${critical_categories} ... custom_patterns_file=sli_critical_patterns.json @@ -270,52 +269,7 @@ Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}` RW.Core.Push Metric ${log_health_score} sub_name=log_errors END -Get Stacktrace Health Score for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` - [Documentation] Checks for recent stacktraces/tracebacks related to the workload within a short time window, with filtering to reduce noise. - [Tags] stacktraces tracebacks errors recent fast - IF ${SKIP_HEALTH_CHECKS} - # For scaled-down deployments, return perfect score to indicate "intentionally down" vs "broken" - ${stacktrace_score}= Set Variable 1.0 - Set Suite Variable ${stacktrace_details} ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is intentionally scaled to 0 replicas - Score: ${stacktrace_score} - ELSE - # Fetch logs using RW.K8sLog library (same pattern as deployment healthcheck) - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=${WORKLOAD_TYPE} - ... workload_name=${WORKLOAD_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${RW_LOOKBACK_WINDOW} - ... max_log_lines=${MAX_LOG_LINES} - ... max_log_bytes=${MAX_LOG_BYTES} - ... excluded_containers=${EXCLUDED_CONTAINERS} - - # Extract stacktraces from the log directory - ${recentmost_stacktrace}= RW.LogAnalysis.ExtractTraceback.Extract Tracebacks - ... logs_dir=${log_dir} - ... 
fast_exit=${True} - - ${stacktrace_length}= Get Length ${recentmost_stacktrace} - - IF ${stacktrace_length} != 0 - # Stacktrace found - set score to 0 - ${stacktrace_score}= Set Variable 0 - ${delimiter}= Evaluate '-' * 150 - Set Suite Variable ${stacktrace_details} **Stacktrace(s) identified**:\n${delimiter}\n${recentmost_stacktrace}\n${delimiter} - ELSE - # No stacktraces found - set score to 1 - ${stacktrace_score}= Set Variable 1.0 - Set Suite Variable ${stacktrace_details} **No Stacktraces identified.**\n\nLog analysis completed successfully. - END - - # Clean up temporary log files - RW.K8sLog.Cleanup Temp Files - END - - Set Suite Variable ${stacktrace_score} - RW.Core.Push Metric ${stacktrace_score} sub_name=stacktrace_score - -Generate Application Health Score for `${DEPLOYMENT_NAME}` +Generate Application Health Score for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}` [Documentation] Generates the final applog health score and report details [Tags] score health applog @@ -323,11 +277,11 @@ Generate Application Health Score for `${DEPLOYMENT_NAME}` # For scaled-down deployments, return perfect score to indicate "intentionally down" vs "broken" # We distinguish scaled-down vs broken deployments through the log message and report details ${health_score}= Set Variable 1.0 - Log Deployment ${DEPLOYMENT_NAME} is intentionally scaled to 0 replicas (${SCALED_DOWN_INFO}) - Score: ${health_score} - RW.Core.Add to Report Applog Health Score: ${health_score} - Deployment intentionally scaled to 0 replicas + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is intentionally scaled to 0 replicas (${SCALED_DOWN_INFO}) - Score: ${health_score} + RW.Core.Add to Report Applog Health Score: ${health_score} - ${WORKLOAD_TYPE} ${WORKLOAD_NAME} intentionally scaled to 0 replicas ELSE - # Use the higher of log health score and stacktrace score as the final health score - ${health_score}= Evaluate max(${log_health_score}, ${stacktrace_score}) + # Use the log health score as the final health score. 
+ ${health_score}= Set Variable ${log_health_score} IF ${health_score} == 1.0 RW.Core.Add to Report Applog Health Score: ${health_score} - No applog issues detected in workload logs From 4573ff4f4df9e2fb980202601217c9cf5344ea86 Mon Sep 17 00:00:00 2001 From: Akshay Prabhakant Date: Wed, 11 Feb 2026 14:12:36 +0000 Subject: [PATCH 07/10] - shift the "Fetch Deployment Logs" task to applog codebundle - delete the "Detect Log Anomalies" task from daemonset-healthcheck and statefulset-healthcheck" --- codebundles/k8s-applog-health/runbook.robot | 67 ++++++++++++++++++- .../k8s-daemonset-healthcheck/runbook.robot | 50 -------------- .../k8s-deployment-healthcheck/runbook.robot | 65 ------------------ 3 files changed, 66 insertions(+), 116 deletions(-) diff --git a/codebundles/k8s-applog-health/runbook.robot b/codebundles/k8s-applog-health/runbook.robot index aa11cb6c3..ed46577b7 100755 --- a/codebundles/k8s-applog-health/runbook.robot +++ b/codebundles/k8s-applog-health/runbook.robot @@ -325,7 +325,72 @@ Analyze Application Log Patterns for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Name ${scan_results_str}= Evaluate json.dumps($scan_results, indent=2) json ${formatted_results}= RW.K8sLog.Format Scan Results For Display scan_results=${scan_results_str} - RW.Core.Add Pre To Report **Log Analysis Summary for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}`**\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} + RW.Core.Add Pre To Report **Log Analysis Summary for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` (Last ${LOG_LINES} lines, ${LOG_AGE} age) **\n**Health Score:** ${log_health_score}\n**Analysis Depth:** ${LOG_ANALYSIS_DEPTH}\n**Categories Analyzed:** ${LOG_PATTERN_CATEGORIES_STR}\n**Issues Found:** ${issues_count}\n\n${formatted_results} RW.K8sLog.Cleanup Temp Files + END + +Fetch Workload Logs for `${WORKLOAD_TYPE}` 
`${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` + [Documentation] Fetches and displays workload logs in the report for manual review. Note: Issues are not created by this task - see "Analyze Application Log Patterns" for automated issue detection. + [Tags] + ... logs + ... collection + ... ${WORKLOAD_TYPE} + ... troubleshooting + ... access:read-only + # Skip pod-related checks if deployment is scaled to 0 + IF not ${SKIP_POD_CHECKS} + # Fetch raw logs + ${workload_logs}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} logs ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} --tail=${LOG_LINES} --since=${LOG_AGE} + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... show_in_rwl_cheatsheet=true + ... render_in_commandlist=true + + IF ${workload_logs.returncode} == 0 + # Filter logs to remove repetitive health check messages and focus on meaningful content + ${filtered_logs}= RW.CLI.Run Cli + ... cmd=echo "${workload_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /|POST /health|probe|liveness|readiness)" | grep -E "(error|ERROR|warn|WARN|exception|Exception|fail|FAIL|fatal|FATAL|panic|stack|trace|timeout|connection.*refused|unable.*connect|authentication.*failed|denied|forbidden|unauthorized|500|502|503|504)" | tail -50 || echo "No significant errors or warnings found in recent logs" + ... env=${env} + ... include_in_history=false + + # Also get a sample of non-health-check logs for context + ${context_logs}= RW.CLI.Run Cli + ... cmd=echo "${workload_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /|POST /health|probe|liveness|readiness)" | head -20 | tail -10 + ... env=${env} + ... include_in_history=false + + ${history}= RW.CLI.Pop Shell History + + # Determine if logs are mostly health checks + ${total_lines}= RW.CLI.Run Cli + ... cmd=echo "${workload_logs.stdout}" | wc -l + ... env=${env} + ... 
include_in_history=false + + ${health_check_lines}= RW.CLI.Run Cli + ... cmd=echo "${workload_logs.stdout}" | grep -E "(Checking.*Health|Health.*Check|healthcheck|/health)" | wc -l + ... env=${env} + ... include_in_history=false + + # Handle empty output from wc -l by providing default values + ${total_lines_clean}= Set Variable If "${total_lines.stdout.strip()}" == "" 0 ${total_lines.stdout.strip()} + ${health_check_lines_clean}= Set Variable If "${health_check_lines.stdout.strip()}" == "" 0 ${health_check_lines.stdout.strip()} + + ${total_count}= Convert To Integer ${total_lines_clean} + ${health_count}= Convert To Integer ${health_check_lines_clean} + + # Create consolidated logs report + IF ${health_count} > ${total_count} * 0.8 + ${log_content}= Set Variable If "${context_logs.stdout.strip()}" != "" **šŸ” Filtered Error/Warning Logs:**\n${filtered_logs.stdout}\n\n**šŸ“ Sample Application Logs (Non-Health Check):**\n${context_logs.stdout} **šŸ” Filtered Error/Warning Logs:**\n${filtered_logs.stdout} + RW.Core.Add Pre To Report **šŸ“‹ Raw Workload Logs for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}`** (Last ${LOG_LINES} lines, ${LOG_AGE} age)\n**Total Log Lines:** ${total_count} | **Health Check Lines:** ${health_count}\n**ā„¹ļø Logs are mostly health check messages (${health_count}/${total_count} lines)**\n\n${log_content}\n\n**Commands Used:** ${history}\n\n**Note:** Automated issue detection is performed by the "Analyze Application Log Patterns" task. + ELSE + RW.Core.Add Pre To Report **šŸ“‹ Raw Workload Logs for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}`** (Last ${LOG_LINES} lines, ${LOG_AGE} age)\n**Total Log Lines:** ${total_count} | **Health Check Lines:** ${health_count}\n\n**šŸ“ Recent Application Logs:**\n${workload_logs.stdout}\n\n**Commands Used:** ${history}\n\n**Note:** Automated issue detection is performed by the "Analyze Application Log Patterns" task. 
+ END + ELSE + # Only add to report if fetch failed, don't create issue + ${history}= RW.CLI.Pop Shell History + RW.Core.Add Pre To Report **šŸ“‹ Raw Logs for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}`**\n\nāš ļø Unable to fetch workload logs (exit code ${workload_logs.returncode}).\n\n**STDERR:** ${workload_logs.stderr}\n\n**Commands Used:** ${history} + END END \ No newline at end of file diff --git a/codebundles/k8s-daemonset-healthcheck/runbook.robot b/codebundles/k8s-daemonset-healthcheck/runbook.robot index c6f0d3d29..e2ea9a654 100644 --- a/codebundles/k8s-daemonset-healthcheck/runbook.robot +++ b/codebundles/k8s-daemonset-healthcheck/runbook.robot @@ -21,56 +21,6 @@ Suite Setup Suite Initialization *** Tasks *** -Detect Log Anomalies for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Analyzes logs for repeating patterns, anomalous behavior, and unusual log volume that may indicate underlying issues. - [Tags] - ... logs - ... anomalies - ... patterns - ... volume - ... daemonset - ... ${DAEMONSET_NAME} - ... access:read-only - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=daemonset - ... workload_name=${DAEMONSET_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${LOG_AGE} - - ${anomaly_results}= RW.K8sLog.Analyze Log Anomalies - ... log_dir=${log_dir} - ... workload_type=daemonset - ... workload_name=${DAEMONSET_NAME} - ... namespace=${NAMESPACE} - - # Process anomaly issues - ${anomaly_issues}= Evaluate $anomaly_results.get('issues', []) - IF len($anomaly_issues) > 0 - FOR ${issue} IN @{anomaly_issues} - ${summarized_details}= RW.K8sLog.Summarize Log Issues issue_details=${issue["details"]} - ${next_steps_text}= Catenate SEPARATOR=\n @{issue["next_steps"]} - ${issue_timestamp}= Evaluate $issue.get('observed_at', '') - - RW.Core.Add Issue - ... severity=${issue["severity"]} - ... 
expected=No log anomalies should be present in DaemonSet `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` - ... actual=Log anomalies detected in DaemonSet `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` - ... title=${issue["title"]} - ... reproduce_hint=Use RW.K8sLog.Analyze Log Anomalies keyword to reproduce this analysis - ... details=${summarized_details} - ... next_steps=${next_steps_text} - ... observed_at=${issue_timestamp} - END - END - - # Add summary to report - ${anomaly_summary}= Catenate SEPARATOR=\n @{anomaly_results["summary"]} - RW.Core.Add Pre To Report Log Anomaly Analysis for DaemonSet ${DAEMONSET_NAME}:\n${anomaly_summary} - - RW.K8sLog.Cleanup Temp Files - Identify Recent Configuration Changes for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` [Documentation] Identifies recent configuration changes from ControllerRevision analysis that might be related to current issues. [Tags] diff --git a/codebundles/k8s-deployment-healthcheck/runbook.robot b/codebundles/k8s-deployment-healthcheck/runbook.robot index cb47daaa7..f825ea8fb 100755 --- a/codebundles/k8s-deployment-healthcheck/runbook.robot +++ b/codebundles/k8s-deployment-healthcheck/runbook.robot @@ -289,71 +289,6 @@ Detect Event Anomalies for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMES END END -Fetch Deployment Logs for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Fetches and displays deployment logs in the report for manual review. Note: Issues are not created by this task - see "Analyze Application Log Patterns" for automated issue detection. - [Tags] - ... logs - ... collection - ... deployment - ... troubleshooting - ... access:read-only - # Skip pod-related checks if deployment is scaled to 0 - IF not ${SKIP_POD_CHECKS} - # Fetch raw logs - ${deployment_logs}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} logs deployment/${DEPLOYMENT_NAME} --context ${CONTEXT} -n ${NAMESPACE} --tail=${LOG_LINES} --since=${LOG_AGE} - ... env=${env} - ... 
secret_file__kubeconfig=${kubeconfig} - ... show_in_rwl_cheatsheet=true - ... render_in_commandlist=true - - IF ${deployment_logs.returncode} == 0 - # Filter logs to remove repetitive health check messages and focus on meaningful content - ${filtered_logs}= RW.CLI.Run Cli - ... cmd=echo "${deployment_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /|POST /health|probe|liveness|readiness)" | grep -E "(error|ERROR|warn|WARN|exception|Exception|fail|FAIL|fatal|FATAL|panic|stack|trace|timeout|connection.*refused|unable.*connect|authentication.*failed|denied|forbidden|unauthorized|500|502|503|504)" | tail -50 || echo "No significant errors or warnings found in recent logs" - ... env=${env} - ... include_in_history=false - - # Also get a sample of non-health-check logs for context - ${context_logs}= RW.CLI.Run Cli - ... cmd=echo "${deployment_logs.stdout}" | grep -v -E "(Checking.*Health|Health.*Check|healthcheck|/health|GET /|POST /health|probe|liveness|readiness)" | head -20 | tail -10 - ... env=${env} - ... include_in_history=false - - ${history}= RW.CLI.Pop Shell History - - # Determine if logs are mostly health checks - ${total_lines}= RW.CLI.Run Cli - ... cmd=echo "${deployment_logs.stdout}" | wc -l - ... env=${env} - ... include_in_history=false - - ${health_check_lines}= RW.CLI.Run Cli - ... cmd=echo "${deployment_logs.stdout}" | grep -E "(Checking.*Health|Health.*Check|healthcheck|/health)" | wc -l - ... env=${env} - ... 
include_in_history=false - - # Handle empty output from wc -l by providing default values - ${total_lines_clean}= Set Variable If "${total_lines.stdout.strip()}" == "" 0 ${total_lines.stdout.strip()} - ${health_check_lines_clean}= Set Variable If "${health_check_lines.stdout.strip()}" == "" 0 ${health_check_lines.stdout.strip()} - - ${total_count}= Convert To Integer ${total_lines_clean} - ${health_count}= Convert To Integer ${health_check_lines_clean} - - # Create consolidated logs report - IF ${health_count} > ${total_count} * 0.8 - ${log_content}= Set Variable If "${context_logs.stdout.strip()}" != "" **šŸ” Filtered Error/Warning Logs:**\n${filtered_logs.stdout}\n\n**šŸ“ Sample Application Logs (Non-Health Check):**\n${context_logs.stdout} **šŸ” Filtered Error/Warning Logs:**\n${filtered_logs.stdout} - RW.Core.Add Pre To Report **šŸ“‹ Raw Deployment Logs for `${DEPLOYMENT_NAME}`** (Last ${LOG_LINES} lines, ${LOG_AGE} age)\n**Total Log Lines:** ${total_count} | **Health Check Lines:** ${health_count}\n**ā„¹ļø Logs are mostly health check messages (${health_count}/${total_count} lines)**\n\n${log_content}\n\n**Commands Used:** ${history}\n\n**Note:** Automated issue detection is performed by the "Analyze Application Log Patterns" task. - ELSE - RW.Core.Add Pre To Report **šŸ“‹ Raw Deployment Logs for `${DEPLOYMENT_NAME}`** (Last ${LOG_LINES} lines, ${LOG_AGE} age)\n**Total Log Lines:** ${total_count} | **Health Check Lines:** ${health_count}\n\n**šŸ“ Recent Application Logs:**\n${deployment_logs.stdout}\n\n**Commands Used:** ${history}\n\n**Note:** Automated issue detection is performed by the "Analyze Application Log Patterns" task. 
- END - ELSE - # Only add to report if fetch failed, don't create issue - ${history}= RW.CLI.Pop Shell History - RW.Core.Add Pre To Report **šŸ“‹ Raw Deployment Logs for `${DEPLOYMENT_NAME}`**\n\nāš ļø Unable to fetch deployment logs (exit code ${deployment_logs.returncode}).\n\n**STDERR:** ${deployment_logs.stderr}\n\n**Commands Used:** ${history} - END - END - Check Liveness Probe Configuration for Deployment `${DEPLOYMENT_NAME}` [Documentation] Validates if a Liveness probe has possible misconfigurations [Tags] From cc7700853fee942e527fc229cccdbd33ecc4e76b Mon Sep 17 00:00:00 2001 From: Akshay Prabhakant Date: Wed, 11 Feb 2026 19:53:32 +0530 Subject: [PATCH 08/10] added README for k8s-applog-health(the new application log codebundle);removed the redundant "detect log anomalies" task from statefulset-healthcheck --- codebundles/k8s-applog-health/README.md | 56 +++++++++++++++++++ .../k8s-statefulset-healthcheck/runbook.robot | 49 ---------------- 2 files changed, 56 insertions(+), 49 deletions(-) diff --git a/codebundles/k8s-applog-health/README.md b/codebundles/k8s-applog-health/README.md index e69de29bb..5bbe68a13 100644 --- a/codebundles/k8s-applog-health/README.md +++ b/codebundles/k8s-applog-health/README.md @@ -0,0 +1,56 @@ +# Kubernetes Application Log Health + +This codebundle provides tasks for triaging application log health of Kubernetes workloads (deployments, statefulsets, or daemonsets). It fetches pod logs, scans for error patterns, and reports issues with severity and next steps. + +## Tasks + +**Runbook** +- `Analyze Application Log Patterns for ${WORKLOAD_TYPE} ${WORKLOAD_NAME} in Namespace ${NAMESPACE}` — Fetches workload logs, scans for configurable error/exception patterns, creates issues for matches above the severity threshold, and reports a log health score and summary. 
+- `Fetch Workload Logs for ${WORKLOAD_TYPE} ${WORKLOAD_NAME} in Namespace ${NAMESPACE}` — Fetches and attaches workload logs to the report for manual review (no issue creation). + +**SLI** +- `Get Critical Log Errors and Score for ${WORKLOAD_TYPE} ${WORKLOAD_NAME}` — Fetches logs and scores health based on critical error patterns (e.g. GenericError, AppFailure) and container restarts; pushes a metric for SLI scoring. +- `Generate Application Health Score for ${WORKLOAD_TYPE} ${WORKLOAD_NAME}` — Computes the final applog health score and report details (e.g. scaled-to-zero vs healthy vs issues). + +### Log pattern categories + +Analysis uses pattern categories (configurable via `runbook_patterns.json` or `sli_critical_patterns.json`). Examples: + +- **GenericError** — exception, fatal, panic, crash, failed, failure (severity 1) +- **AppFailure** — application failed, service unavailable, connection refused, timeout, OOM, disk full, auth failures (severity 1) +- **StackTrace** — stack trace, exception in thread, java.lang., traceback, panic (severity 1) +- **Connection** — connection reset/timeout, network unreachable, socket error, DNS resolution failed (severity 2) +- **Timeout** — request/operation timeout, deadline exceeded, read/write timeout (severity 2) +- **Auth** — unauthorized, authentication error, invalid credentials, forbidden, token expired (severity 2) +- **Exceptions** — NullPointerException, IllegalArgumentException, SQLException, IOException, etc. (severity 2) +- **Resource** — resource exhausted, memory leak, CPU throttled, quota/rate limit exceeded (severity 2) +- **HealthyRecovery** — recovered from error, connection restored, retry successful (severity 4, informational) + +Exclude patterns (e.g. INFO/DEBUG/TRACE, health checks, heartbeats) reduce false positives. + +## Configuration + +The TaskSet/SLI requires initialization with secrets and user variables. Key variables: + +- `kubeconfig` — Secret containing cluster access (kubeconfig YAML). 
+- `KUBERNETES_DISTRIBUTION_BINARY` — CLI binary for Kubernetes (`kubectl` or `oc`). Default: `kubectl`. +- `CONTEXT` — Kubernetes context to use. +- `NAMESPACE` — Namespace of the workload. Leave blank to search all namespaces. +- `WORKLOAD_NAME` — Name of the deployment, statefulset, or daemonset to analyze. +- `WORKLOAD_TYPE` — Type of workload: `deployment`, `statefulset`, or `daemonset`. Default: `deployment`. +- `LOG_AGE` — Age of logs to fetch (e.g. `10m`). Default: `10m`. +- `LOG_LINES` / `LOG_SIZE` — Max lines or bytes per container for runbook log fetch. Defaults: 1000 lines, 2MB. +- `LOG_SEVERITY_THRESHOLD` — Minimum severity to create issues (1=critical … 5=info). Default: 3. +- `LOG_PATTERN_CATEGORIES` — Comma-separated categories to scan (e.g. `GenericError,AppFailure,Connection`). Default includes GenericError, AppFailure, Connection, Timeout, Auth, Exceptions, Resource, HealthyRecovery. +- `LOGS_EXCLUDE_PATTERN` — Regex to exclude lines from analysis (e.g. INFO/DEBUG, health checks). +- `EXCLUDED_CONTAINER_NAMES` — Comma-separated container names to skip (e.g. `linkerd-proxy,istio-proxy`). Default: `linkerd-proxy,istio-proxy,vault-agent`. +- `CONTAINER_RESTART_AGE` / `CONTAINER_RESTART_THRESHOLD` — Time window and threshold for container restarts (SLI). Defaults: e.g. `10m`, `1`. +- `LOG_SCAN_TIMEOUT` — Timeout in seconds for log scanning. Default: 300. + +## Requirements + +- A kubeconfig with RBAC permissions to list pods and read logs for the target workload and namespace. + +## TODO + +- [ ] Add additional documentation. 
diff --git a/codebundles/k8s-statefulset-healthcheck/runbook.robot b/codebundles/k8s-statefulset-healthcheck/runbook.robot index 84738e855..3a8fa9cde 100644 --- a/codebundles/k8s-statefulset-healthcheck/runbook.robot +++ b/codebundles/k8s-statefulset-healthcheck/runbook.robot @@ -21,55 +21,6 @@ Suite Setup Suite Initialization *** Tasks *** -Detect Log Anomalies for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` - [Documentation] Analyzes logs for repeating patterns, anomalous behavior, and unusual log volume that may indicate underlying issues. - [Tags] - ... logs - ... anomalies - ... patterns - ... volume - ... statefulset - ... ${STATEFULSET_NAME} - ... access:read-only - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=statefulset - ... workload_name=${STATEFULSET_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${LOG_AGE} - - ${anomaly_results}= RW.K8sLog.Analyze Log Anomalies - ... log_dir=${log_dir} - ... workload_type=statefulset - ... workload_name=${STATEFULSET_NAME} - ... namespace=${NAMESPACE} - - # Process anomaly issues - ${anomaly_issues}= Evaluate $anomaly_results.get('issues', []) - IF len($anomaly_issues) > 0 - FOR ${issue} IN @{anomaly_issues} - ${summarized_details}= RW.K8sLog.Summarize Log Issues issue_details=${issue["details"]} - ${next_steps_text}= Catenate SEPARATOR=\n @{issue["next_steps"]} - - RW.Core.Add Issue - ... severity=${issue["severity"]} - ... expected=No log anomalies should be present in StatefulSet `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` - ... actual=Log anomalies detected in StatefulSet `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` - ... title=${issue["title"]} - ... reproduce_hint=Use RW.K8sLog.Analyze Log Anomalies keyword to reproduce this analysis - ... details=${summarized_details} - ... next_steps=${next_steps_text} - ... 
observed_at=${issue["observed_at"]} - END - END - - # Add summary to report - ${anomaly_summary}= Catenate SEPARATOR=\n @{anomaly_results["summary"]} - RW.Core.Add Pre To Report Log Anomaly Analysis for StatefulSet ${STATEFULSET_NAME}:\n${anomaly_summary} - - RW.K8sLog.Cleanup Temp Files - Check Liveness Probe Configuration for StatefulSet `${STATEFULSET_NAME}` [Documentation] Validates if a Liveness probe has possible misconfigurations [Tags] From 284c11ee4517da58319aff8366544027a0da530f Mon Sep 17 00:00:00 2001 From: Akshay Prabhakant Date: Thu, 12 Feb 2026 10:52:57 +0530 Subject: [PATCH 09/10] add next_action kwarg to distinguish applog issues in platform --- codebundles/k8s-applog-health/runbook.robot | 1 + 1 file changed, 1 insertion(+) diff --git a/codebundles/k8s-applog-health/runbook.robot b/codebundles/k8s-applog-health/runbook.robot index ed46577b7..5c7f9ca2e 100755 --- a/codebundles/k8s-applog-health/runbook.robot +++ b/codebundles/k8s-applog-health/runbook.robot @@ -316,6 +316,7 @@ Analyze Application Log Patterns for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Name ... details=${summarized_details} ... next_steps=${next_steps} ... observed_at=${issue_timestamp} + ... 
next_action=analyzeApplog END END From 34619c1a686c1831af2ae075b659651e54fb66c3 Mon Sep 17 00:00:00 2001 From: Akshay Prabhakant Date: Thu, 12 Feb 2026 19:47:37 +0000 Subject: [PATCH 10/10] - added the stacktrace task and issue creation to applog-health - rectified the k8s-applog-health runbook and sli SKIP_HEALTH_CHECKS evaluation - removed the "Critical Log Errors" sub metric from deployment-healthcheck SLI(looks at logs, this codebundle shouldn't ideally be looking at logs) --- .../templates/k8s-applog-health-sli.yaml | 2 +- codebundles/k8s-applog-health/runbook.robot | 145 ++++++++++--- codebundles/k8s-applog-health/sli.robot | 205 ++++++++++++------ .../k8s-deployment-healthcheck/sli.robot | 67 +----- 4 files changed, 257 insertions(+), 162 deletions(-) diff --git a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml index aa3253be2..34fa0d3f5 100755 --- a/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml +++ b/codebundles/k8s-applog-health/.runwhen/templates/k8s-applog-health-sli.yaml @@ -50,7 +50,7 @@ spec: value: "true" - name: MAX_LOG_LINES value: "1000" - - name: MAX_LOG_SIZE + - name: MAX_LOG_BYTES value: "2097152" secretsProvided: {% if wb_version %} diff --git a/codebundles/k8s-applog-health/runbook.robot b/codebundles/k8s-applog-health/runbook.robot index 5c7f9ca2e..dfd340c09 100755 --- a/codebundles/k8s-applog-health/runbook.robot +++ b/codebundles/k8s-applog-health/runbook.robot @@ -187,41 +187,53 @@ Suite Initialization ... LOG_SCAN_TIMEOUT=${LOG_SCAN_TIMEOUT} Set Suite Variable ${env} ${env_dict} - # Check if the workload is scaled to 0 and handle appropriately - ${scale_check}= RW.CLI.Run Cli - ... 
cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... timeout_seconds=30 + # Check if workload is scaled to 0 and handle appropriately + # Different workload types have different field structures - TRY - ${scale_status}= Evaluate json.loads(r'''${scale_check.stdout}''') if r'''${scale_check.stdout}'''.strip() else {} json - ${spec_replicas}= Evaluate $scale_status.get('spec_replicas', 1) - + IF '${WORKLOAD_TYPE}' == 'daemonset' # DaemonSets don't scale to 0 in the traditional sense, so skip scale-down logic for them - IF '${WORKLOAD_TYPE}' == 'daemonset' - Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is a DaemonSet - proceeding with log analysis - Set Suite Variable ${SKIP_POD_CHECKS} ${False} - ELSE IF ${spec_replicas} == 0 - RW.Core.Add Issue - ... severity=4 - ... expected=${WORKLOAD_TYPE} `${WORKLOAD_NAME}` operational status documented - ... actual=${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is intentionally scaled to zero replicas - ... title=${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is Scaled Down (Informational) - ... reproduce_hint=kubectl get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o yaml - ... details=${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is currently scaled to 0 replicas (spec.replicas=0). This is an intentional configuration and not an error. All pod-related healthchecks have been skipped for efficiency. If the workload should be running, scale it up using:\nkubectl scale ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --replicas= --context ${CONTEXT} -n ${NAMESPACE} - ... next_steps=This is informational only. If the workload should be running, scale it up. 
- - RW.Core.Add Pre To Report **ā„¹ļø ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is scaled to 0 replicas - Skipping log analysis**\n**Available Condition:** ${scale_status.get('available_condition', 'Unknown')} - - Set Suite Variable ${SKIP_POD_CHECKS} ${True} + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is a DaemonSet - proceeding with log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + ELSE + IF '${WORKLOAD_TYPE}' == 'statefulset' + # StatefulSet: use current/updated replicas in addition to spec/ready + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), current_replicas: (.status.currentReplicas // 0), updated_replicas: (.status.updatedReplicas // 0)}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=30 ELSE - Set Suite Variable ${SKIP_POD_CHECKS} ${False} + # For deployments + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... 
timeout_seconds=30 END - EXCEPT - Log Warning: Failed to check workload scale, continuing with normal checks - Set Suite Variable ${SKIP_POD_CHECKS} ${False} + TRY + ${scale_status}= Evaluate json.loads(r'''${scale_check.stdout}''') if r'''${scale_check.stdout}'''.strip() else {} json + ${spec_replicas}= Evaluate $scale_status.get('spec_replicas', 1) + + # Try to determine when deployment was scaled down by checking recent events and replica set history + ${scale_down_info}= Get Deployment Scale Down Timestamp ${spec_replicas} + + IF ${spec_replicas} == 0 + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is scaled to 0 replicas - returning special health score + Log Scale down detected at: ${scale_down_info} + + # For scaled-down workloads, return a score of 1.0 to indicate "intentionally down" vs "broken" + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${True} + Set Suite Variable ${SCALED_DOWN_INFO} ${scale_down_info} + ELSE + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} has ${spec_replicas} desired replicas - proceeding with log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + END + + EXCEPT + Log Warning: Failed to check workload scale, continuing with normal log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + END END @@ -239,7 +251,7 @@ Analyze Application Log Patterns for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Name ... ${WORKLOAD_TYPE} ... access:read-only # Skip pod-related checks if deployment is scaled to 0 - IF not ${SKIP_POD_CHECKS} + IF not ${SKIP_HEALTH_CHECKS} # Temporarily suppress log warnings for excluded containers (they're expected) TRY ${log_dir}= RW.K8sLog.Fetch Workload Logs @@ -340,7 +352,7 @@ Fetch Workload Logs for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}` in Namespace `${NA ... troubleshooting ... access:read-only # Skip pod-related checks if deployment is scaled to 0 - IF not ${SKIP_POD_CHECKS} + IF not ${SKIP_HEALTH_CHECKS} # Fetch raw logs ${workload_logs}= RW.CLI.Run Cli ... 
cmd=${KUBERNETES_DISTRIBUTION_BINARY} logs ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} --tail=${LOG_LINES} --since=${LOG_AGE} @@ -394,4 +406,71 @@ Fetch Workload Logs for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}` in Namespace `${NA ${history}= RW.CLI.Pop Shell History RW.Core.Add Pre To Report **šŸ“‹ Raw Logs for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}`**\n\nāš ļø Unable to fetch workload logs (exit code ${workload_logs.returncode}).\n\n**STDERR:** ${workload_logs.stderr}\n\n**Commands Used:** ${history} END - END \ No newline at end of file + END + + +Analyze Workload Stacktraces for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` + [Documentation] Collects and analyzes stacktraces/tracebacks from all pods in the workload for troubleshooting application issues. + [Tags] + ... logs + ... stacktraces + ... tracebacks + ... ${WORKLOAD_TYPE} + ... troubleshooting + ... errors + ... access:read-only + # Skip pod-related checks if workload is scaled to 0 + IF not ${SKIP_HEALTH_CHECKS} + # Convert comma-separated string to list for excluded containers + @{EXCLUDED_CONTAINERS}= Run Keyword If "${EXCLUDED_CONTAINER_NAMES}" != "" Split String ${EXCLUDED_CONTAINER_NAMES} , ELSE Create List + + # Fetch logs using RW.K8sLog library (same pattern as deployment healthcheck) + ${log_dir}= RW.K8sLog.Fetch Workload Logs + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} + ... namespace=${NAMESPACE} + ... context=${CONTEXT} + ... kubeconfig=${kubeconfig} + ... log_age=${LOG_AGE} + ... max_log_lines=${LOG_LINES} + ... max_log_bytes=${LOG_SIZE} + ... excluded_containers=${EXCLUDED_CONTAINERS} + + # Extract stacktraces from the log directory using the traceback library + ${tracebacks}= RW.LogAnalysis.ExtractTraceback.Extract Tracebacks + ... 
logs_dir=${log_dir} + + # Check total number of tracebacks extracted + ${total_tracebacks}= Get Length ${tracebacks} + + IF ${total_tracebacks} == 0 + # No tracebacks found + RW.Core.Add Pre To Report **šŸ“‹ No Stacktraces Found for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`**\n**Log Analysis Period:** ${LOG_AGE}\n**Max Log Lines:** ${LOG_LINES}\n**Max Log Size:** ${LOG_SIZE} bytes\n**Excluded Containers:** ${EXCLUDED_CONTAINER_NAMES}\n\nLog analysis completed successfully with no stacktraces detected. + ELSE + # Stacktraces found - create issues for each one + ${delimiter}= Evaluate '-' * 80 + + FOR ${traceback} IN @{tracebacks} + ${stacktrace}= Set Variable ${traceback["stacktrace"]} + ${timestamp}= Set Variable ${traceback["timestamp"]} + RW.Core.Add Issue + ... severity=2 + ... expected=No stacktraces should be present in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` logs in namespace `${NAMESPACE}` + ... actual=Stacktrace detected in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` logs in namespace `${NAMESPACE}` + ... title=Stacktrace Detected in ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` + ... reproduce_hint=Check application logs for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in namespace `${NAMESPACE}` + ... details=${delimiter}\n${stacktrace}\n${delimiter} + ... next_steps=Review application logs for the root cause of the stacktrace\nCheck application configuration and resource limits\nInvestigate the specific error conditions that led to this stacktrace\nConsider scaling or restarting the ${WORKLOAD_TYPE} if issues persist\nMonitor application health and performance metrics + ... next_action=analyseStacktrace + ... 
observed_at=${timestamp}
+        END
+
+        # Create consolidated report showing all stacktraces
+        ${stacktrace_strings}=    Evaluate    [tb["stacktrace"] for tb in $tracebacks]
+        ${agg_tracebacks}=    Evaluate    "\\n" + "\\n${delimiter}\\n".join($stacktrace_strings)
+        RW.Core.Add Pre To Report    **šŸ” Stacktraces Found for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`**\n**Total Stacktraces:** ${total_tracebacks}\n**Log Analysis Period:** ${LOG_AGE}\n**Max Log Lines:** ${LOG_LINES}\n**Max Log Size:** ${LOG_SIZE} bytes\n**Excluded Containers:** ${EXCLUDED_CONTAINER_NAMES}\n\n${agg_tracebacks}
+    END
+
+    # Clean up temporary log files
+    RW.K8sLog.Cleanup Temp Files
+    END
diff --git a/codebundles/k8s-applog-health/sli.robot b/codebundles/k8s-applog-health/sli.robot
index ab920e7f0..ce65ddeac 100755
--- a/codebundles/k8s-applog-health/sli.robot
+++ b/codebundles/k8s-applog-health/sli.robot
@@ -9,6 +9,7 @@ Library             RW.Core
 Library             RW.CLI
 Library             RW.platform
 Library             RW.K8sLog
+Library             RW.LogAnalysis.ExtractTraceback
 Library             OperatingSystem
 Library             String

@@ -123,42 +124,51 @@ Suite Initialization

     # Check if workload is scaled to 0 and handle appropriately
     # Different workload types have different field structures
+    IF    '${WORKLOAD_TYPE}' == 'daemonset'
-        ${scale_check}=    RW.CLI.Run Cli
-        ...    cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .status.desiredNumberScheduled, ready_replicas: (.status.numberReady // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}'
-        ...    env=${env}
-        ...    secret_file__kubeconfig=${kubeconfig}
-        ...    
timeout_seconds=30 + # DaemonSets don't scale to 0 in the traditional sense, so skip scale-down logic for them + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is a DaemonSet - proceeding with log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} ELSE - # For deployments and statefulsets - ${scale_check}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... timeout_seconds=30 - END - - TRY - ${scale_status}= Evaluate json.loads(r'''${scale_check.stdout}''') if r'''${scale_check.stdout}'''.strip() else {} json - ${spec_replicas}= Evaluate $scale_status.get('spec_replicas', 1) + IF '${WORKLOAD_TYPE}' == 'statefulset' + # StatefulSet: use current/updated replicas in addition to spec/ready + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), current_replicas: (.status.currentReplicas // 0), updated_replicas: (.status.updatedReplicas // 0)}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=30 + ELSE + # For deployments + ${scale_check}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get ${WORKLOAD_TYPE}/${WORKLOAD_NAME} --context ${CONTEXT} -n ${NAMESPACE} -o json | jq '{spec_replicas: .spec.replicas, ready_replicas: (.status.readyReplicas // 0), available_condition: (.status.conditions[] | select(.type == "Available") | .status // "Unknown")}' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... 
timeout_seconds=30 + END - # DaemonSets don't scale to 0 in the traditional sense, so skip scale-down logic for them - IF '${WORKLOAD_TYPE}' == 'daemonset' - Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is a DaemonSet - proceeding with log checks - Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} - ELSE IF ${spec_replicas} == 0 - Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is scaled to 0 replicas - returning perfect health score + TRY + ${scale_status}= Evaluate json.loads(r'''${scale_check.stdout}''') if r'''${scale_check.stdout}'''.strip() else {} json + ${spec_replicas}= Evaluate $scale_status.get('spec_replicas', 1) + + # Try to determine when deployment was scaled down by checking recent events and replica set history + ${scale_down_info}= Get Deployment Scale Down Timestamp ${spec_replicas} - # For scaled-down workloads, return a score of 1.0 to indicate "intentionally down" vs "broken" - Set Suite Variable ${SKIP_HEALTH_CHECKS} ${True} - ELSE - Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} has ${spec_replicas} desired replicas - proceeding with log checks + IF ${spec_replicas} == 0 + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is scaled to 0 replicas - returning special health score + Log Scale down detected at: ${scale_down_info} + + # For scaled-down workloads, return a score of 1.0 to indicate "intentionally down" vs "broken" + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${True} + Set Suite Variable ${SCALED_DOWN_INFO} ${scale_down_info} + ELSE + Log ${WORKLOAD_TYPE} ${WORKLOAD_NAME} has ${spec_replicas} desired replicas - proceeding with log checks + Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} + END + + EXCEPT + Log Warning: Failed to check workload scale, continuing with normal log checks Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} END - - EXCEPT - Log Warning: Failed to check workload scale, continuing with normal log checks - Set Suite Variable ${SKIP_HEALTH_CHECKS} ${False} END Get Deployment Scale Down Timestamp @@ -166,45 +176,70 @@ Get Deployment Scale Down 
Timestamp [Documentation] Attempts to determine when a deployment was scaled down by examining recent events ${scale_down_info}= Set Variable Unknown - IF ${spec_replicas} == 0 and '${WORKLOAD_TYPE}' == 'deployment' - TRY - # Check recent scaling events to find when it was scaled to 0 - ${scaling_events}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --context ${CONTEXT} -n ${NAMESPACE} --sort-by='.lastTimestamp' -o json | jq -r '.items[] | select(.reason == "ScalingReplicaSet" and (.message | contains("${WORKLOAD_NAME}")) and (.message | contains("to 0"))) | {timestamp: .lastTimestamp, message: .message}' | jq -s 'sort_by(.timestamp) | reverse | .[0] // empty' - ... env=${env} - ... secret_file__kubeconfig=${kubeconfig} - ... timeout_seconds=15 - - IF '''${scaling_events.stdout}''' != '' - ${event_data}= Evaluate json.loads(r'''${scaling_events.stdout}''') if r'''${scaling_events.stdout}'''.strip() else {} json - ${timestamp}= Evaluate $event_data.get('timestamp', 'Unknown') - ${message}= Evaluate $event_data.get('message', 'Unknown') - ${scale_down_info}= Set Variable ${timestamp} (${message}) - Log Found scale-down event: ${scale_down_info} - ELSE - # Try checking replicaset history as fallback - ${rs_history}= RW.CLI.Run Cli - ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get replicasets --context ${CONTEXT} -n ${NAMESPACE} -l app=${WORKLOAD_NAME} -o json | jq -r '.items[] | select(.spec.replicas == 0) | {creation_time: .metadata.creationTimestamp, name: .metadata.name}' | jq -s 'sort_by(.creation_time) | reverse | .[0] // empty' + IF ${spec_replicas} == 0 + IF '${WORKLOAD_TYPE}' == 'deployment' + TRY + # Check recent scaling events to find when it was scaled to 0 + ${scaling_events}= RW.CLI.Run Cli + ... 
cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --context ${CONTEXT} -n ${NAMESPACE} --sort-by='.lastTimestamp' -o json | jq -r '.items[] | select(.reason == "ScalingReplicaSet" and (.message | contains("${WORKLOAD_NAME}")) and (.message | contains("to 0"))) | {timestamp: .lastTimestamp, message: .message}' | jq -s 'sort_by(.timestamp) | reverse | .[0] // empty' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=15 + + IF '''${scaling_events.stdout}''' != '' + ${event_data}= Evaluate json.loads(r'''${scaling_events.stdout}''') if r'''${scaling_events.stdout}'''.strip() else {} json + ${timestamp}= Evaluate $event_data.get('timestamp', 'Unknown') + ${message}= Evaluate $event_data.get('message', 'Unknown') + ${scale_down_info}= Set Variable ${timestamp} (${message}) + Log Found scale-down event: ${scale_down_info} + ELSE + # Try checking replicaset history as fallback + ${rs_history}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get replicasets --context ${CONTEXT} -n ${NAMESPACE} -l app=${WORKLOAD_NAME} -o json | jq -r '.items[] | select(.spec.replicas == 0) | {creation_time: .metadata.creationTimestamp, name: .metadata.name}' | jq -s 'sort_by(.creation_time) | reverse | .[0] // empty' + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... 
timeout_seconds=15 + + IF '''${rs_history.stdout}''' != '' + ${rs_data}= Evaluate json.loads(r'''${rs_history.stdout}''') if r'''${rs_history.stdout}'''.strip() else {} json + ${rs_time}= Evaluate $rs_data.get('creation_time', 'Unknown') + ${scale_down_info}= Set Variable Likely around ${rs_time} (based on ReplicaSet history) + Log Estimated scale-down time from ReplicaSet: ${scale_down_info} + ELSE + ${scale_down_info}= Set Variable Unable to determine - no recent scaling events found + Log Could not determine when ${WORKLOAD_TYPE} ${WORKLOAD_NAME} was scaled down + END + END + EXCEPT + Log Warning: Failed to determine scale-down timestamp + ${scale_down_info}= Set Variable Failed to determine scale-down time + END + ELSE IF '${WORKLOAD_TYPE}' == 'statefulset' + TRY + # StatefulSet: find scale-to-0 event via involvedObject + ${scaling_events}= RW.CLI.Run Cli + ... cmd=${KUBERNETES_DISTRIBUTION_BINARY} get events --context ${CONTEXT} -n ${NAMESPACE} --sort-by='.lastTimestamp' -o json | jq -r '.items[] | select(.involvedObject.kind == "StatefulSet" and .involvedObject.name == "${WORKLOAD_NAME}" and (.message | contains("to 0") or (contains("delete Pod") and contains("successful")))) | {timestamp: .lastTimestamp, message: .message}' | jq -s 'sort_by(.timestamp) | reverse | .[0] // empty' ... env=${env} ... secret_file__kubeconfig=${kubeconfig} ... 
timeout_seconds=15 - IF '''${rs_history.stdout}''' != '' - ${rs_data}= Evaluate json.loads(r'''${rs_history.stdout}''') if r'''${rs_history.stdout}'''.strip() else {} json - ${rs_time}= Evaluate $rs_data.get('creation_time', 'Unknown') - ${scale_down_info}= Set Variable Likely around ${rs_time} (based on ReplicaSet history) - Log Estimated scale-down time from ReplicaSet: ${scale_down_info} + IF '''${scaling_events.stdout}''' != '' + ${event_data}= Evaluate json.loads(r'''${scaling_events.stdout}''') if r'''${scaling_events.stdout}'''.strip() else {} json + ${timestamp}= Evaluate $event_data.get('timestamp', 'Unknown') + ${message}= Evaluate $event_data.get('message', 'Unknown') + ${scale_down_info}= Set Variable ${timestamp} (${message}) + Log Found scale-down event: ${scale_down_info} ELSE - ${scale_down_info}= Set Variable Unable to determine - no recent scaling events found + ${scale_down_info}= Set Variable Unable to determine - no recent scaling events found for StatefulSet Log Could not determine when ${WORKLOAD_TYPE} ${WORKLOAD_NAME} was scaled down END + EXCEPT + Log Warning: Failed to determine scale-down timestamp for StatefulSet + ${scale_down_info}= Set Variable Failed to determine scale-down time END - EXCEPT - Log Warning: Failed to determine scale-down timestamp - ${scale_down_info}= Set Variable Failed to determine scale-down time END END - + RETURN ${scale_down_info} *** Tasks *** @@ -269,6 +304,51 @@ Get Critical Log Errors and Score for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` RW.Core.Push Metric ${log_health_score} sub_name=log_errors END +Get Stacktrace Health Score for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` + [Documentation] Checks for recent stacktraces/tracebacks related to the workload within a short time window, with filtering to reduce noise. 
+ [Tags] stacktraces tracebacks errors recent fast + IF ${SKIP_HEALTH_CHECKS} + # For scaled-down deployments, return perfect score to indicate "intentionally down" vs "broken" + ${stacktrace_score}= Set Variable 1.0 + Set Suite Variable ${stacktrace_details} ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` is intentionally scaled to 0 replicas - Score: ${stacktrace_score} + ELSE + # Fetch logs using RW.K8sLog library (same pattern as deployment healthcheck) + ${log_dir}= RW.K8sLog.Fetch Workload Logs + ... workload_type=${WORKLOAD_TYPE} + ... workload_name=${WORKLOAD_NAME} + ... namespace=${NAMESPACE} + ... context=${CONTEXT} + ... kubeconfig=${kubeconfig} + ... log_age=${RW_LOOKBACK_WINDOW} + ... max_log_lines=${MAX_LOG_LINES} + ... max_log_bytes=${MAX_LOG_BYTES} + ... excluded_containers=${EXCLUDED_CONTAINERS} + + # Extract stacktraces from the log directory + ${recentmost_stacktrace}= RW.LogAnalysis.ExtractTraceback.Extract Tracebacks + ... logs_dir=${log_dir} + ... fast_exit=${True} + + ${stacktrace_length}= Get Length ${recentmost_stacktrace} + + IF ${stacktrace_length} != 0 + # Stacktrace found - set score to 0 + ${stacktrace_score}= Set Variable 0 + ${delimiter}= Evaluate '-' * 150 + Set Suite Variable ${stacktrace_details} **Stacktrace(s) identified**:\n${delimiter}\n${recentmost_stacktrace}\n${delimiter} + ELSE + # No stacktraces found - set score to 1 + ${stacktrace_score}= Set Variable 1.0 + Set Suite Variable ${stacktrace_details} **No Stacktraces identified.**\n\nLog analysis completed successfully. 
+ END
+
+    # Clean up temporary log files
+    RW.K8sLog.Cleanup Temp Files
+    END
+
+    Set Suite Variable    ${stacktrace_score}
+    RW.Core.Push Metric    ${stacktrace_score}    sub_name=stacktrace_score
+
 Generate Application Health Score for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}`
     [Documentation]    Generates the final applog health score and report details
     [Tags]    score    health    applog
@@ -278,15 +358,14 @@ Generate Application Health Score for `${WORKLOAD_TYPE}` `${WORKLOAD_NAME}`
         # We distinguish scaled-down vs broken deployments through the log message and report details
         ${health_score}=    Set Variable    1.0
         Log    ${WORKLOAD_TYPE} ${WORKLOAD_NAME} is intentionally scaled to 0 replicas (${SCALED_DOWN_INFO}) - Score: ${health_score}
-        RW.Core.Add to Report    Applog Health Score: ${health_score} - ${WORKLOAD_TYPE} ${WORKLOAD_NAME} intentionally scaled to 0 replicas
     ELSE
         # Use the log health score as the final health score.
-        ${health_score}=    Set Variable    ${log_health_score}
+        ${health_score}=    Evaluate    min(${log_health_score}, ${stacktrace_score})
         IF    ${health_score} == 1.0
-            RW.Core.Add to Report    Applog Health Score: ${health_score} - No applog issues detected in workload logs
+            RW.Core.Add to Report    Applog Health Score: ${health_score} - No applog issues or stacktraces detected in workload logs
         ELSE
-            RW.Core.Add to Report    Applog Health Score: ${health_score} - Applog issue(s) detected in workload logs: ${log_health_details}
+            RW.Core.Add to Report    Applog Health Score: ${health_score} - Applog issue(s) or stacktrace(s) detected in workload logs: ${log_health_details}
         END
     END
     RW.Core.Push Metric    ${health_score}
\ No newline at end of file
diff --git a/codebundles/k8s-deployment-healthcheck/sli.robot b/codebundles/k8s-deployment-healthcheck/sli.robot
index 7571c55e0..ef4f08444 100755
--- a/codebundles/k8s-deployment-healthcheck/sli.robot
+++ b/codebundles/k8s-deployment-healthcheck/sli.robot
@@ -129,7 +129,6 @@ Suite Initialization

     # Initialize score variables
     Set Suite Variable 
${container_restart_score} 0 - Set Suite Variable ${log_health_score} 0 Set Suite Variable ${pods_notready_score} 0 Set Suite Variable ${replica_score} 0 Set Suite Variable ${events_score} 0 @@ -246,67 +245,6 @@ Get Container Restarts and Score for Deployment `${DEPLOYMENT_NAME}` RW.Core.Push Metric ${container_restart_score} sub_name=container_restarts END -Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}` - [Documentation] Fetches logs and checks for critical error patterns that indicate application failures. - [Tags] logs errors critical patterns - - # Skip if deployment is scaled down - IF ${SKIP_HEALTH_CHECKS} - Log Skipping log analysis - deployment is scaled to 0 replicas - ${log_health_score}= Set Variable 1 # Perfect score for scaled deployment - Set Suite Variable ${log_health_score} - RW.Core.Push Metric ${log_health_score} sub_name=log_errors - ELSE - ${log_dir}= RW.K8sLog.Fetch Workload Logs - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} - ... namespace=${NAMESPACE} - ... context=${CONTEXT} - ... kubeconfig=${kubeconfig} - ... log_age=${RW_LOOKBACK_WINDOW} - ... max_log_lines=${MAX_LOG_LINES} - ... max_log_bytes=${MAX_LOG_BYTES} - ... excluded_containers=${EXCLUDED_CONTAINERS} - - # Use only critical error patterns for fast SLI checks - @{critical_categories}= Create List GenericError AppFailure - - ${scan_results}= RW.K8sLog.Scan Logs For Issues - ... log_dir=${log_dir} - ... workload_type=deployment - ... workload_name=${DEPLOYMENT_NAME} - ... namespace=${NAMESPACE} - ... categories=${critical_categories} - ... custom_patterns_file=sli_critical_patterns.json - ... 
excluded_containers=${EXCLUDED_CONTAINERS} - - # Post-process results to filter out patterns matching LOGS_EXCLUDE_PATTERN - TRY - IF $LOGS_EXCLUDE_PATTERN != "" - ${filtered_issues}= Evaluate [issue for issue in $scan_results.get('issues', []) if not __import__('re').search('${LOGS_EXCLUDE_PATTERN}', issue.get('details', ''), __import__('re').IGNORECASE)] modules=re - ${filtered_results}= Evaluate {**$scan_results, 'issues': $filtered_issues} - Set Test Variable ${scan_results} ${filtered_results} - END - EXCEPT - Log Warning: Failed to apply LOGS_EXCLUDE_PATTERN filter, using unfiltered results - END - - ${log_health_score}= RW.K8sLog.Calculate Log Health Score scan_results=${scan_results} - - # Store details for final score calculation logging - TRY - ${issues}= Evaluate $scan_results.get('issues', []) - ${issue_count}= Get Length ${issues} - Set Suite Variable ${log_health_details} ${issue_count} issues found - EXCEPT - Set Suite Variable ${log_health_details} analysis completed - END - - Set Suite Variable ${log_health_score} - RW.K8sLog.Cleanup Temp Files - RW.Core.Push Metric ${log_health_score} sub_name=log_errors - END - Get NotReady Pods Score for Deployment `${DEPLOYMENT_NAME}` [Documentation] Fetches a count of unready pods for the specific deployment. 
[Tags] access:read-only Pods Status Phase Ready Unready Running @@ -411,13 +349,12 @@ Generate Deployment Health Score for `${DEPLOYMENT_NAME}` Log Deployment ${DEPLOYMENT_NAME} is intentionally scaled to 0 replicas (${SCALED_DOWN_INFO}) - Score: ${health_score} ELSE # Calculate the normal health score - ${active_checks}= Set Variable 5 - ${deployment_health_score}= Evaluate (${container_restart_score} + ${log_health_score} + ${pods_notready_score} + ${replica_score} + ${events_score}) / ${active_checks} + ${active_checks}= Set Variable 4 + ${deployment_health_score}= Evaluate (${container_restart_score} + ${pods_notready_score} + ${replica_score} + ${events_score}) / ${active_checks} ${health_score}= Convert to Number ${deployment_health_score} 2 # Create a single line showing unhealthy components IF ${container_restart_score} < 1 Append To List ${unhealthy_components} Container Restarts (${container_restart_details}) - IF ${log_health_score} < 0.8 Append To List ${unhealthy_components} Log Health (${log_health_details}) IF ${pods_notready_score} < 1 Append To List ${unhealthy_components} Pod Readiness (${pod_readiness_details}) IF ${replica_score} < 1 Append To List ${unhealthy_components} Replica Status (${replica_details}) IF ${events_score} < 1 Append To List ${unhealthy_components} Warning Events (${events_details})