From 8bfc0d2ac9be8f7933200f6fc1b45cbeebd1376c Mon Sep 17 00:00:00 2001 From: Simran Date: Fri, 26 Jun 2026 11:59:34 -0700 Subject: [PATCH] Add spoke cluster pre/post-upgrade OCP health check steps for ACM interop --- .../spoke-upgrade-healthcheck/OWNERS | 3 + .../spoke-upgrade-healthcheck/README.md | 57 +++ ...-p2p-spoke-upgrade-healthcheck-commands.sh | 471 ++++++++++++++++++ ...poke-upgrade-healthcheck-ref.metadata.json | 11 + ...rop-p2p-spoke-upgrade-healthcheck-ref.yaml | 37 ++ .../spoke-upgrade-prehealthcheck/OWNERS | 3 + .../spoke-upgrade-prehealthcheck/README.md | 31 ++ ...p-spoke-upgrade-prehealthcheck-commands.sh | 454 +++++++++++++++++ ...e-upgrade-prehealthcheck-ref.metadata.json | 11 + ...-p2p-spoke-upgrade-prehealthcheck-ref.yaml | 28 ++ 10 files changed, 1106 insertions(+) create mode 100644 ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/OWNERS create mode 100644 ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/README.md create mode 100755 ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-commands.sh create mode 100644 ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-ref.metadata.json create mode 100644 ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-ref.yaml create mode 100644 ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/OWNERS create mode 100644 ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/README.md create mode 100755 ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-commands.sh create mode 100644 ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-ref.metadata.json create mode 100644 
ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-ref.yaml diff --git a/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/OWNERS b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/OWNERS new file mode 100644 index 0000000000000..a95e6bb67979e --- /dev/null +++ b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/OWNERS @@ -0,0 +1,3 @@ +approvers: &owners +- cspi-qe-ocp-lp +reviewers: *owners diff --git a/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/README.md b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/README.md new file mode 100644 index 0000000000000..a6ce494f02b05 --- /dev/null +++ b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/README.md @@ -0,0 +1,57 @@ +# acm-interop-p2p-spoke-upgrade-healthcheck + +Post-upgrade health check for the **ACM managed spoke** after `acm-interop-p2p-spoke-upgrade`. + +## Files + +| File | Purpose | +|------|---------| +| `acm-interop-p2p-spoke-upgrade-healthcheck-ref.yaml` | Step registry ref (verification-tests image, same as cucushift) | +| `acm-interop-p2p-spoke-upgrade-healthcheck-commands.sh` | Sets spoke `KUBECONFIG`, then runs cucushift upgrade healthcheck logic | + +The commands script embeds the body of +[`cucushift-upgrade-healthcheck-commands.sh`](../../../cucushift/upgrade/healthcheck/cucushift-upgrade-healthcheck-commands.sh) +without modifying the upstream file. When updating health check behavior, change cucushift first, then +refresh the embedded section in `acm-interop-p2p-spoke-upgrade-healthcheck-commands.sh`. + +## Checks + +1. MachineConfigPools — not Updating/Degraded, stable for 5 minutes (wait budget `max(ACM_SPOKE_MCP_READY_TIMEOUT_MINUTES, nodes × ACM_SPOKE_MCP_MINUTES_PER_NODE)`; defaults 210m / 35m per node) +2. 
Cluster operators — Available, not Progressing, not Degraded (`ACM_SPOKE_CO_READY_TIMEOUT_MINUTES`, default 45m) +3. Nodes — all Ready +4. Pods — status dump for reference + +## Step timeout + +| Setting | Default | Purpose | +|---------|---------|---------| +| `timeout` (ref.yaml) | 5h | MCP + CO budgets + margin (fits within 20h job) | +| `ACM_SPOKE_MCP_READY_TIMEOUT_MINUTES` | 210 | MCP wait floor | +| `ACM_SPOKE_MCP_MINUTES_PER_NODE` | 35 | MCP wait per node | +| `ACM_SPOKE_CO_READY_TIMEOUT_MINUTES` | 45 | CO stability poll budget | +| `grace_period` | 10m | EXIT trap diagnostics after step failure | + +## Requirements + +| File | Source | +|------|--------| +| `${SHARED_DIR}/kubeconfig` | Hub cluster (ipi-install) | +| Spoke admin kubeconfig | `${SHARED_DIR}/managed-cluster-kubeconfig` when names match, or Hive `ClusterDeployment` secret per spoke | + +Optional env `ACM_INTEROP_P2P__HEALTHCHECK__SPOKE_CLUSTERS` limits which spokes are checked. +When unset, all `ManagedCluster` resources except `local-cluster` are checked. 
+ +## Artifacts on failure + +| File | Content | +|------|---------| +| `spoke-<spokeName>-upgrade-healthcheck-failure.txt` | ClusterVersion, MCP describe, not-Ready node describe, unhealthy CO describe, MCO pods | + +## Typical workflow placement + +```yaml +test: +- ref: acm-interop-p2p-spoke-upgrade +- ref: acm-interop-p2p-spoke-upgrade-healthcheck +- ref: interop-tests-openshift-virtualization-upgrade-tests +``` diff --git a/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-commands.sh b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-commands.sh new file mode 100755 index 0000000000000..41ea11084aaaf --- /dev/null +++ b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-commands.sh @@ -0,0 +1,471 @@ +#!/bin/bash +# +# Post-upgrade health check on ACM managed spoke cluster(s). +# Health check logic mirrors cucushift-upgrade-healthcheck-commands.sh +# (ci-operator/step-registry/cucushift/upgrade/healthcheck/) with spoke-specific MCP wait tuning.
+# +set -euxo pipefail; shopt -s inherit_errexit + +typeset hubKubeconfig="${SHARED_DIR}/kubeconfig" +typeset spokeName='spoke' +typeset -a spokeNamesArr=() +typeset -a failedSpokesArr=() + +[[ -f "${hubKubeconfig}" ]] || { + echo "[ERROR] Hub kubeconfig not found: ${hubKubeconfig}" >&2 + exit 1 +} + +WriteSpokeHealthcheckFailureDiagnostics() { + typeset artifactFile="${ARTIFACT_DIR}/spoke-${spokeName}-upgrade-healthcheck-failure.txt" + typeset unhealthyMcp mcpName nodeName coName + + { + echo "=== oc get clusterversion ===" + oc get clusterversion version -o wide 2>&1 || true + echo + echo "=== oc describe clusterversion version ===" + oc describe clusterversion version 2>&1 || true + echo + echo "=== oc get machineconfigpools ===" + oc get machineconfigpools 2>&1 || true + echo + echo "=== MCP custom-columns (UPDATING/DEGRADED) ===" + oc get machineconfigpools \ + -o 'custom-columns=NAME:metadata.name,CONFIG:spec.configuration.name,UPDATING:status.conditions[?(@.type=="Updating")].status,DEGRADED:status.conditions[?(@.type=="Degraded")].status,DEGRADEDMACHINECOUNT:status.degradedMachineCount' \ + 2>&1 || true + unhealthyMcp="$(oc get machineconfigpools \ + -o 'custom-columns=NAME:metadata.name,UPDATING:status.conditions[?(@.type=="Updating")].status,DEGRADED:status.conditions[?(@.type=="Degraded")].status,DEGRADEDMACHINECOUNT:status.degradedMachineCount' \ + --no-headers 2>/dev/null | grep -Ev '[[:space:]]False[[:space:]]+False[[:space:]]+0[[:space:]]*$' || true)" + if [[ -n "${unhealthyMcp}" ]]; then + echo + echo "=== oc describe unhealthy MCPs ===" + while read -r mcpName _; do + [[ -n "${mcpName}" ]] || continue + echo "--- ${mcpName} ---" + oc describe machineconfigpool "${mcpName}" 2>&1 || true + done <<<"${unhealthyMcp}" + fi + echo + echo "=== oc get nodes ===" + oc get nodes -o wide 2>&1 || true + echo + echo "=== oc describe not-Ready nodes ===" + while read -r nodeName _; do + [[ -n "${nodeName}" ]] || continue + echo "--- ${nodeName} ---" + oc 
describe node "${nodeName}" 2>&1 || true + done < <(oc get nodes --no-headers 2>/dev/null | awk '$2 != "Ready" {print $1}' || true) + echo + echo "=== oc get clusteroperators ===" + oc get clusteroperators 2>&1 || true + echo + echo "=== oc describe unhealthy clusteroperators ===" + while read -r coName _; do + [[ -n "${coName}" ]] || continue + echo "--- ${coName} ---" + oc describe clusteroperator "${coName}" 2>&1 || true + done < <(oc get clusteroperators --no-headers 2>/dev/null | awk '$3 == "False" || $4 == "True" || $5 == "True" {print $1}' || true) + echo + echo "=== oc get pods -n openshift-machine-config-operator ===" + oc get pods -n openshift-machine-config-operator -o wide 2>&1 || true + } > "${artifactFile}" + : "Wrote spoke upgrade healthcheck diagnostics to ${artifactFile}" + true +} + +SpokeHealthcheckFailureCleanup() { + typeset ret=$? + if (( ret != 0 )); then + WriteSpokeHealthcheckFailureDiagnostics || true + fi + return "${ret}" +} + +DiscoverSpokeClusters() { + typeset -n spokeNamesRef="${1:?}" + typeset -a rawSpokeNamesArr=() + typeset spokeClusterName + + spokeNamesRef=() + if [[ -n "${ACM_INTEROP_P2P__HEALTHCHECK__SPOKE_CLUSTERS:-}" ]]; then + IFS=',' read -r -a rawSpokeNamesArr <<< "${ACM_INTEROP_P2P__HEALTHCHECK__SPOKE_CLUSTERS}" + for spokeClusterName in "${rawSpokeNamesArr[@]}"; do + spokeClusterName="$(echo -n "${spokeClusterName}" | xargs)" + [[ -n "${spokeClusterName}" ]] || { + echo "[ERROR] Empty spoke name in ACM_INTEROP_P2P__HEALTHCHECK__SPOKE_CLUSTERS" >&2 + return 1 + } + spokeNamesRef+=("${spokeClusterName}") + done + : "Using spoke list from ACM_INTEROP_P2P__HEALTHCHECK__SPOKE_CLUSTERS: ${spokeNamesRef[*]}" + return 0 + fi + + mapfile -t spokeNamesRef < <( + oc get managedcluster \ + -o jsonpath-as-json='{.items[*].metadata.name}' | + jq -r '.[] | select(. 
!= "local-cluster")'
+    )
+    if [[ ${#spokeNamesRef[@]} -eq 0 ]]; then
+        echo "[ERROR] No managed spoke clusters found on hub" >&2
+        return 1
+    fi
+
+    : "Discovered managed spoke clusters: ${spokeNamesRef[*]}"
+    true
+}
+
+ExtractSpokeKubeconfig() {
+    typeset targetSpokeName="${1:?}"
+    typeset spokeKubeconfigPath="${2:?}"
+    typeset adminKubeconfigSecretName
+    typeset managedClusterName
+
+    if [[ -f "${SHARED_DIR}/managed-cluster-kubeconfig" && -f "${SHARED_DIR}/managed-cluster-name" ]]; then
+        managedClusterName="$(tr -d '[:space:]' < "${SHARED_DIR}/managed-cluster-name")"
+        if [[ "${managedClusterName}" == "${targetSpokeName}" ]]; then
+            cp "${SHARED_DIR}/managed-cluster-kubeconfig" "${spokeKubeconfigPath}"
+            : "Using cached kubeconfig from ${SHARED_DIR}/managed-cluster-kubeconfig for spoke '${targetSpokeName}'"
+            return 0
+        fi
+    fi
+
+    if ! oc -n "${targetSpokeName}" get "clusterdeployment/${targetSpokeName}" 1>/dev/null; then
+        echo "[ERROR] ClusterDeployment '${targetSpokeName}' not found on hub; cannot resolve admin kubeconfig" >&2
+        return 1
+    fi
+
+    adminKubeconfigSecretName="$(
+        oc -n "${targetSpokeName}" get "clusterdeployment/${targetSpokeName}" \
+            -o jsonpath='{.spec.clusterMetadata.adminKubeconfigSecretRef.name}'
+    )"
+    [[ -n "${adminKubeconfigSecretName}" ]] || {
+        echo "[ERROR] adminKubeconfigSecretRef is empty for spoke '${targetSpokeName}'" >&2
+        return 1
+    }
+
+    oc -n "${targetSpokeName}" get "secret/${adminKubeconfigSecretName}" \
+        -o jsonpath='{.data.kubeconfig}' |
+        base64 -d > "${spokeKubeconfigPath}"
+
+    [[ -s "${spokeKubeconfigPath}" ]] || {
+        echo "[ERROR] Extracted kubeconfig for spoke '${targetSpokeName}' is empty" >&2
+        return 1
+    }
+
+    true
+}
+
+RunSpokeHealthcheck() {
+    # Runs the post-upgrade checks for one spoke ($1 = spoke name, $2 = spoke kubeconfig path); returns 0 or 1.
+    # Every step is checked explicitly: the caller uses 'if ! RunSpokeHealthcheck', and bash ignores errexit there.
+    typeset targetSpokeName="${1:?}"
+    typeset spokeKubeconfigPath="${2:?}"
+
+    spokeName="${targetSpokeName}"
+    export KUBECONFIG="${spokeKubeconfigPath}"
+    trap SpokeHealthcheckFailureCleanup EXIT
+
+    : "Post-upgrade health check for spoke '${spokeName}'"
+    OC="run_command_oc"
+    oc get machineconfig || true
+
+    : "Step #1: Make sure no degraded or updating mcp"
+    wait_mcp_continous_success || { trap - EXIT; WriteSpokeHealthcheckFailureDiagnostics || true; return 1; }
+
+    : "Step #2: check all cluster operators get stable and ready"
+    wait_clusteroperators_continous_success || { trap - EXIT; WriteSpokeHealthcheckFailureDiagnostics || true; return 1; }
+
+    : "Step #3: Make sure every machine is in 'Ready' status"
+    check_node || { trap - EXIT; WriteSpokeHealthcheckFailureDiagnostics || true; return 1; }
+
+    : "Step #4: check all pods are in status running or complete"
+    check_pod || { trap - EXIT; WriteSpokeHealthcheckFailureDiagnostics || true; return 1; }
+
+    trap - EXIT
+    : "Post-upgrade health check passed for spoke '${spokeName}'"
+    true
+}
+
+function run_command_oc() {
+    typeset -i try=0 max=40; typeset ret_val
+
+    if [[ "$#" -lt 1 ]]; then
+        return 0
+    fi
+
+    while (( try < max )); do
+        if ret_val=$(oc "$@" 2>&1); then
+            break
+        fi
+        (( try += 1 ))
+        sleep 3
+    done
+
+    if (( try == max )); then
+        echo >&2 "Run:[oc $*]"
+        echo >&2 "Get:[$ret_val]"
+        return 255
+    fi
+
+    echo "${ret_val}"
+}
+
+function check_clusteroperators() {
+    typeset -i tmp_ret=0; typeset tmp_clusteroperator input column last_column_name tmp_clusteroperator_1 rc null_version unavailable_operator degraded_operator
+
+    : "Make sure every operator does not report empty column"
+    tmp_clusteroperator=$(mktemp /tmp/health_check-script.XXXXXX)
+    input="${tmp_clusteroperator}"
+    ${OC} get clusteroperator >"${tmp_clusteroperator}"
+    column=$(head -n 1 "${tmp_clusteroperator}" | awk '{print NF}')
+    last_column_name=$(head -n 1 "${tmp_clusteroperator}" | awk '{print $NF}')
+    if [[ ${last_column_name} == "MESSAGE" ]]; then
+        (( column -= 1 ))
+        tmp_clusteroperator_1=$(mktemp /tmp/health_check-script.XXXXXX)
+        awk -v end=${column} '{for(i=1;i<=end;i++) printf $i"\t"; print ""}' "${tmp_clusteroperator}" > "${tmp_clusteroperator_1}"
+        input="${tmp_clusteroperator_1}"
+    fi
+
+    while IFS= read -r line
+    do
+        rc=$(echo "${line}" | awk '{print NF}')
+        if (( rc != column )); then
+            echo >&2 "The following line have empty column"
+            echo >&2 "${line}"
+            (( tmp_ret += 1 ))
+        fi
+    done < "${input}"
+    rm -f "${tmp_clusteroperator}"
+
+    : "Make sure every operator column
reports version" + if null_version=$(${OC} get clusteroperator -o json | jq '.items[] | select(.status.versions == null) | .metadata.name') && [[ ${null_version} != "" ]]; then + echo >&2 "Null Version: ${null_version}" + (( tmp_ret += 1 )) + fi + + : "Make sure every operator's AVAILABLE column is True" + if unavailable_operator=$(${OC} get clusteroperator | awk '$3 == "False"' | grep "False"); then + echo >&2 "Some operator's AVAILABLE is False" + echo >&2 "$unavailable_operator" + (( tmp_ret += 1 )) + fi + if ${OC} get clusteroperator -o jsonpath='{.items[].status.conditions[?(@.type=="Available")].status}'| grep -iv "True"; then + echo >&2 "Some operators are unavailable, pls run 'oc get clusteroperator -o json' to check" + (( tmp_ret += 1 )) + fi + + : "Make sure every operator's PROGRESSING column is False" + if progressing_operator=$(${OC} get clusteroperator | awk '$4 == "True"' | grep "True"); then + echo >&2 "Some operator's PROGRESSING is True" + echo >&2 "$progressing_operator" + (( tmp_ret += 1 )) + fi + if ${OC} get clusteroperator -o json | jq '.items[].status.conditions[] | select(.type == "Progressing") | .status' | grep -iv "False"; then + echo >&2 "Some operators are Progressing, pls run 'oc get clusteroperator -o json' to check" + (( tmp_ret += 1 )) + fi + + : "Make sure every operator's DEGRADED column is False" + # In disconnected install, openshift-sample often get into Degrade state, so it is better to remove them from cluster from flexy post-action + #degraded_operator=$(${OC} get clusteroperator | grep -v "openshift-sample" | awk '$5 == "True"') + if degraded_operator=$(${OC} get clusteroperator | awk '$5 == "True"' | grep "True"); then + echo >&2 "Some operator's DEGRADED is True" + echo >&2 "$degraded_operator" + (( tmp_ret += 1 )) + fi + #co_check=$(${OC} get clusteroperator -o json | jq '.items[] | select(.metadata.name != "openshift-samples") | .status.conditions[] | select(.type == "Degraded") | .status' | grep -iv 'False') + if 
${OC} get clusteroperator -o jsonpath='{.items[].status.conditions[?(@.type=="Degraded")].status}'| grep -iv 'False'; then + echo >&2 "Some operators are Degraded, pls run 'oc get clusteroperator -o json' to check" + (( tmp_ret += 1 )) + fi + + return "${tmp_ret}" +} + +function wait_clusteroperators_continous_success() { + typeset -i continuousSuccessfulCheck=0 passedCriteria=3 + typeset -i wMax=$(( ACM_SPOKE_CO_READY_TIMEOUT_MINUTES * 60 )) wInt=60 + SECONDS=0 + while (( SECONDS < wMax && continuousSuccessfulCheck < passedCriteria )); do + : "Checking CO status (${SECONDS}/${wMax}s, consecutive pass ${continuousSuccessfulCheck}/${passedCriteria})" + if check_clusteroperators; then + (( continuousSuccessfulCheck += 1 )) + else + : "cluster operators not ready yet, waiting (${SECONDS}/${wMax}s)" + continuousSuccessfulCheck=0 + fi + sleep "${wInt}" + done + if (( continuousSuccessfulCheck < passedCriteria )); then + echo >&2 "Some cluster operator does not get ready or not stable" + oc get co + return 1 + fi + : "All cluster operators status check PASSED" + true +} + +function check_mcp() { + typeset updating_mcp unhealthy_mcp tmp_output unhealthy_mcp_names mcp_name + + tmp_output=$(mktemp) + oc get machineconfigpools -o custom-columns=NAME:metadata.name,CONFIG:spec.configuration.name,UPDATING:status.conditions[?\(@.type==\"Updating\"\)].status --no-headers > "${tmp_output}" || true + if [[ -s "${tmp_output}" ]]; then + updating_mcp="$(grep -v "False" "${tmp_output}" || true)" + if [[ -n "${updating_mcp}" ]]; then + : "Some mcp is updating" + echo "${updating_mcp}" + rm -f "${tmp_output}" + return 1 + fi + else + : "Did not run 'oc get machineconfigpools' successfully" + rm -f "${tmp_output}" + return 1 + fi + + # Do not check UPDATED on purpose, because some paused mcp would not update itself until unpaused + oc get machineconfigpools -o 
custom-columns=NAME:metadata.name,CONFIG:spec.configuration.name,UPDATING:status.conditions[?\(@.type==\"Updating\"\)].status,DEGRADED:status.conditions[?\(@.type==\"Degraded\"\)].status,DEGRADEDMACHINECOUNT:status.degradedMachineCount --no-headers > "${tmp_output}" || true + if [[ -s "${tmp_output}" ]]; then + unhealthy_mcp="$(grep -v 'False.*False.*0' "${tmp_output}" || true)" + if [[ -n "${unhealthy_mcp}" ]]; then + : "Detected unhealthy mcp" + echo "${unhealthy_mcp}" + : "Real-time detected unhealthy mcp" + oc get machineconfigpools -o custom-columns=NAME:metadata.name,CONFIG:spec.configuration.name,UPDATING:status.conditions[?\(@.type==\"Updating\"\)].status,DEGRADED:status.conditions[?\(@.type==\"Degraded\"\)].status,DEGRADEDMACHINECOUNT:status.degradedMachineCount | grep -v 'False.*False.*0' || true + : "Real-time full mcp output" + oc get machineconfigpools + unhealthy_mcp_names=$(echo "${unhealthy_mcp}" | awk '{print $1}') + : "Using oc describe to check status of unhealthy mcp" + for mcp_name in ${unhealthy_mcp_names}; do + : "Name: ${mcp_name}" + oc describe mcp "${mcp_name}" || echo >&2 "oc describe mcp ${mcp_name} failed" + done + rm -f "${tmp_output}" + return 2 + fi + else + : "Did not run 'oc get machineconfigpools' successfully" + rm -f "${tmp_output}" + return 1 + fi + rm -f "${tmp_output}" + return 0 +} + +function wait_mcp_continous_success() { + typeset -i nodeCount nodeMcpMinutes totalMcpMinutes wMax wInt=30 + typeset -i continuousSuccessfulCheck=0 passedCriteria=10 # 5 min × 60 s ÷ 30 s interval + typeset -i continuousDegradedCheck=0 degradedCriteria=5 + typeset -i ret=0 + nodeCount="$(oc get node -o json | jq '.items | length')" + # Spoke post-upgrade MCP budget: max(floor, nodes × minutes-per-node). Cucushift default is 20m/node + # with no floor; raised after rehearsal #2065916519451201536 exhausted ~196m on a ~6-node spoke. 
+ nodeMcpMinutes=$(( nodeCount * ACM_SPOKE_MCP_MINUTES_PER_NODE )) + if (( nodeMcpMinutes > ACM_SPOKE_MCP_READY_TIMEOUT_MINUTES )); then + totalMcpMinutes=$nodeMcpMinutes + else + totalMcpMinutes=$ACM_SPOKE_MCP_READY_TIMEOUT_MINUTES + fi + wMax=$(( totalMcpMinutes * 60 )) + SECONDS=0 + while (( SECONDS < wMax && continuousSuccessfulCheck < passedCriteria )); do + : "Checking MCP status (${SECONDS}/${wMax}s, consecutive pass ${continuousSuccessfulCheck}/${passedCriteria})" + ret=0 + check_mcp || ret=$? + if [[ "${ret}" == "0" ]]; then + continuousDegradedCheck=0 + (( continuousSuccessfulCheck += 1 )) + elif [[ "${ret}" == "1" ]]; then + : "Some machines are updating, waiting (${SECONDS}/${wMax}s)" + continuousSuccessfulCheck=0 + continuousDegradedCheck=0 + else + continuousSuccessfulCheck=0 + : "Some machines are degraded (${continuousDegradedCheck}/${degradedCriteria}), waiting (${SECONDS}/${wMax}s)" + (( continuousDegradedCheck += 1 )) + if (( continuousDegradedCheck >= degradedCriteria )); then + break + fi + fi + sleep "${wInt}" + done + if (( continuousSuccessfulCheck < passedCriteria )); then + echo >&2 "Some mcp does not get ready or not stable" + oc get machineconfigpools + return 1 + fi + : "All mcp status check PASSED" + true +} + +function check_node() { + typeset -i nodeNumber readyNumber + nodeNumber="$( + oc get node \ + -o jsonpath-as-json='{.items[*].metadata.name}' | + jq 'length' + )" + readyNumber="$( + oc get node -o json | + jq '[.items[] | select(.status.conditions[]? 
| .type == "Ready" and .status == "True")] | length' + )" + if (( nodeNumber == readyNumber )); then + : "All nodes status check PASSED" + return 0 + fi + if (( readyNumber == 0 )); then + echo >&2 "No any ready node" + else + echo >&2 "We found failed node" + oc get node -o wide + fi + return 1 +} + +function check_pod() { + : "Show all pods status for reference/debug" + oc get pods --all-namespaces + true +} + +# Setup proxy if it's present in the shared dir +if test -f "${SHARED_DIR}/proxy-conf.sh" +then + # shellcheck disable=SC1091 + source "${SHARED_DIR}/proxy-conf.sh" +fi + +export KUBECONFIG="${hubKubeconfig}" +DiscoverSpokeClusters spokeNamesArr + +for spokeName in "${spokeNamesArr[@]}"; do + export KUBECONFIG="${hubKubeconfig}" + spokeName="$(echo -n "${spokeName}" | xargs)" + typeset spokeKubeconfigFile + spokeKubeconfigFile="$(mktemp /tmp/acm-spoke-healthcheck.XXXXXX.kubeconfig)" + + if ! ExtractSpokeKubeconfig "${spokeName}" "${spokeKubeconfigFile}"; then + failedSpokesArr+=("${spokeName}") + rm -f "${spokeKubeconfigFile}" + continue + fi + + if ! 
RunSpokeHealthcheck "${spokeName}" "${spokeKubeconfigFile}"; then + failedSpokesArr+=("${spokeName}") + fi + + rm -f "${spokeKubeconfigFile}" +done + +export KUBECONFIG="${hubKubeconfig}" + +if [[ ${#failedSpokesArr[@]} -gt 0 ]]; then + echo "[ERROR] Post-upgrade health check failed for spoke cluster(s): ${failedSpokesArr[*]}" >&2 + exit 1 +fi + +: "Post-upgrade health check passed for all spoke cluster(s): ${spokeNamesArr[*]}" +true diff --git a/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-ref.metadata.json b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-ref.metadata.json new file mode 100644 index 0000000000000..22db751b1b169 --- /dev/null +++ b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-ref.metadata.json @@ -0,0 +1,11 @@ +{ + "path": "acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-ref.yaml", + "owners": { + "approvers": [ + "cspi-qe-ocp-lp" + ], + "reviewers": [ + "cspi-qe-ocp-lp" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-ref.yaml b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-ref.yaml new file mode 100644 index 0000000000000..e6879c49d39d7 --- /dev/null +++ b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-healthcheck/acm-interop-p2p-spoke-upgrade-healthcheck-ref.yaml @@ -0,0 +1,37 @@ +ref: + as: acm-interop-p2p-spoke-upgrade-healthcheck + from_image: + namespace: ci + name: verification-tests + tag: latest + grace_period: 10m + commands: acm-interop-p2p-spoke-upgrade-healthcheck-commands.sh + timeout: 5h0m0s + cli: latest + resources: + limits: + cpu: "1" + requests: + cpu: 100m + memory: 100Mi + env: + - name: 
ACM_INTEROP_P2P__HEALTHCHECK__SPOKE_CLUSTERS + default: "" + documentation: Optional comma-separated ManagedCluster names. When empty, all managed spokes except local-cluster are checked. + - name: ACM_SPOKE_MCP_MINUTES_PER_NODE + default: "35" + documentation: "Minutes of MCP wait budget per node (cucushift default 20). Rehearsal #2065916519451201536 needed ~33m/node on a 6-node spoke." + - name: ACM_SPOKE_MCP_READY_TIMEOUT_MINUTES + default: "210" + documentation: Minimum MCP wait budget in minutes regardless of node count. Floor above rehearsal failure (~196m) before cucushift 20m/node cap (120m on 6 nodes). + - name: ACM_SPOKE_CO_READY_TIMEOUT_MINUTES + default: "45" + documentation: Cluster operator stability wait budget in minutes (cucushift default 30). Poll interval is 60s. + documentation: |- + Post-upgrade health check on ACM managed spoke cluster(s). Uses the same logic as + cucushift-upgrade-healthcheck per spoke (MCP, cluster operators, nodes, pods). + Discovers spokes from the hub or from ACM_INTEROP_P2P__HEALTHCHECK__SPOKE_CLUSTERS. + Reuses ${SHARED_DIR}/managed-cluster-kubeconfig when the spoke name matches + ${SHARED_DIR}/managed-cluster-name. Step timeout (5h) covers sequential checks; + increase job timeout when many spokes are registered. + On failure, writes spoke-<spokeName>-upgrade-healthcheck-failure.txt to ${ARTIFACT_DIR}.
diff --git a/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/OWNERS b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/OWNERS new file mode 100644 index 0000000000000..41d144d3728a2 --- /dev/null +++ b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/OWNERS @@ -0,0 +1,3 @@ +approvers: &owners +- cspi-qe-ocp-lp +reviewers: *owners diff --git a/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/README.md b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/README.md new file mode 100644 index 0000000000000..dab5fedf5141f --- /dev/null +++ b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/README.md @@ -0,0 +1,31 @@ +# ACM Interop P2P Spoke Upgrade Pre-healthcheck + +## Purpose + +Run cucushift-style pre-upgrade health checks against every ACM managed spoke cluster +on the hub before `acm-interop-p2p-spoke-upgrade`. + +## Process + +1. Connect to the ACM hub using `${SHARED_DIR}/kubeconfig`. +2. Resolve the spoke list from `ACM_INTEROP_P2P__PREHEALTHCHECK__SPOKE_CLUSTERS` when set, + or discover all `ManagedCluster` resources except `local-cluster`. +3. For each spoke, resolve an admin kubeconfig from `${SHARED_DIR}/managed-cluster-kubeconfig` + (when the spoke name matches `${SHARED_DIR}/managed-cluster-name`) or from the Hive + `ClusterDeployment` admin kubeconfig secret. +4. Run MCP, ClusterOperator, node, and pod checks on each spoke (same logic as + `cucushift-upgrade-prehealthcheck`). +5. On per-spoke failure, write `spoke-<spokeName>-upgrade-prehealthcheck-failure.txt` to + `${ARTIFACT_DIR}`. + +## Environment Variables + +| Name | Default | Description | +| --- | --- | --- | +| `ACM_INTEROP_P2P__PREHEALTHCHECK__SPOKE_CLUSTERS` | empty | Optional comma-separated spoke names. When empty, all managed spokes are checked.
| + +## Requirements + +- Hub kubeconfig at `${SHARED_DIR}/kubeconfig` +- At least one managed spoke registered with ACM +- Hive-provisioned spokes must expose `ClusterDeployment.spec.clusterMetadata.adminKubeconfigSecretRef` diff --git a/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-commands.sh b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-commands.sh new file mode 100755 index 0000000000000..e12e9360dbf18 --- /dev/null +++ b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-commands.sh @@ -0,0 +1,454 @@ +#!/bin/bash +# +# Pre-upgrade health check on ACM managed spoke cluster(s). +# Health check logic mirrors cucushift-upgrade-prehealthcheck-commands.sh +# (ci-operator/step-registry/cucushift/upgrade/prehealthcheck/) run per spoke kubeconfig. +# +set -euxo pipefail; shopt -s inherit_errexit + +typeset hubKubeconfig="${SHARED_DIR}/kubeconfig" +typeset spokeName='spoke' +typeset -a spokeNamesArr=() +typeset -a failedSpokesArr=() + +[[ -f "${hubKubeconfig}" ]] || { + echo "[ERROR] Hub kubeconfig not found: ${hubKubeconfig}" >&2 + exit 1 +} + +WriteSpokePrehealthcheckFailureDiagnostics() { + typeset artifactFile="${ARTIFACT_DIR}/spoke-${spokeName}-upgrade-prehealthcheck-failure.txt" + typeset unhealthyMcp mcpName nodeName coName + + { + echo "=== oc get clusterversion ===" + oc get clusterversion version -o wide 2>&1 || true + echo + echo "=== oc describe clusterversion version ===" + oc describe clusterversion version 2>&1 || true + echo + echo "=== oc get machineconfigpools ===" + oc get machineconfigpools 2>&1 || true + echo + echo "=== MCP custom-columns (UPDATING/DEGRADED) ===" + oc get machineconfigpools \ + -o 
'custom-columns=NAME:metadata.name,CONFIG:spec.configuration.name,UPDATING:status.conditions[?(@.type=="Updating")].status,DEGRADED:status.conditions[?(@.type=="Degraded")].status,DEGRADEDMACHINECOUNT:status.degradedMachineCount' \ + 2>&1 || true + unhealthyMcp="$(oc get machineconfigpools \ + -o 'custom-columns=NAME:metadata.name,UPDATING:status.conditions[?(@.type=="Updating")].status,DEGRADED:status.conditions[?(@.type=="Degraded")].status,DEGRADEDMACHINECOUNT:status.degradedMachineCount' \ + --no-headers 2>/dev/null | grep -Ev '[[:space:]]False[[:space:]]+False[[:space:]]+0[[:space:]]*$' || true)" + if [[ -n "${unhealthyMcp}" ]]; then + echo + echo "=== oc describe unhealthy MCPs ===" + while read -r mcpName _; do + [[ -n "${mcpName}" ]] || continue + echo "--- ${mcpName} ---" + oc describe machineconfigpool "${mcpName}" 2>&1 || true + done <<<"${unhealthyMcp}" + fi + echo + echo "=== oc get nodes ===" + oc get nodes -o wide 2>&1 || true + echo + echo "=== oc describe not-Ready nodes ===" + while read -r nodeName _; do + [[ -n "${nodeName}" ]] || continue + echo "--- ${nodeName} ---" + oc describe node "${nodeName}" 2>&1 || true + done < <(oc get nodes --no-headers 2>/dev/null | awk '$2 != "Ready" {print $1}' || true) + echo + echo "=== oc get clusteroperators ===" + oc get clusteroperators 2>&1 || true + echo + echo "=== oc describe unhealthy clusteroperators ===" + while read -r coName _; do + [[ -n "${coName}" ]] || continue + echo "--- ${coName} ---" + oc describe clusteroperator "${coName}" 2>&1 || true + done < <(oc get clusteroperators --no-headers 2>/dev/null | awk '$3 == "False" || $4 == "True" || $5 == "True" {print $1}' || true) + echo + echo "=== oc get pods -n openshift-machine-config-operator ===" + oc get pods -n openshift-machine-config-operator -o wide 2>&1 || true + } > "${artifactFile}" + : "Wrote spoke upgrade prehealthcheck diagnostics to ${artifactFile}" + true +} + +SpokePrehealthcheckFailureCleanup() { + typeset ret=$? 
+ if (( ret != 0 )); then + WriteSpokePrehealthcheckFailureDiagnostics || true + fi + return "${ret}" +} + +DiscoverSpokeClusters() { + typeset -n spokeNamesRef="${1:?}" + typeset -a rawSpokeNamesArr=() + typeset spokeClusterName + + spokeNamesRef=() + if [[ -n "${ACM_INTEROP_P2P__PREHEALTHCHECK__SPOKE_CLUSTERS:-}" ]]; then + IFS=',' read -r -a rawSpokeNamesArr <<< "${ACM_INTEROP_P2P__PREHEALTHCHECK__SPOKE_CLUSTERS}" + for spokeClusterName in "${rawSpokeNamesArr[@]}"; do + spokeClusterName="$(echo -n "${spokeClusterName}" | xargs)" + [[ -n "${spokeClusterName}" ]] || { + echo "[ERROR] Empty spoke name in ACM_INTEROP_P2P__PREHEALTHCHECK__SPOKE_CLUSTERS" >&2 + return 1 + } + spokeNamesRef+=("${spokeClusterName}") + done + : "Using spoke list from ACM_INTEROP_P2P__PREHEALTHCHECK__SPOKE_CLUSTERS: ${spokeNamesRef[*]}" + return 0 + fi + + mapfile -t spokeNamesRef < <( + oc get managedcluster \ + -o jsonpath-as-json='{.items[*].metadata.name}' | + jq -r '.[] | select(. != "local-cluster")' + ) + if [[ ${#spokeNamesRef[@]} -eq 0 ]]; then + echo "[ERROR] No managed spoke clusters found on hub" >&2 + return 1 + fi + + : "Discovered managed spoke clusters: ${spokeNamesRef[*]}" + true +} + +ExtractSpokeKubeconfig() { + typeset targetSpokeName="${1:?}" + typeset spokeKubeconfigPath="${2:?}" + typeset adminKubeconfigSecretName + typeset managedClusterName + + if [[ -f "${SHARED_DIR}/managed-cluster-kubeconfig" && -f "${SHARED_DIR}/managed-cluster-name" ]]; then + managedClusterName="$(tr -d '[:space:]' < "${SHARED_DIR}/managed-cluster-name")" + if [[ "${managedClusterName}" == "${targetSpokeName}" ]]; then + cp "${SHARED_DIR}/managed-cluster-kubeconfig" "${spokeKubeconfigPath}" + : "Using cached kubeconfig from ${SHARED_DIR}/managed-cluster-kubeconfig for spoke '${targetSpokeName}'" + return 0 + fi + fi + + if ! 
oc -n "${targetSpokeName}" get "clusterdeployment/${targetSpokeName}" 1>/dev/null; then + echo "[ERROR] ClusterDeployment '${targetSpokeName}' not found on hub; cannot resolve admin kubeconfig" >&2 + return 1 + fi + + adminKubeconfigSecretName="$( + oc -n "${targetSpokeName}" get "clusterdeployment/${targetSpokeName}" \ + -o jsonpath='{.spec.clusterMetadata.adminKubeconfigSecretRef.name}' + )" + [[ -n "${adminKubeconfigSecretName}" ]] || { + echo "[ERROR] adminKubeconfigSecretRef is empty for spoke '${targetSpokeName}'" >&2 + return 1 + } + + oc -n "${targetSpokeName}" get "secret/${adminKubeconfigSecretName}" \ + -o jsonpath='{.data.kubeconfig}' | + base64 -d > "${spokeKubeconfigPath}" + + [[ -s "${spokeKubeconfigPath}" ]] || { + echo "[ERROR] Extracted kubeconfig for spoke '${targetSpokeName}' is empty" >&2 + return 1 + } + + true +} + +RunSpokePrehealthcheck() { + typeset targetSpokeName="${1:?}" + typeset spokeKubeconfigPath="${2:?}" + + spokeName="${targetSpokeName}" + export KUBECONFIG="${spokeKubeconfigPath}" + trap SpokePrehealthcheckFailureCleanup EXIT + + : "Pre-upgrade health check for spoke '${spokeName}'" + + OC="run_command_oc" + + oc get machineconfig + + : "Step #1: Make sure no degraded or updating mcp" + wait_mcp_continous_success + + : "Step #2: check all cluster operators get stable and ready" + wait_clusteroperators_continous_success + + : "Step #3: Make sure every machine is in 'Ready' status" + check_node + + : "Step #4: check all pods are in status running or complete" + check_pod + + trap - EXIT + : "Pre-upgrade health check passed for spoke '${spokeName}'" + true +} + +function run_command_oc() { + typeset -i try=0 max=40; typeset ret_val + + if [[ "$#" -lt 1 ]]; then + return 0 + fi + + while (( try < max )); do + if ret_val=$(oc "$@" 2>&1); then + break + fi + (( try += 1 )) + sleep 3 + done + + if (( try == max )); then + echo >&2 "Run:[oc $*]" + echo >&2 "Get:[$ret_val]" + return 255 + fi + + echo "${ret_val}" +} + +function 
check_clusteroperators() { + typeset -i tmp_ret=0; typeset tmp_clusteroperator input column last_column_name tmp_clusteroperator_1 rc null_version unavailable_operator degraded_operator + + : "Make sure every operator does not report empty column" + tmp_clusteroperator=$(mktemp /tmp/health_check-script.XXXXXX) + input="${tmp_clusteroperator}" + ${OC} get clusteroperator >"${tmp_clusteroperator}" + column=$(head -n 1 "${tmp_clusteroperator}" | awk '{print NF}') + last_column_name=$(head -n 1 "${tmp_clusteroperator}" | awk '{print $NF}') + if [[ ${last_column_name} == "MESSAGE" ]]; then + (( column -= 1 )) + tmp_clusteroperator_1=$(mktemp /tmp/health_check-script.XXXXXX) + awk -v end=${column} '{for(i=1;i<=end;i++) printf $i"\t"; print ""}' "${tmp_clusteroperator}" > "${tmp_clusteroperator_1}" + input="${tmp_clusteroperator_1}" + fi + + while IFS= read -r line + do + rc=$(echo "${line}" | awk '{print NF}') + if (( rc != column )); then + echo >&2 "The following line have empty column" + echo >&2 "${line}" + (( tmp_ret += 1 )) + fi + done < "${input}" + rm -f "${tmp_clusteroperator}" + + : "Make sure every operator column reports version" + if null_version=$(${OC} get clusteroperator -o json | jq '.items[] | select(.status.versions == null) | .metadata.name') && [[ ${null_version} != "" ]]; then + echo >&2 "Null Version: ${null_version}" + (( tmp_ret += 1 )) + fi + + : "Make sure every operator's AVAILABLE column is True" + if unavailable_operator=$(${OC} get clusteroperator | awk '$3 == "False"' | grep "False"); then + echo >&2 "Some operator's AVAILABLE is False" + echo >&2 "$unavailable_operator" + (( tmp_ret += 1 )) + fi + if ${OC} get clusteroperator -o jsonpath='{.items[].status.conditions[?(@.type=="Available")].status}'| grep -iv "True"; then + echo >&2 "Some operators are unavailable, pls run 'oc get clusteroperator -o json' to check" + (( tmp_ret += 1 )) + fi + + : "Make sure every operator's PROGRESSING column is False" + if progressing_operator=$(${OC} get 
clusteroperator | awk '$4 == "True"' | grep "True"); then + echo >&2 "Some operator's PROGRESSING is True" + echo >&2 "$progressing_operator" + (( tmp_ret += 1 )) + fi + if ${OC} get clusteroperator -o json | jq '.items[].status.conditions[] | select(.type == "Progressing") | .status' | grep -iv "False"; then + echo >&2 "Some operators are Progressing, pls run 'oc get clusteroperator -o json' to check" + (( tmp_ret += 1 )) + fi + + : "Make sure every operator's DEGRADED column is False" + if degraded_operator=$(${OC} get clusteroperator | awk '$5 == "True"' | grep "True"); then + echo >&2 "Some operator's DEGRADED is True" + echo >&2 "$degraded_operator" + (( tmp_ret += 1 )) + fi + if ${OC} get clusteroperator -o jsonpath='{.items[].status.conditions[?(@.type=="Degraded")].status}'| grep -iv 'False'; then + echo >&2 "Some operators are Degraded, pls run 'oc get clusteroperator -o json' to check" + (( tmp_ret += 1 )) + fi + + return "${tmp_ret}" +} + +function wait_clusteroperators_continous_success() { + typeset -i continuousSuccessfulCheck=0 passedCriteria=3 + typeset -i wMax=1800 wInt=60 # 30 min (30 iterations × 60 s) + SECONDS=0 + while (( SECONDS < wMax && continuousSuccessfulCheck < passedCriteria )); do + : "Checking CO status (${SECONDS}/${wMax}s, consecutive pass ${continuousSuccessfulCheck}/${passedCriteria})" + if check_clusteroperators; then + (( continuousSuccessfulCheck += 1 )) + else + : "cluster operators not ready yet, waiting (${SECONDS}/${wMax}s)" + continuousSuccessfulCheck=0 + fi + sleep "${wInt}" + done + if (( continuousSuccessfulCheck < passedCriteria )); then + echo >&2 "Some cluster operator does not get ready or not stable" + oc get co + return 1 + fi + : "All cluster operators status check PASSED" + true +} + +function check_mcp() { + typeset updating_mcp unhealthy_mcp tmp_output unhealthy_mcp_names mcp_name + + tmp_output=$(mktemp) + oc get machineconfigpools -o 
custom-columns=NAME:metadata.name,CONFIG:spec.configuration.name,UPDATING:status.conditions[?\(@.type==\"Updating\"\)].status --no-headers > "${tmp_output}" || true + if [[ -s "${tmp_output}" ]]; then + updating_mcp="$(grep -v "False" "${tmp_output}" || true)" + if [[ -n "${updating_mcp}" ]]; then + : "Some mcp is updating" + echo "${updating_mcp}" + rm -f "${tmp_output}" + return 1 + fi + else + : "Did not run 'oc get machineconfigpools' successfully" + rm -f "${tmp_output}" + return 1 + fi + + oc get machineconfigpools -o custom-columns=NAME:metadata.name,CONFIG:spec.configuration.name,UPDATING:status.conditions[?\(@.type==\"Updating\"\)].status,DEGRADED:status.conditions[?\(@.type==\"Degraded\"\)].status,DEGRADEDMACHINECOUNT:status.degradedMachineCount --no-headers > "${tmp_output}" || true + if [[ -s "${tmp_output}" ]]; then + unhealthy_mcp="$(grep -v 'False.*False.*0' "${tmp_output}" || true)" + if [[ -n "${unhealthy_mcp}" ]]; then + : "Detected unhealthy mcp" + echo "${unhealthy_mcp}" + oc get machineconfigpools -o custom-columns=NAME:metadata.name,CONFIG:spec.configuration.name,UPDATING:status.conditions[?\(@.type==\"Updating\"\)].status,DEGRADED:status.conditions[?\(@.type==\"Degraded\"\)].status,DEGRADEDMACHINECOUNT:status.degradedMachineCount | grep -v 'False.*False.*0' || true + oc get machineconfigpools + unhealthy_mcp_names=$(echo "${unhealthy_mcp}" | awk '{print $1}') + for mcp_name in ${unhealthy_mcp_names}; do + : "Name: ${mcp_name}" + oc describe mcp "${mcp_name}" || echo >&2 "oc describe mcp ${mcp_name} failed" + done + rm -f "${tmp_output}" + return 2 + fi + else + : "Did not run 'oc get machineconfigpools' successfully" + rm -f "${tmp_output}" + return 1 + fi + rm -f "${tmp_output}" + return 0 +} + +function wait_mcp_continous_success() { + typeset -i nodeCount wMax wInt=30 + typeset -i continuousSuccessfulCheck=0 passedCriteria=10 # 5 min × 60 s ÷ 30 s interval + typeset -i continuousDegradedCheck=0 degradedCriteria=5 + typeset -i ret=0 + 
nodeCount="$(oc get node -o json | jq '.items | length')" + wMax=$(( nodeCount * 20 * 60 )) # nodes × 20 min × 60 s + SECONDS=0 + while (( SECONDS < wMax && continuousSuccessfulCheck < passedCriteria )); do + : "Checking MCP status (${SECONDS}/${wMax}s, consecutive pass ${continuousSuccessfulCheck}/${passedCriteria})" + ret=0 + check_mcp || ret=$? + if [[ "${ret}" == "0" ]]; then + continuousDegradedCheck=0 + (( continuousSuccessfulCheck += 1 )) + elif [[ "${ret}" == "1" ]]; then + : "Some machines are updating, waiting (${SECONDS}/${wMax}s)" + continuousSuccessfulCheck=0 + continuousDegradedCheck=0 + else + continuousSuccessfulCheck=0 + : "Some machines are degraded (${continuousDegradedCheck}/${degradedCriteria}), waiting (${SECONDS}/${wMax}s)" + (( continuousDegradedCheck += 1 )) + if (( continuousDegradedCheck >= degradedCriteria )); then + break + fi + fi + sleep "${wInt}" + done + if (( continuousSuccessfulCheck < passedCriteria )); then + echo >&2 "Some mcp does not get ready or not stable" + oc get machineconfigpools + return 1 + fi + : "All mcp status check PASSED" + true +} + +function check_node() { + typeset -i nodeNumber readyNumber + nodeNumber="$( + oc get node \ + -o jsonpath-as-json='{.items[*].metadata.name}' | + jq 'length' + )" + readyNumber="$( + oc get node -o json | + jq '[.items[] | select(.status.conditions[]? 
| .type == "Ready" and .status == "True")] | length' + )" + if (( nodeNumber == readyNumber )); then + : "All nodes status check PASSED" + return 0 + fi + if (( readyNumber == 0 )); then + echo >&2 "No any ready node" + else + echo >&2 "We found failed node" + oc get node -o wide + fi + return 1 +} + +function check_pod() { + : "Show all pods status for reference/debug" + oc get pods --all-namespaces + true +} + +if test -f "${SHARED_DIR}/proxy-conf.sh"; then + # shellcheck disable=SC1091 + source "${SHARED_DIR}/proxy-conf.sh" +fi + +export KUBECONFIG="${hubKubeconfig}" +DiscoverSpokeClusters spokeNamesArr + +for spokeName in "${spokeNamesArr[@]}"; do + export KUBECONFIG="${hubKubeconfig}" + spokeName="$(echo -n "${spokeName}" | xargs)" + typeset spokeKubeconfigFile + spokeKubeconfigFile="$(mktemp /tmp/acm-spoke-prehealthcheck.XXXXXX.kubeconfig)" + + if ! ExtractSpokeKubeconfig "${spokeName}" "${spokeKubeconfigFile}"; then + failedSpokesArr+=("${spokeName}") + rm -f "${spokeKubeconfigFile}" + continue + fi + + if ! 
RunSpokePrehealthcheck "${spokeName}" "${spokeKubeconfigFile}"; then + failedSpokesArr+=("${spokeName}") + fi + + rm -f "${spokeKubeconfigFile}" +done + +export KUBECONFIG="${hubKubeconfig}" + +if [[ ${#failedSpokesArr[@]} -gt 0 ]]; then + echo "[ERROR] Pre-upgrade health check failed for spoke cluster(s): ${failedSpokesArr[*]}" >&2 + exit 1 +fi + +: "Pre-upgrade health check passed for all spoke cluster(s): ${spokeNamesArr[*]}" +true diff --git a/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-ref.metadata.json b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-ref.metadata.json new file mode 100644 index 0000000000000..a7518251af244 --- /dev/null +++ b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-ref.metadata.json @@ -0,0 +1,11 @@ +{ + "path": "acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-ref.yaml", + "owners": { + "approvers": [ + "cspi-qe-ocp-lp" + ], + "reviewers": [ + "cspi-qe-ocp-lp" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-ref.yaml b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-ref.yaml new file mode 100644 index 0000000000000..e8dbd9e5a7395 --- /dev/null +++ b/ci-operator/step-registry/acm/interop-p2p/spoke-upgrade-prehealthcheck/acm-interop-p2p-spoke-upgrade-prehealthcheck-ref.yaml @@ -0,0 +1,28 @@ +ref: + as: acm-interop-p2p-spoke-upgrade-prehealthcheck + from_image: + namespace: ci + name: verification-tests + tag: latest + grace_period: 10m + commands: acm-interop-p2p-spoke-upgrade-prehealthcheck-commands.sh + timeout: 2h0m0s + cli: latest + resources: + limits: + cpu: "1" + requests: + cpu: 100m + memory: 100Mi 
+ env: + - name: ACM_INTEROP_P2P__PREHEALTHCHECK__SPOKE_CLUSTERS + default: "" + documentation: Optional comma-separated ManagedCluster names. When empty, all managed spokes except local-cluster are checked. + documentation: |- + Pre-upgrade health check on ACM managed spoke cluster(s). Uses the same logic as + cucushift-upgrade-prehealthcheck per spoke (MCP not updating/degraded, cluster + operators stable, all nodes Ready). Discovers spokes from the hub or from + ACM_INTEROP_P2P__PREHEALTHCHECK__SPOKE_CLUSTERS. Reuses + ${SHARED_DIR}/managed-cluster-kubeconfig when the spoke name matches + ${SHARED_DIR}/managed-cluster-name. On failure, writes + spoke-<spoke-name>-upgrade-prehealthcheck-failure.txt to ${ARTIFACT_DIR}.