diff --git a/ci-operator/config/trusted-execution-clusters/OWNERS b/ci-operator/config/trusted-execution-clusters/OWNERS index 79b41a3630fd0..ff09338355592 100644 --- a/ci-operator/config/trusted-execution-clusters/OWNERS +++ b/ci-operator/config/trusted-execution-clusters/OWNERS @@ -1,10 +1,6 @@ reviewers: - alicefr - - yalzhang - Jakob-Naucke - - fangge1212 approvers: - alicefr - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/config/trusted-execution-clusters/operator/OWNERS b/ci-operator/config/trusted-execution-clusters/operator/OWNERS index 79b41a3630fd0..ff09338355592 100644 --- a/ci-operator/config/trusted-execution-clusters/operator/OWNERS +++ b/ci-operator/config/trusted-execution-clusters/operator/OWNERS @@ -1,10 +1,6 @@ reviewers: - alicefr - - yalzhang - Jakob-Naucke - - fangge1212 approvers: - alicefr - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/config/trusted-execution-clusters/operator/trusted-execution-clusters-operator-main.yaml b/ci-operator/config/trusted-execution-clusters/operator/trusted-execution-clusters-operator-main.yaml deleted file mode 100644 index 54aae9fc49d12..0000000000000 --- a/ci-operator/config/trusted-execution-clusters/operator/trusted-execution-clusters-operator-main.yaml +++ /dev/null @@ -1,32 +0,0 @@ -base_images: - telco-runner: - name: telco-runner - namespace: ci - tag: latest -build_root: - image_stream_tag: - name: builder - namespace: ocp - tag: rhel-9-golang-1.25-openshift-4.21 -resources: - '*': - limits: - memory: 4Gi - requests: - cpu: 100m - memory: 200Mi -tests: -- as: operator-lifecycle-verify - capabilities: - - intranet - skip_if_only_changed: ^(\.github|LICENSES|bundle|docs|examples)/|^(README\.md|\.gitignore)$ - steps: - post: - - chain: trusted-execution-clusters-operator-cleanup - test: - - chain: trusted-execution-clusters-operator-infra-provision - - chain: trusted-execution-clusters-operator-lifecycle -zz_generated_metadata: - branch: main - org: trusted-execution-clusters - repo: operator diff --git a/ci-operator/jobs/trusted-execution-clusters/operator/trusted-execution-clusters-operator-main-presubmits.yaml b/ci-operator/jobs/trusted-execution-clusters/operator/trusted-execution-clusters-operator-main-presubmits.yaml deleted file mode 100644 index 3f566a011df32..0000000000000 --- a/ci-operator/jobs/trusted-execution-clusters/operator/trusted-execution-clusters-operator-main-presubmits.yaml +++ /dev/null @@ -1,77 +0,0 @@ -presubmits: - trusted-execution-clusters/operator: - - agent: kubernetes - always_run: false - branches: - - ^main$ - - ^main- - cluster: build03 - context: ci/prow/operator-lifecycle-verify - decorate: true - decoration_config: - skip_cloning: true - labels: - capability/intranet: intranet - ci.openshift.io/generator: prowgen - pj-rehearse.openshift.io/can-be-rehearsed: "true" - name: pull-ci-trusted-execution-clusters-operator-main-operator-lifecycle-verify - rerun_command: /test operator-lifecycle-verify - skip_if_only_changed: ^(\.github|LICENSES|bundle|docs|examples)/|^(README\.md|\.gitignore)$ - spec: - containers: - - args: - - --gcs-upload-secret=/secrets/gcs/service-account.json - - --image-import-pull-secret=/etc/pull-secret/.dockerconfigjson - - --lease-server-credentials-file=/etc/boskos/credentials - - --report-credentials-file=/etc/report/credentials - - --target=operator-lifecycle-verify - command: - - ci-operator - env: - - name: HTTP_SERVER_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - image: quay-proxy.ci.openshift.org/openshift/ci:ci_ci-operator_latest - imagePullPolicy: Always - name: "" - ports: - - containerPort: 8080 - name: http - resources: - requests: - cpu: 10m - volumeMounts: - - mountPath: /etc/boskos - name: boskos - readOnly: true - - mountPath: /secrets/gcs - name: gcs-credentials - readOnly: true - - mountPath: /secrets/manifest-tool - name: manifest-tool-local-pusher - readOnly: true - - mountPath: /etc/pull-secret - name: pull-secret - readOnly: true - - mountPath: /etc/report - name: result-aggregator - readOnly: true - serviceAccountName: ci-operator - volumes: - - name: boskos - secret: - items: - - key: credentials - path: credentials - secretName: boskos-credentials - - name: manifest-tool-local-pusher - secret: - secretName: manifest-tool-local-pusher - - name: pull-secret - secret: - secretName: registry-pull-credentials - - name: result-aggregator - secret: - secretName: result-aggregator - trigger: (?m)^/test( | .* )operator-lifecycle-verify,?($|\s.*) diff --git a/ci-operator/step-registry/trusted-execution-clusters/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/OWNERS index 79b41a3630fd0..ff09338355592 100644 --- a/ci-operator/step-registry/trusted-execution-clusters/OWNERS +++ b/ci-operator/step-registry/trusted-execution-clusters/OWNERS @@ -1,10 +1,6 @@ reviewers: - alicefr - - yalzhang - Jakob-Naucke - - fangge1212 approvers: - alicefr - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/operator/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/operator/OWNERS deleted file mode 100644 index 79b41a3630fd0..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/operator/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -reviewers: - - alicefr - - yalzhang - - Jakob-Naucke - - fangge1212 -approvers: - - alicefr - - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/operator/cleanup/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/operator/cleanup/OWNERS deleted file mode 100644 index 79b41a3630fd0..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/operator/cleanup/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -reviewers: - - alicefr - - yalzhang - - Jakob-Naucke - - fangge1212 -approvers: - - alicefr - - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/operator/cleanup/trusted-execution-clusters-operator-cleanup-chain.metadata.json b/ci-operator/step-registry/trusted-execution-clusters/operator/cleanup/trusted-execution-clusters-operator-cleanup-chain.metadata.json deleted file mode 100644 index d02e53dfbcda4..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/operator/cleanup/trusted-execution-clusters-operator-cleanup-chain.metadata.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "path": "trusted-execution-clusters/operator/cleanup/trusted-execution-clusters-operator-cleanup-chain.yaml", - "owners": { - "approvers": [ - "alicefr", - "Jakob-Naucke", - "fangge1212", - "yalzhang" - ], - "reviewers": [ - "alicefr", - "yalzhang", - "Jakob-Naucke", - "fangge1212" - ] - } -} \ No newline at end of file diff --git a/ci-operator/step-registry/trusted-execution-clusters/operator/cleanup/trusted-execution-clusters-operator-cleanup-chain.yaml b/ci-operator/step-registry/trusted-execution-clusters/operator/cleanup/trusted-execution-clusters-operator-cleanup-chain.yaml deleted file mode 100644 index fbb2be7142ad1..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/operator/cleanup/trusted-execution-clusters-operator-cleanup-chain.yaml +++ /dev/null @@ -1,70 +0,0 @@ -chain: - as: trusted-execution-clusters-operator-cleanup - steps: - - ref: trusted-execution-clusters-ref-operator-beaker-deprovision - documentation: |- - This chain performs comprehensive cleanup of all resources created during testing - on the Beaker bare metal machine. - - The chain executes one step: - - Step 1 - Beaker Deprovision (trusted-execution-clusters-ref-operator-beaker-deprovision): - - Connects to Beaker machine via SSH - - Collects pre-cleanup system state for post-mortem analysis - - Archives important logs before deletion: - * Kind deployment logs - * Kind cluster logs - * Operator installation logs - - Performs cleanup operations: - * Deletes Kind cluster using 'kind delete cluster' - * Removes container images (localhost:5000/*, operator images) - * Prunes dangling images from docker - * Removes temporary directories (/tmp/kind-*, /tmp/operator-*) - * Removes operator working directories (~/operator-kind-setup) - - Optionally restarts container runtime services (docker) - - Collects post-cleanup system state for verification - - Generates cleanup summary report - - Cleanup Strategy: - - Best-effort cleanup: Individual operations may fail without causing the step to fail - - Ensures cleanup proceeds even if some resources are already gone - - Archives logs before deletion to preserve debugging information - - Restarts services to ensure clean state for future runs - - Prerequisites: - - SSH credentials must be available in Vault - - Beaker machine must be accessible - - Environment Variables: - - DEPROVISION_TIMEOUT: Timeout for cleanup operations (default: 600 seconds) - - CLEANUP_IMAGES: Whether to clean up container images (default: true) - - RESTART_SERVICES: Whether to restart services (default: true) - - Outputs: - - ${ARTIFACT_DIR}/cleanup-logs/pre-cleanup-state.log: System state before cleanup - - ${ARTIFACT_DIR}/cleanup-logs/post-cleanup-state.log: System state after cleanup - - ${ARTIFACT_DIR}/cleanup-logs/cleanup-execution.log: Detailed cleanup operation log - - ${ARTIFACT_DIR}/cleanup-logs/archived-logs/: Logs archived before deletion - - After successful completion: - - Kind cluster is deleted - - Container images are cleaned up (if CLEANUP_IMAGES=true) - - Temporary files and directories are removed - - Operator working directories are removed - - Container runtime services are restarted (if RESTART_SERVICES=true) - - Beaker machine is ready for next test run - - Exit Codes: - - 0: Cleanup completed successfully (best effort) - - 1: Critical failure (e.g., SSH connection issues) - - Usage Notes: - - This chain should typically be used in the 'post' phase of workflows - - It will execute even if previous steps failed (post always runs) - - Set CLEANUP_IMAGES=false to preserve images for debugging - - Set RESTART_SERVICES=false to skip service restarts - - Scope: - This chain is responsible for CLEANUP operations only. - It should run in the post phase to ensure resources are released - regardless of test success or failure. diff --git a/ci-operator/step-registry/trusted-execution-clusters/operator/infra-provision/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/operator/infra-provision/OWNERS deleted file mode 100644 index 79b41a3630fd0..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/operator/infra-provision/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -reviewers: - - alicefr - - yalzhang - - Jakob-Naucke - - fangge1212 -approvers: - - alicefr - - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/operator/infra-provision/trusted-execution-clusters-operator-infra-provision-chain.metadata.json b/ci-operator/step-registry/trusted-execution-clusters/operator/infra-provision/trusted-execution-clusters-operator-infra-provision-chain.metadata.json deleted file mode 100644 index 8824bb38cf416..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/operator/infra-provision/trusted-execution-clusters-operator-infra-provision-chain.metadata.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "path": "trusted-execution-clusters/operator/infra-provision/trusted-execution-clusters-operator-infra-provision-chain.yaml", - "owners": { - "approvers": [ - "alicefr", - "Jakob-Naucke", - "fangge1212", - "yalzhang" - ], - "reviewers": [ - "alicefr", - "yalzhang", - "Jakob-Naucke", - "fangge1212" - ] - } -} \ No newline at end of file diff --git a/ci-operator/step-registry/trusted-execution-clusters/operator/infra-provision/trusted-execution-clusters-operator-infra-provision-chain.yaml b/ci-operator/step-registry/trusted-execution-clusters/operator/infra-provision/trusted-execution-clusters-operator-infra-provision-chain.yaml deleted file mode 100644 index d7bc54aab3e4e..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/operator/infra-provision/trusted-execution-clusters-operator-infra-provision-chain.yaml +++ /dev/null @@ -1,58 +0,0 @@ -chain: - as: trusted-execution-clusters-operator-infra-provision - steps: - - ref: trusted-execution-clusters-ref-operator-beaker-kind-provision - - ref: trusted-execution-clusters-ref-operator-kind-cluster-create - documentation: |- - This chain prepares the complete infrastructure for trusted-execution-clusters operator testing - on a Beaker bare metal machine. - - The chain executes two sequential steps: - - Step 1 - Beaker Environment Preparation (trusted-execution-clusters-ref-operator-beaker-kind-provision): - - Establishes SSH connection from CI pod to Beaker bare metal machine - - Installs all system dependencies: - * Docker CE (container runtime) - * kubectl v1.29.0 (Kubernetes CLI) - * kind v0.30.0 (Kubernetes in Docker) - * Rust (via rustup) with system-wide PATH configuration - * Go 1.25.0 (downloaded from official release, required for operator build) - * Build tools: make, gcc, jq - - Downloads operator repository (configurable via env vars) - - Applies required patches (PR #113, PR #119) - - Adapts kind cluster configuration for external access - - Verifies all tools are correctly installed - - Step 2 - Kind Cluster Creation (trusted-execution-clusters-ref-operator-kind-cluster-create): - - Executes 'make cluster-up' on Beaker machine to create Kind cluster - - Sets up local container registry (localhost:5000) - - Retrieves kubeconfig and saves to ${SHARED_DIR}/kubeconfig - - Verifies cluster is accessible from both Beaker machine and CI pod - - Collects cluster creation logs to ${ARTIFACT_DIR}/kind-cluster-logs/ - - Prerequisites: - - Beaker bare metal machine pre-provisioned with Fedora - - SSH public key pre-configured on Beaker machine (~/.ssh/authorized_keys) - - Vault secret configured at: secrets/kv/selfservice/confidential-qe/beaker-bm - Required fields: - - beaker-ssh-private-key: SSH private key (matching public key on Beaker) - - beaker-ip: IP address of Beaker machine - Optional fields: - - beaker-user: SSH username (defaults to 'root') - - Vault secret synced to K8s secret 'beaker-bm' in 'test-credentials' namespace - - Outputs: - - ${SHARED_DIR}/beaker_info: Beaker machine configuration and metadata - - ${SHARED_DIR}/kubeconfig: Kind cluster kubeconfig for subsequent steps - - ${ARTIFACT_DIR}/beaker-logs/: Environment preparation logs and diagnostics - - ${ARTIFACT_DIR}/kind-cluster-logs/: Cluster creation logs - - After successful completion: - - Beaker machine has all required tools installed (Docker, kubectl, kind, Rust, Go) - - Kind cluster is running and accessible - - kubeconfig is available for subsequent operator deployment steps - - Local container registry is ready at localhost:5000 - - Scope: - This chain is responsible ONLY for infrastructure preparation. - Application deployment (operator) is handled by a separate chain. diff --git a/ci-operator/step-registry/trusted-execution-clusters/operator/lifecycle/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/operator/lifecycle/OWNERS deleted file mode 100644 index 79b41a3630fd0..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/operator/lifecycle/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -reviewers: - - alicefr - - yalzhang - - Jakob-Naucke - - fangge1212 -approvers: - - alicefr - - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/operator/lifecycle/trusted-execution-clusters-operator-lifecycle-chain.metadata.json b/ci-operator/step-registry/trusted-execution-clusters/operator/lifecycle/trusted-execution-clusters-operator-lifecycle-chain.metadata.json deleted file mode 100644 index 89bc82e33d083..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/operator/lifecycle/trusted-execution-clusters-operator-lifecycle-chain.metadata.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "path": "trusted-execution-clusters/operator/lifecycle/trusted-execution-clusters-operator-lifecycle-chain.yaml", - "owners": { - "approvers": [ - "alicefr", - "Jakob-Naucke", - "fangge1212", - "yalzhang" - ], - "reviewers": [ - "alicefr", - "yalzhang", - "Jakob-Naucke", - "fangge1212" - ] - } -} \ No newline at end of file diff --git a/ci-operator/step-registry/trusted-execution-clusters/operator/lifecycle/trusted-execution-clusters-operator-lifecycle-chain.yaml b/ci-operator/step-registry/trusted-execution-clusters/operator/lifecycle/trusted-execution-clusters-operator-lifecycle-chain.yaml deleted file mode 100644 index c9bfa6f205112..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/operator/lifecycle/trusted-execution-clusters-operator-lifecycle-chain.yaml +++ /dev/null @@ -1,28 +0,0 @@ -chain: - as: trusted-execution-clusters-operator-lifecycle - steps: - - ref: trusted-execution-clusters-ref-operator-test - documentation: |- - Runs the complete integration test suite for trusted-execution-clusters operator. - - Operator Integration Tests (trusted-execution-clusters-ref-operator-test): - - Connects to Beaker machine via SSH - - Uses operator repository from beaker-kind-provision step - - Verifies Kind cluster (creates if needed with 'make cluster-up') - - Builds and tests the operator: - * make push: Build and push images to localhost:5000/trusted-execution-clusters - * make install-kubevirt: Install KubeVirt dependency - * make integration-tests: Run complete integration test suite - - Collects test results and cluster state - - Prerequisites: - - SSH credentials available in Vault - - Operator repository downloaded by beaker-kind-provision step - - Local registry running at localhost:5000 - - Environment Variables: - - POD_READY_TIMEOUT: Test timeout (default: 900s) - - Outputs: - - ${ARTIFACT_DIR}/operator-test-logs/: Test logs and diagnostics - - ${ARTIFACT_DIR}/operator-test-logs/installation.log: Complete test execution log diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/ref/OWNERS index 79b41a3630fd0..ff09338355592 100644 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/OWNERS +++ b/ci-operator/step-registry/trusted-execution-clusters/ref/OWNERS @@ -1,10 +1,6 @@ reviewers: - alicefr - - yalzhang - Jakob-Naucke - - fangge1212 approvers: - alicefr - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/OWNERS deleted file mode 100644 index 79b41a3630fd0..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -reviewers: - - alicefr - - yalzhang - - Jakob-Naucke - - fangge1212 -approvers: - - alicefr - - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/OWNERS deleted file mode 100644 index 79b41a3630fd0..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -reviewers: - - alicefr - - yalzhang - - Jakob-Naucke - - fangge1212 -approvers: - - alicefr - - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/trusted-execution-clusters-ref-operator-beaker-deprovision-commands.sh b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/trusted-execution-clusters-ref-operator-beaker-deprovision-commands.sh deleted file mode 100755 index 959698a743bad..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/trusted-execution-clusters-ref-operator-beaker-deprovision-commands.sh +++ /dev/null @@ -1,664 +0,0 @@ -#!/bin/bash - -# Beaker Deprovision Step - Cleanup all resources on Beaker machine -set -o nounset -set -o pipefail - -if [ -z "${SHARED_DIR:-}" ]; then - echo "[ERROR] SHARED_DIR is not set. This script must run in Prow CI environment." - exit 1 -fi - -if [ -z "${ARTIFACT_DIR:-}" ]; then - echo "[ERROR] ARTIFACT_DIR is not set. This script must run in Prow CI environment." - exit 1 -fi - -echo "Beaker Cleanup and Deprovision - Starting" -echo "This script performs cleanup operations on Beaker machine" -date - -if ! whoami &> /dev/null; then - if [[ -w /etc/passwd ]]; then - echo "[INFO] Creating user entry for UID $(id -u) in /etc/passwd" - echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd - fi -fi - -# Cleanup status tracking -CLEANUP_FAILED=false - -# Configurable options -DEPROVISION_TIMEOUT="${DEPROVISION_TIMEOUT:-600}" -CLEANUP_IMAGES="${CLEANUP_IMAGES:-true}" -RESTART_SERVICES="${RESTART_SERVICES:-true}" - -# Helper Functions -log_info() { - echo "[INFO] $1" -} - -log_warn() { - echo "[WARN] $1" -} - -log_error() { - echo "[ERROR] $1" -} - -log_success() { - echo "[SUCCESS] $1" -} - -log_info "Reading configuration..." - -if [ -f "${SHARED_DIR}/beaker_info" ]; then - source "${SHARED_DIR}/beaker_info" - log_info "Beaker machine: ${BEAKER_IP}" - log_info "Beaker user: ${BEAKER_USER}" - log_info "Cluster name: ${KIND_CLUSTER_NAME:-kind}" -else - log_warn "beaker_info not found, attempting to use environment variables" - - # Try Vault or environment - if [ -z "${BEAKER_IP:-}" ]; then - if [ -f "/var/run/beaker-bm/beaker-ip" ]; then - BEAKER_IP=$(cat "/var/run/beaker-bm/beaker-ip") - else - log_error "Cannot determine BEAKER_IP" - exit 1 - fi - fi - - if [ -z "${BEAKER_USER:-}" ]; then - if [ -f "/var/run/beaker-bm/beaker-user" ]; then - BEAKER_USER=$(cat "/var/run/beaker-bm/beaker-user") - else - BEAKER_USER="root" - fi - fi - - KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:-kind}" -fi - -log_info "Setting up SSH key..." - -SSH_PKEY_PATH_VAULT="/var/run/beaker-bm/beaker-ssh-private-key" - -if [ -f "${SSH_PKEY_PATH_VAULT}" ]; then - SSH_PKEY_PATH="${SSH_PKEY_PATH_VAULT}" -elif [ -n "${CLUSTER_PROFILE_DIR:-}" ] && [ -f "${CLUSTER_PROFILE_DIR}/ssh-key" ]; then - SSH_PKEY_PATH="${CLUSTER_PROFILE_DIR}/ssh-key" -else - log_error "SSH key not found" - exit 1 -fi - -SSH_PKEY="${HOME}/.ssh/beaker_key" -mkdir -p "${HOME}/.ssh" -cp "${SSH_PKEY_PATH}" "${SSH_PKEY}" -chmod 600 "${SSH_PKEY}" - -SSHOPTS=( - -o 'ConnectTimeout=120' - -o 'StrictHostKeyChecking=no' - -o 'UserKnownHostsFile=/dev/null' - -o 'ServerAliveInterval=30' - -o 'ServerAliveCountMax=5' - -o 'LogLevel=ERROR' - -i "${SSH_PKEY}" -) - -log_info "SSH connection timeout set to 120 seconds to accommodate slow network" - -log_info "Collecting pre-cleanup system state..." - -mkdir -p "${ARTIFACT_DIR}/cleanup-logs/archived-logs" - -# Collect system state before cleanup -PRE_CLEANUP_LOG="${ARTIFACT_DIR}/cleanup-logs/pre-cleanup-state.log" -ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" bash -s <<'EOF' > "${PRE_CLEANUP_LOG}" 2>&1 || true - -echo "Pre-Cleanup System State" -echo "Date: $(date)" -echo "Hostname: $(hostname)" -echo "" - -echo "--- Kind Clusters ---" -kind get clusters 2>&1 || echo "No clusters or kind not available" -echo "" - -echo "--- Container Status ---" -docker ps -a 2>&1 || echo "Docker not available" -echo "" - -echo "--- Container Images ---" -docker images 2>&1 || echo "Docker not available" -echo "" - -echo "--- Build Artifacts ---" -ls -ld "${HOME}/investigations" 2>&1 || echo "No investigations directory" -ls -ld /var/log/kbs_logs_* 2>&1 || echo "No KBS log directories" -echo "" - -echo "--- Disk Usage ---" -df -h -echo "" - -echo "--- Temporary Directories ---" -ls -la /tmp/kind-* 2>&1 || echo "No kind temp directories" -ls -la /tmp/operator-* 2>&1 || echo "No operator temp directories" -ls -la /tmp/e2e-test-* 2>&1 || echo "No E2E test temp directories" -echo "" - -EOF - -log_success "Pre-cleanup state collected" - -log_info "Archiving important logs before cleanup..." - -# Archive logs (best effort) -scp "${SSHOPTS[@]}" -r \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/kind-deployment-logs/*" \ - "${ARTIFACT_DIR}/cleanup-logs/archived-logs/" 2>&1 || log_warn "Could not archive kind deployment logs" - -scp "${SSHOPTS[@]}" -r \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/kind-cluster-logs/*" \ - "${ARTIFACT_DIR}/cleanup-logs/archived-logs/" 2>&1 || log_warn "Could not archive kind cluster logs" - -scp "${SSHOPTS[@]}" -r \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/operator-install-logs/*" \ - "${ARTIFACT_DIR}/cleanup-logs/archived-logs/" 2>&1 || log_warn "Could not archive operator install logs" - -log_success "Log archiving completed (best effort)" - -log_info "Executing cleanup operations on Beaker machine..." - -if ! ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" bash -s -- \ - "${KIND_CLUSTER_NAME}" "${CLEANUP_IMAGES}" "${RESTART_SERVICES}" << 'EOF' -set -x # Enable command tracing for debugging - -KIND_CLUSTER_NAME="$1" -CLEANUP_IMAGES="$2" -RESTART_SERVICES="$3" - -echo "Running on Beaker machine: $(hostname)" -echo "Date: $(date)" -echo "Cluster to delete: ${KIND_CLUSTER_NAME}" -echo "Cleanup images: ${CLEANUP_IMAGES}" -echo "Restart services: ${RESTART_SERVICES}" - -# Create cleanup log directory -mkdir -p /tmp/cleanup-logs -exec > >(tee -a /tmp/cleanup-logs/cleanup.log) -exec 2>&1 - -CLEANUP_ERRORS=0 - -echo "--- Step 1: Deleting Kind cluster '${KIND_CLUSTER_NAME}' ---" - -if command -v kind &> /dev/null; then - if kind get clusters 2>/dev/null | grep -q "^${KIND_CLUSTER_NAME}$"; then - echo "[INFO] Deleting cluster ${KIND_CLUSTER_NAME}..." - if kind delete cluster --name "${KIND_CLUSTER_NAME}"; then - echo "[SUCCESS] Cluster ${KIND_CLUSTER_NAME} deleted" - else - echo "[ERROR] Failed to delete cluster ${KIND_CLUSTER_NAME}" - ((CLEANUP_ERRORS++)) - fi - else - echo "[INFO] Cluster ${KIND_CLUSTER_NAME} not found, skipping" - fi -else - echo "[WARN] kind command not found, skipping cluster deletion" -fi - -# Clean up kubeconfig -echo "[INFO] Cleaning up kubeconfig..." -if command -v kubectl &> /dev/null; then - kubectl config delete-context "${KIND_CLUSTER_NAME}" 2>&1 || echo "[INFO] Context not found" - kubectl config delete-cluster "${KIND_CLUSTER_NAME}" 2>&1 || echo "[INFO] Cluster not found" - echo "[SUCCESS] Kubeconfig cleaned up" -else - echo "[WARN] kubectl not found, skipping kubeconfig cleanup" -fi - -# Clean up kind-registry container (created by kind for local image caching) -echo "[INFO] Cleaning up kind-registry container..." -if docker ps -a --filter "name=kind-registry" -q | grep -q .; then - docker rm -f kind-registry 2>&1 || echo "[WARN] Could not remove kind-registry" - echo "[SUCCESS] kind-registry container removed" -else - echo "[INFO] kind-registry container not found" -fi - -echo "--- Step 2: Cleaning up test containers and images ---" -echo "[INFO] NOTE: Docker and kind packages/binaries are PRESERVED (not removed)" -echo "[INFO] This step only removes test artifacts (containers, volumes, dangling images)" - -if [ "${CLEANUP_IMAGES}" == "true" ]; then - if command -v docker &> /dev/null; then - echo "[INFO] Cleaning up test-related Docker resources..." - - # Remove Kind-labeled containers first - docker ps -a --filter "label=io.x-k8s.kind.cluster" -q | xargs -r docker rm -f 2>&1 || echo "[INFO] No Kind containers to remove" - - # Remove all stopped containers (safe - only removes stopped ones) - docker container prune -f 2>&1 || echo "[WARN] Container prune failed" - - # Remove Kind volumes - docker volume ls --filter "label=io.x-k8s.kind.cluster" -q | xargs -r docker volume rm 2>&1 || echo "[INFO] No Kind volumes to remove" - - # Remove test-specific operator images (keeping infrastructure images) - echo "[INFO] Removing test operator images..." - echo "[INFO] Keeping: kindest/node, registry:2 (infrastructure images)" - - # Remove images matching operator names (all tags and registries) - for image_pattern in "compute-pcrs" "registration-server" "trusted-cluster-operator" "attestation-key-register" "fedora-coreos-kubevirt" "key-broker-service"; do - echo "[INFO] Removing images matching pattern: ${image_pattern}" - docker images --format "{{.Repository}}:{{.Tag}}" | grep -i "${image_pattern}" | xargs -r docker rmi -f 2>&1 || echo "[INFO] No ${image_pattern} images to remove" - done - - echo "[SUCCESS] Test operator images removed" - - # Preserve dangling images (Docker build cache) for faster subsequent builds - - # Remove unused volumes (safe - only removes volumes not attached to containers) - docker volume prune -f 2>&1 || echo "[WARN] Volume prune failed" - - echo "[SUCCESS] Docker resources cleaned up" - fi -else - echo "[INFO] Container resource cleanup skipped" -fi - -echo "--- Step 3: Cleaning up Kind network resources ---" - -# Clean up only Kind-created networks -if command -v docker &> /dev/null; then - echo "[INFO] Cleaning up Kind-created Docker networks..." - - # Remove networks with kind label - docker network ls --filter "label=io.x-k8s.kind.cluster" -q | xargs -r docker network rm 2>&1 || echo "[INFO] No labeled Kind networks to remove" - - # Force remove the 'kind' network (may need to disconnect containers first) - if docker network inspect kind &> /dev/null; then - echo "[INFO] Disconnecting all containers from kind network..." - # Disconnect any connected containers - docker network inspect kind --format '{{range .Containers}}{{.Name}} {{end}}' | \ - xargs -r -n1 docker network disconnect -f kind 2>&1 || echo "[INFO] No containers to disconnect" - - # Now remove the network - if docker network rm kind 2>&1; then - echo "[SUCCESS] kind network removed" - else - echo "[WARN] Could not remove kind network" - fi - else - echo "[INFO] kind network not found" - fi - - echo "[SUCCESS] Kind networks cleaned up" -fi - -echo "--- Step 4: Skipping container runtime data directories ---" - -echo "[INFO] NOT deleting /var/lib/docker, /var/lib/containerd, or /var/lib/containers" -echo "[INFO] These directories must be preserved to keep Docker/containerd functional" -echo "[INFO] Only test-specific containers and images are cleaned up in previous steps" - -echo "[SUCCESS] Container runtime data directories preserved" - -echo "--- Step 5: Removing temporary files ---" -echo "[INFO] NOTE: Only /tmp directories are removed (kind/docker binaries are preserved)" - -rm -rf /tmp/kind-* 2>&1 || echo "[WARN] Could not remove kind temp directories" -rm -rf /tmp/operator-* 2>&1 || echo "[WARN] Could not remove operator temp directories" -rm -rf /tmp/e2e-test-* 2>&1 || echo "[WARN] Could not remove e2e-test temp directories" - -echo "[SUCCESS] Temporary files cleaned up" - -echo "--- Step 6: Cleaning up operator working directories ---" - -if [ -d "${HOME}/operator-kind-setup" ]; then - echo "[INFO] Removing operator-kind-setup directory..." - rm -rf "${HOME}/operator-kind-setup" 2>&1 || echo "[WARN] Could not remove operator-kind-setup" -fi - -if [ -d "${HOME}/operator-pr-code" ]; then - echo "[INFO] Removing operator-pr-code directory..." - rm -rf "${HOME}/operator-pr-code" 2>&1 || echo "[WARN] Could not remove operator-pr-code" -fi - -echo "[SUCCESS] Operator working directories cleaned up" - -echo "--- Step 7: Cleaning up test logs ---" - -sudo rm -rf /var/log/kbs_logs_* 2>&1 || echo "[WARN] Could not remove KBS logs" -rm -rf /tmp/e2e-test-logs 2>&1 || echo "[WARN] Could not remove E2E test logs" - -echo "[SUCCESS] Test logs cleaned up" - -echo "--- Step 8: Restarting Docker to reset containerd state ---" - -if [ "${RESTART_SERVICES}" == "true" ]; then - if command -v docker &> /dev/null; then - echo "[INFO] Restarting Docker service to clear containerd metadata..." - echo "[INFO] This prevents containerd corruption between test runs" - - # Restart Docker (also restarts containerd) - if sudo systemctl restart docker 2>&1; then - echo "[SUCCESS] Docker service restarted" - - # Wait 60 seconds for Docker daemon and containerd to fully initialize - echo "[INFO] Waiting 60 seconds for Docker/containerd to fully initialize..." - sleep 60 - - if docker info > /dev/null 2>&1; then - echo "[SUCCESS] Docker daemon is responsive after restart" - - # Critical: Test actual container creation to verify containerd health - echo "[INFO] Testing containerd health by creating test container..." - MAX_RETRIES=5 - RETRY_COUNT=0 - while [ ${RETRY_COUNT} -lt ${MAX_RETRIES} ]; do - if docker run --rm alpine echo "Containerd healthy" > /dev/null 2>&1; then - echo "[SUCCESS] Containerd is healthy (can create containers)" - break - else - RETRY_COUNT=$((RETRY_COUNT + 1)) - if [ ${RETRY_COUNT} -lt ${MAX_RETRIES} ]; then - echo "[WARN] Containerd test failed (attempt ${RETRY_COUNT}/${MAX_RETRIES}), waiting 10 seconds..." - sleep 10 - else - echo "[ERROR] Containerd STILL FAILING after ${MAX_RETRIES} attempts!" - echo "[ERROR] Manual intervention may be required" - fi - fi - done - else - echo "[ERROR] Docker daemon not responsive after 60 seconds" - fi - else - echo "[ERROR] Failed to restart Docker service" - fi - fi -else - echo "[INFO] Service restart skipped" -fi - -echo "--- Step 9: Verifying clean state ---" - -# Verification checks -echo "[INFO] Checking remaining Docker containers..." -DOCKER_CONTAINERS=$(docker ps -aq 2>/dev/null | wc -l) - -echo "Docker containers: $DOCKER_CONTAINERS" - -# Validate Docker/containerd health -echo "[INFO] Validating Docker/containerd state..." -if docker run --rm hello-world > /dev/null 2>&1; then - echo "[SUCCESS] Docker/containerd can create containers successfully" -else - echo "[ERROR] Docker/containerd health check failed" - echo "[ERROR] This may indicate containerd corruption" -fi - -echo "--- Cleanup Summary ---" -echo "Cleanup errors encountered: ${CLEANUP_ERRORS}" -echo "Disk Usage:" -df -h - -exit 0 - -EOF -then - log_error "Cleanup script execution failed" - CLEANUP_FAILED=true -else - log_success "Cleanup script executed successfully" -fi - -log_info "Collecting cleanup logs..." - -scp "${SSHOPTS[@]}" \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/cleanup-logs/cleanup.log" \ - "${ARTIFACT_DIR}/cleanup-logs/cleanup-execution.log" 2>&1 || log_warn "Could not collect cleanup execution log" - -log_info "Collecting post-cleanup system state..." - -POST_CLEANUP_LOG="${ARTIFACT_DIR}/cleanup-logs/post-cleanup-state.log" -ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" bash -s <<'EOF' > "${POST_CLEANUP_LOG}" 2>&1 || true - -echo "Post-Cleanup System State" -echo "Date: $(date)" -echo "" - -echo "--- Kind Clusters (should be empty) ---" -kind get clusters 2>&1 || echo "No clusters or kind not available" -echo "" - -echo "--- Docker Containers (should be minimal) ---" -docker ps -a 2>&1 || echo "Docker not available" -echo "" - -echo "--- Container Images ---" -docker images 2>&1 || echo "Docker not available" -echo "" - -echo "--- Build Artifacts (should be removed) ---" -ls -ld "${HOME}/investigations" 2>&1 || echo "No investigations directory (cleaned)" -ls -ld /var/log/kbs_logs_* 2>&1 || echo "No KBS log directories (cleaned)" -echo "" - -echo "--- Disk Usage (after cleanup) ---" -df -h -echo "" - -echo "--- Temporary Directories (should be minimal) ---" -ls -la /tmp/ | grep -E "kind|operator|e2e" || echo "No test-related temp directories found" -echo "" - -EOF - -log_success "Post-cleanup state collected" - -if $CLEANUP_FAILED; then - echo "Beaker Cleanup - COMPLETED WITH ERRORS" - echo "Some cleanup operations failed" - echo "Check logs in ${ARTIFACT_DIR}/cleanup-logs/" - echo "Note: Cleanup failures are non-fatal. Exiting with success." - date - exit 0 -fi - -echo "Beaker Cleanup - Completed Successfully" -echo "Beaker Machine: ${BEAKER_IP}" -echo "Cluster Deleted: ${KIND_CLUSTER_NAME}" -echo "Images Cleaned: ${CLEANUP_IMAGES}" -echo "Services Restarted: ${RESTART_SERVICES}" -echo "" -echo "Cleanup Logs: ${ARTIFACT_DIR}/cleanup-logs/" -echo "Archived Logs: ${ARTIFACT_DIR}/cleanup-logs/archived-logs/" -date - -# ============================================================================ -# CRITICAL: Release Exclusive Lock on Beaker Machine -# ============================================================================ -# This releases the lock acquired by the provision script, allowing other -# CI jobs to use the Beaker machine. -# -# The lock is held by a background process started during provisioning. -# We release it by sending a SIGUSR1 signal to that process. -# ============================================================================ - -log_info "Releasing exclusive lock on Beaker machine..." - -if [ -f "${SHARED_DIR}/beaker_lock_info" ]; then - source "${SHARED_DIR}/beaker_lock_info" - - log_info "Lock was acquired at: ${LOCK_ACQUIRED_AT:-unknown}" - log_info "Lock file: ${LOCK_FILE}" - log_info "Lock holder PID: ${LOCK_HOLDER_PID}" - log_info "Lock holder ID: ${LOCK_HOLDER_ID}" - log_info "Lock holder log: ${LOCK_HOLDER_LOG:-/tmp/lock-holder.log}" - - # Release the lock by signaling the lock holder process - if ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" bash -s -- \ - "${LOCK_FILE}" "${LOCK_HOLDER_PID}" "${LOCK_HOLDER_LOG:-/tmp/lock-holder.log}" << 'RELEASESCRIPT' -LOCK_FILE="$1" -LOCK_HOLDER_PID="$2" -LOCK_HOLDER_LOG="$3" - -echo "[INFO] Releasing lock by terminating lock holder process..." -echo "[INFO] Lock file: ${LOCK_FILE}" -echo "[INFO] Lock holder PID: ${LOCK_HOLDER_PID}" -echo "[INFO] Lock holder log: ${LOCK_HOLDER_LOG}" - -# Check if lock holder process is still running -if ps -p "${LOCK_HOLDER_PID}" > /dev/null 2>&1; then - echo "[INFO] Lock holder process is running, sending SIGUSR1 signal..." - - # Send SIGUSR1 to gracefully release the lock - if kill -USR1 "${LOCK_HOLDER_PID}" 2>/dev/null; then - echo "[INFO] Signal sent, waiting for process to exit..." - - # Wait up to 10 seconds for graceful exit - for i in {1..10}; do - if ! ps -p "${LOCK_HOLDER_PID}" > /dev/null 2>&1; then - echo "[SUCCESS] Lock holder process exited gracefully" - break - fi - sleep 1 - done - - # If still running, force kill - if ps -p "${LOCK_HOLDER_PID}" > /dev/null 2>&1; then - echo "[WARN] Lock holder did not exit gracefully, forcing termination..." - kill -9 "${LOCK_HOLDER_PID}" 2>/dev/null || true - sleep 1 - fi - else - echo "[WARN] Failed to send signal, trying force kill..." - kill -9 "${LOCK_HOLDER_PID}" 2>/dev/null || true - fi -else - echo "[INFO] Lock holder process not running (already exited or timed out)" -fi - -# Clean up lock files for THIS job only -# IMPORTANT: Only delete files that belong to our PID! -echo "[INFO] Cleaning up lock files for this job..." - -# Check if .holder and .pid files belong to our job before deleting -CURRENT_PID_IN_FILE="" -if [ -f "${LOCK_FILE}.pid" ]; then - CURRENT_PID_IN_FILE=$(cat "${LOCK_FILE}.pid" 2>/dev/null || echo "") -fi - -if [ "${CURRENT_PID_IN_FILE}" = "${LOCK_HOLDER_PID}" ]; then - # These files belong to our job, safe to delete - echo "[INFO] Lock files belong to our job (PID ${LOCK_HOLDER_PID}), deleting..." - rm -f "${LOCK_FILE}.holder" "${LOCK_FILE}.pid" "${LOCK_FILE}.holder.tmp" "${LOCK_FILE}.pid.tmp" 2>/dev/null || true - echo "[INFO] Deleted .holder and .pid files" -else - # Another job has already acquired the lock and created new files - echo "[INFO] Lock files already updated by next job (PID ${CURRENT_PID_IN_FILE}), not deleting" - # Clean up any temp files from our job that might be left over - rm -f "${LOCK_FILE}.holder.tmp" "${LOCK_FILE}.pid.tmp" 2>/dev/null || true -fi - -# Only delete our specific log file, not others -if [ -f "${LOCK_HOLDER_LOG}" ]; then - rm -f "${LOCK_HOLDER_LOG}" 2>/dev/null || true - echo "[INFO] Deleted lock holder log: ${LOCK_HOLDER_LOG}" -fi - -# Only delete the main lock file if no other jobs are waiting -# Check if there are any other hold_lock.sh processes running -# Use wc -l instead of pgrep -c to avoid multi-line output issues -OTHER_LOCK_PROCESSES=$(pgrep "hold_lock.sh" 2>/dev/null | wc -l) -if [ "${OTHER_LOCK_PROCESSES}" -eq "0" ]; then - echo "[INFO] No other lock holder processes detected, safe to remove lock file" - rm -f "${LOCK_FILE}" 2>/dev/null || true - # Verify it's deleted - if [ ! -f "${LOCK_FILE}" ]; then - echo "[SUCCESS] Lock file removed" - else - echo "[WARN] Lock file still exists after removal attempt" - fi -else - echo "[INFO] Other jobs waiting for lock (${OTHER_LOCK_PROCESSES} processes), keeping lock file" - # The lock file will be released automatically when the flock is released -fi - -# Verify our job's lock is released -if [ ! -f "${LOCK_FILE}.pid" ]; then - echo "[SUCCESS] This job's lock files cleaned up successfully" - echo "[SUCCESS] Beaker machine is now available for next CI job" - exit 0 -else - echo "[WARN] Some lock files may still exist, but this job's process is terminated" - exit 0 -fi -RELEASESCRIPT - then - log_success "Lock released successfully on Beaker machine" - log_info "Beaker machine is now available for other CI jobs" - else - log_warn "Lock release script failed, but this is non-fatal" - log_warn "Lock will auto-release after 4-hour safety timeout" - fi - - # Archive lock holder log for debugging (before it gets deleted) - log_info "Archiving lock holder log..." - if [ -n "${LOCK_HOLDER_LOG:-}" ]; then - scp "${SSHOPTS[@]}" \ - "${BEAKER_USER}@${BEAKER_IP}:${LOCK_HOLDER_LOG}" \ - "${ARTIFACT_DIR}/cleanup-logs/lock-holder.log" 2>&1 || log_warn "Could not archive lock holder log" - else - log_warn "Lock holder log path not found in beaker_lock_info" - fi - -else - log_warn "Lock info not found in ${SHARED_DIR}/beaker_lock_info" - log_warn "This might mean the lock was never acquired or already released" - log_info "Attempting cleanup of any stale lock files..." - - # Best-effort cleanup when beaker_lock_info is missing - # This handles timeout scenarios where the job never saved lock info - LOCK_FILE="${LOCK_FILE:-/tmp/tec-operator-ci.lock}" - - ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" bash -s -- "${LOCK_FILE}" << 'STALE_CLEANUP' || log_warn "Stale lock cleanup had issues" -LOCK_FILE="$1" - -if ls "${LOCK_FILE}"* >/dev/null 2>&1; then - echo "[INFO] Found stale lock files, cleaning up..." - - # Try to kill any lock holder processes - if [ -f "${LOCK_FILE}.pid" ]; then - LOCK_PID=$(cat "${LOCK_FILE}.pid" 2>/dev/null || echo "") - if [ -n "${LOCK_PID}" ] && ps -p "${LOCK_PID}" >/dev/null 2>&1; then - echo "[INFO] Stopping lock holder process ${LOCK_PID}..." - kill -USR1 "${LOCK_PID}" 2>/dev/null || kill -9 "${LOCK_PID}" 2>/dev/null || true - fi - fi - - # Kill any orphaned hold_lock.sh processes - pkill -9 -f "hold_lock.sh" 2>/dev/null || true - - # Remove lock files - rm -f "${LOCK_FILE}" "${LOCK_FILE}.holder" "${LOCK_FILE}.pid" 2>/dev/null || true - - echo "[SUCCESS] Stale lock files cleaned up" -else - echo "[INFO] No stale lock files found" -fi -STALE_CLEANUP - - log_info "Stale lock cleanup completed" -fi - -log_info "Lock release procedure completed" - -exit 0 diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/trusted-execution-clusters-ref-operator-beaker-deprovision-ref.metadata.json b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/trusted-execution-clusters-ref-operator-beaker-deprovision-ref.metadata.json deleted file mode 100644 index 5f6d48f6a4002..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/trusted-execution-clusters-ref-operator-beaker-deprovision-ref.metadata.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "path": "trusted-execution-clusters/ref/operator/beaker-deprovision/trusted-execution-clusters-ref-operator-beaker-deprovision-ref.yaml", - "owners": { - "approvers": [ - "alicefr", - "Jakob-Naucke", - "fangge1212", - "yalzhang" - ], - "reviewers": [ - "alicefr", - "yalzhang", - "Jakob-Naucke", - "fangge1212" - ] - } -} \ No newline at end of file diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/trusted-execution-clusters-ref-operator-beaker-deprovision-ref.yaml b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/trusted-execution-clusters-ref-operator-beaker-deprovision-ref.yaml deleted file mode 100644 index 349256c73a814..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-deprovision/trusted-execution-clusters-ref-operator-beaker-deprovision-ref.yaml +++ /dev/null @@ -1,78 +0,0 @@ -ref: - as: trusted-execution-clusters-ref-operator-beaker-deprovision - from_image: - namespace: ci - name: telco-runner - tag: latest - commands: trusted-execution-clusters-ref-operator-beaker-deprovision-commands.sh - credentials: - - namespace: test-credentials - name: beaker-bm - mount_path: /var/run/beaker-bm - resources: - requests: - cpu: 500m - memory: 500Mi - limits: - memory: 1Gi - env: - - name: BEAKER_IP - default: "" - documentation: |- - IP address of the Beaker provisioned bare metal machine. - If not set, will read from Vault or ${SHARED_DIR}/beaker_ip - - name: BEAKER_USER - default: "root" - documentation: |- - Username for SSH access to the Beaker machine. - If not set, will read from Vault or default to 'root' - - name: DEPROVISION_TIMEOUT - default: "600" - documentation: |- - Timeout in seconds for the cleanup operations. - Defaults to 600 seconds (10 minutes). - - name: CLEANUP_IMAGES - default: "true" - documentation: |- - Whether to clean up container images (true/false). - Defaults to 'true'. Set to 'false' to preserve images for debugging. - - name: RESTART_SERVICES - default: "true" - documentation: |- - Whether to restart container runtime services after cleanup (true/false). - Defaults to 'true'. - documentation: |- - This step performs cleanup operations on the Beaker bare metal machine, - removing all resources created during CI testing. - - Cleanup operations performed: - 1. Delete Kind cluster using 'kind delete cluster' - 2. Clean container resources (containers, images, volumes) - 3. Remove temporary files (/tmp/kind-*, /tmp/operator-*) - 4. Clean operator working directories (~/operator-kind-setup, ~/operator-pr-code) - 5. Optionally restart Docker service (if RESTART_SERVICES=true) - 6. Archive important logs before cleanup - 7. Verify cleanup completed successfully - - Cleanup Strategy: - - Best-effort cleanup: Individual operations may fail without causing the step to fail - - Ensures cleanup proceeds even if some resources are already gone - - Archives logs before deletion to preserve debugging information - - Restarts services to ensure clean state for future runs - - Prerequisites: - - SSH credentials must be available in Vault - - Beaker machine must be accessible - - Outputs: - - ${ARTIFACT_DIR}/cleanup-logs/pre-cleanup-state.log: System state before cleanup - - ${ARTIFACT_DIR}/cleanup-logs/post-cleanup-state.log: System state after cleanup - - ${ARTIFACT_DIR}/cleanup-logs/cleanup-execution.log: Detailed cleanup operation log - - ${ARTIFACT_DIR}/cleanup-logs/archived-logs/: Logs archived before deletion - - Exit Codes: - - 0: Cleanup completed successfully (best effort) - - 1: Critical failure (e.g., SSH connection issues) - - Note: This cleanup is designed for Docker-based Kind clusters only. - It does not handle Podman, libvirt, or E2E test resources. diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/OWNERS deleted file mode 100644 index 79b41a3630fd0..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -reviewers: - - alicefr - - yalzhang - - Jakob-Naucke - - fangge1212 -approvers: - - alicefr - - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/trusted-execution-clusters-ref-operator-beaker-kind-provision-commands.sh b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/trusted-execution-clusters-ref-operator-beaker-kind-provision-commands.sh deleted file mode 100755 index dcf3e4dd62daa..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/trusted-execution-clusters-ref-operator-beaker-kind-provision-commands.sh +++ /dev/null @@ -1,893 +0,0 @@ -#!/bin/bash - -set -o nounset -set -o pipefail - -if [ -z "${SHARED_DIR:-}" ]; then - echo "[ERROR] SHARED_DIR is not set. This script must run in Prow CI environment." - exit 1 -fi - -if [ -z "${ARTIFACT_DIR:-}" ]; then - echo "[ERROR] ARTIFACT_DIR is not set. This script must run in Prow CI environment." - exit 1 -fi - -echo "==========================================" -echo "Beaker Environment Preparation - Starting" -echo "==========================================" -date - -# Prow CI User Environment Setup -if ! whoami &> /dev/null; then - if [[ -w /etc/passwd ]]; then - echo "[INFO] Creating user entry for UID $(id -u) in /etc/passwd" - echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd - else - echo "[WARN] Cannot write to /etc/passwd, SSH may encounter issues" - fi -fi - -if whoami &> /dev/null; then - echo "[INFO] Current user: $(whoami) (UID: $(id -u))" -fi - -# Global Variables -DEPLOYMENT_STATUS=0 -CRITICAL_FAILURE=false - -SETUP_SCRIPT_TIMEOUT="${SETUP_SCRIPT_TIMEOUT:-1800}" - -OPERATOR_REPO="${OPERATOR_REPO:-https://github.com/trusted-execution-clusters/operator.git}" -OPERATOR_BRANCH="${OPERATOR_BRANCH:-main}" - -TOTAL_STEPS=8 -CURRENT_STEP=0 - -# Helper Functions -progress() { - CURRENT_STEP=$((CURRENT_STEP + 1)) - echo "" - echo "==========================================" - echo "Step ${CURRENT_STEP}/${TOTAL_STEPS}: $1" - echo "==========================================" -} - -log_info() { - echo "[INFO] $1" -} - -log_warn() { - echo "[WARN] $1" -} - -log_error() { - echo "[ERROR] $1" -} - -log_success() { - echo "[SUCCESS] $1" -} - -# Environment Variables Check -progress "Checking environment variables" - -if [ -z "${BEAKER_IP:-}" ]; then - if [ -f "/var/run/beaker-bm/beaker-ip" ]; then - BEAKER_IP=$(cat "/var/run/beaker-bm/beaker-ip") - log_info "Read Beaker IP from Vault secret: ${BEAKER_IP}" - elif [ -f "${SHARED_DIR}/beaker_ip" ]; then - BEAKER_IP=$(cat "${SHARED_DIR}/beaker_ip") - log_info "Read Beaker IP from SHARED_DIR: ${BEAKER_IP}" - else - log_error "BEAKER_IP not found" - exit 1 - fi -else - log_info "Using BEAKER_IP from environment variable: ${BEAKER_IP}" -fi - -if [ -z "${BEAKER_USER:-}" ]; then - if [ -f "/var/run/beaker-bm/beaker-user" ]; then - BEAKER_USER=$(cat "/var/run/beaker-bm/beaker-user") - log_info "Read Beaker user from Vault secret: ${BEAKER_USER}" - else - BEAKER_USER="root" - log_info "Using default Beaker user: ${BEAKER_USER}" - fi -else - log_info "Using BEAKER_USER from environment variable: ${BEAKER_USER}" -fi - -KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:-kind}" -log_info "Kind cluster name: ${KIND_CLUSTER_NAME}" - -CONTAINER_RUNTIME="${CONTAINER_RUNTIME:-docker}" -log_info "Container runtime: ${CONTAINER_RUNTIME}" - -# SSH Key Setup -progress "Setting up SSH key" - -SSH_PKEY_PATH_VAULT="/var/run/beaker-bm/beaker-ssh-private-key" - -if [ -f "${SSH_PKEY_PATH_VAULT}" ]; then - SSH_PKEY_PATH="${SSH_PKEY_PATH_VAULT}" - log_info "Using SSH key from Vault: ${SSH_PKEY_PATH_VAULT}" -elif [ -n "${CLUSTER_PROFILE_DIR:-}" ] && [ -f "${CLUSTER_PROFILE_DIR}/ssh-key" ]; then - SSH_PKEY_PATH="${CLUSTER_PROFILE_DIR}/ssh-key" - log_info "Using SSH key from CLUSTER_PROFILE_DIR: ${CLUSTER_PROFILE_DIR}/ssh-key" -else - log_error "SSH key not found" - exit 1 -fi - -SSH_PKEY="${HOME}/.ssh/beaker_key" -mkdir -p "${HOME}/.ssh" -cp "${SSH_PKEY_PATH}" "${SSH_PKEY}" -chmod 600 "${SSH_PKEY}" -log_info "SSH private key configured at ${SSH_PKEY}" - -# SSH Options Configuration -progress "Configuring SSH connection" - -SSHOPTS=( - -o 'ConnectTimeout=120' - -o 'StrictHostKeyChecking=no' - -o 'UserKnownHostsFile=/dev/null' - -o 'ServerAliveInterval=30' - -o 'ServerAliveCountMax=5' - -o 'LogLevel=ERROR' - -i "${SSH_PKEY}" -) - -log_info "SSH connection timeout set to 120 seconds" - -# Pre-flight Connectivity Check -progress "Pre-flight connectivity check" - -log_info "Testing network connectivity to ${BEAKER_IP}..." -if timeout 5 ping -c 3 "${BEAKER_IP}" &>/dev/null; then - log_success "Beaker machine ${BEAKER_IP} responds to ping" -else - log_warn "Beaker machine ${BEAKER_IP} does not respond to ping (may be expected if ICMP is blocked)" -fi - -# SSH Connection Test with Retry -progress "Establishing SSH connection to Beaker machine" - -log_info "Testing SSH connection to ${BEAKER_USER}@${BEAKER_IP}..." - -MAX_SSH_ATTEMPTS=15 -BASE_RETRY_DELAY=5 - -for attempt in $(seq 1 $MAX_SSH_ATTEMPTS); do - RETRY_DELAY=$((BASE_RETRY_DELAY * attempt / 3)) - [ $RETRY_DELAY -gt 30 ] && RETRY_DELAY=30 - - if ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" "echo 'SSH test successful'; hostname; uptime"; then - log_success "SSH connection established after ${attempt} attempt(s)" - break - else - if [[ $attempt -eq $MAX_SSH_ATTEMPTS ]]; then - log_error "Failed to establish SSH connection after ${MAX_SSH_ATTEMPTS} attempts" - exit 1 - fi - log_warn "SSH connection failed, attempt ${attempt}/${MAX_SSH_ATTEMPTS}. Retrying in ${RETRY_DELAY} seconds..." - sleep $RETRY_DELAY - fi -done - -# ============================================================================ -# CRITICAL: Acquire Exclusive Lock on Beaker Machine -# ============================================================================ -# This prevents multiple CI jobs from running simultaneously on the same -# Beaker machine, which would cause conflicts and test failures. -# -# Lock strategy: -# - Uses flock-based file locking similar to baremetal-lab approach -# - Lock acquired within a persistent script that runs in background -# - Lock held for entire test duration -# - Released by cleanup script in post phase -# -# Lock details: -# - Lock file: /tmp/tec-operator-ci.lock on Beaker machine -# - File descriptor: 200 -# - Acquisition timeout: 21600 seconds (6 hours) -# - Hold timeout: 10800 seconds (3 hours safety auto-release) -# - Behavior: If lock cannot be acquired within 6 hours, job fails -# ============================================================================ - -progress "Acquiring exclusive lock on Beaker machine" - -LOCK_FILE="/tmp/tec-operator-ci.lock" -LOCK_TIMEOUT=21600 # 6 hours in seconds (job runtime ~2h, this allows multiple jobs to queue) - -# Generate unique lock holder ID for this job -LOCK_HOLDER_ID="${NAMESPACE:-unknown}-${BUILD_ID:-unknown}-$(date +%s)" - -log_info "Lock file: ${LOCK_FILE}" -log_info "Lock holder ID: ${LOCK_HOLDER_ID}" -log_info "Lock timeout: ${LOCK_TIMEOUT} seconds (6 hours)" -log_info "This ensures only one CI job runs on the Beaker machine at a time" - -# Create lock acquisition and holding script -# This script will: -# 1. Acquire the lock using flock -# 2. Create a marker file with job info -# 3. Hold the lock by sleeping until signaled -# 4. Release when cleanup script sends signal or timeout occurs -cat > /tmp/hold_lock.sh << 'HOLDLOCKSCRIPT' -#!/bin/bash -set -o nounset -set -o pipefail - -LOCK_FILE="$1" -LOCK_TIMEOUT="$2" -LOCK_HOLDER_ID="$3" -LOCK_FD=200 - -echo "[INFO] Lock acquisition starting..." -echo "[INFO] Lock file: ${LOCK_FILE}" -echo "[INFO] Lock holder: ${LOCK_HOLDER_ID}" -echo "[INFO] Lock FD: ${LOCK_FD}" -echo "[INFO] Timeout: ${LOCK_TIMEOUT} seconds (6 hours)" - -# Cleanup on exit -cleanup_on_exit() { - echo "[INFO] Releasing lock (script exiting)..." - - # Release the flock - flock -u $LOCK_FD 2>/dev/null || true - eval "exec ${LOCK_FD}>&-" 2>/dev/null || true - - # IMPORTANT: Do NOT delete .holder and .pid files here! - # These files are needed by: - # 1. The cleanup script to verify lock release - # 2. The next job to verify it acquired the lock - # The cleanup script will handle deletion of these files properly - - echo "[INFO] Lock released (flock)" -} -trap cleanup_on_exit EXIT INT TERM - -# Open file descriptor for the lock file -touch "${LOCK_FILE}" -eval "exec ${LOCK_FD}<>\"${LOCK_FILE}\"" - -# Try to acquire the lock with timeout -echo "[INFO] Waiting for lock (max ${LOCK_TIMEOUT} seconds)..." -echo "[INFO] If another CI job is running, this job will wait in queue..." - -START_TIME=$(date +%s) - -if flock -w "${LOCK_TIMEOUT}" $LOCK_FD; then - WAIT_TIME=$(($(date +%s) - START_TIME)) - echo "[SUCCESS] Lock acquired after ${WAIT_TIME} seconds" - echo "[INFO] This CI job now has exclusive access to the Beaker machine" - - # Create marker file with job information - # Use atomic write: write to temp file, then rename - SCRIPT_PID=$BASHPID - { - echo "LOCK_HOLDER_ID=${LOCK_HOLDER_ID}" - echo "LOCK_ACQUIRED_AT=$(date -u +'%Y-%m-%d_%H:%M:%S_UTC')" - echo "LOCK_PID=${SCRIPT_PID}" - } > "${LOCK_FILE}.holder.tmp" - - # Atomic rename to ensure file is complete when it appears - mv "${LOCK_FILE}.holder.tmp" "${LOCK_FILE}.holder" - - # Save our PID for cleanup script (atomic write) - SCRIPT_PID=$BASHPID - printf "%s\n" "${SCRIPT_PID}" > "${LOCK_FILE}.pid.tmp" - sync # Ensure data is written to disk - mv "${LOCK_FILE}.pid.tmp" "${LOCK_FILE}.pid" - - echo "[INFO] Lock holder info saved" - echo "[INFO] Lock is now active and will be held until cleanup" - - # Hold the lock by waiting for signal from cleanup script - # Also add a safety timeout to auto-release after 3 hours - HOLD_TIMEOUT=10800 # 3 hours safety timeout (job runtime ~2h + buffer) - - echo "[INFO] Holding lock for up to ${HOLD_TIMEOUT} seconds (3 hour safety limit)..." - echo "[INFO] Lock will be explicitly released by cleanup script" - - # Wait for either: - # 1. SIGUSR1 signal from cleanup script (normal release) - # 2. Timeout after 3 hours (safety release) - sleep ${HOLD_TIMEOUT} & - SLEEP_PID=$! - - # Set up signal handler for cleanup script - trap "kill ${SLEEP_PID} 2>/dev/null; echo '[INFO] Received release signal from cleanup script'; exit 0" USR1 - - # Wait for either signal or timeout - wait ${SLEEP_PID} 2>/dev/null - - echo "[WARN] Lock holding timeout reached (3 hours) - auto-releasing" - exit 0 - -else - echo "[ERROR] Failed to acquire lock after ${LOCK_TIMEOUT} seconds" - echo "[ERROR] The Beaker machine is still busy with another CI job" - echo "[ERROR] " - echo "[ERROR] Current lock holder (if exists):" - cat "${LOCK_FILE}.holder" 2>/dev/null || echo "[ERROR] No lock holder info found" - echo "[ERROR] " - echo "[ERROR] This usually means:" - echo "[ERROR] - Another PR's test is still running" - echo "[ERROR] - A previous test failed to release the lock (stale lock)" - echo "[ERROR] " - echo "[ERROR] Recommended actions:" - echo "[ERROR] 1. Wait a few minutes and /retest" - echo "[ERROR] 2. Check if other PRs have running tests" - echo "[ERROR] 3. If lock is stale (holder PID not running), manually remove:" - echo "[ERROR] ssh to Beaker machine and: rm -f ${LOCK_FILE}*" - exit 1 -fi -HOLDLOCKSCRIPT - -chmod +x /tmp/hold_lock.sh - -# Transfer the lock script to Beaker machine -log_info "Transferring lock holder script to Beaker machine..." -if ! scp "${SSHOPTS[@]}" /tmp/hold_lock.sh "${BEAKER_USER}@${BEAKER_IP}:/tmp/hold_lock.sh"; then - log_error "Failed to transfer lock script to Beaker machine" - exit 3 -fi - -# Start the lock holding process in background on Beaker machine -# This process will keep running and holding the lock until cleanup -log_info "Starting lock holder process on Beaker machine..." -log_info "Acquiring lock (timeout: ${LOCK_TIMEOUT}s / 6 hours)..." - -# Use unique log file for this job to avoid conflicts -LOCK_HOLDER_LOG="/tmp/lock-holder-${LOCK_HOLDER_ID}.log" - -# Run the lock holder in background, save output to unique log file -if ! ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" \ - "nohup bash /tmp/hold_lock.sh '${LOCK_FILE}' '${LOCK_TIMEOUT}' '${LOCK_HOLDER_ID}' > '${LOCK_HOLDER_LOG}' 2>&1 &"; then - log_error "Failed to start lock holder process" - exit 3 -fi - -# Wait for lock holder process to either acquire the lock or fail -log_info "Waiting for lock holder process to acquire lock or timeout..." -log_info "This may take up to ${LOCK_TIMEOUT} seconds if another job is running..." - -# Poll the lock holder log to see if our job acquired the lock -MAX_POLL_TIME=$((LOCK_TIMEOUT + 30)) # Add 30s buffer for startup -POLL_INTERVAL=5 -ELAPSED=0 - -while [ ${ELAPSED} -lt ${MAX_POLL_TIME} ]; do - # Check if our specific lock holder ID appears in the holder file - CURRENT_HOLDER=$(ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" \ - "grep 'LOCK_HOLDER_ID' '${LOCK_FILE}.holder' 2>/dev/null | cut -d= -f2" || echo "") - - if [ "${CURRENT_HOLDER}" = "${LOCK_HOLDER_ID}" ]; then - # Our job acquired the lock! - log_success "Lock acquired successfully by this job!" - break - fi - - # Check if the lock holder process exited (failed to acquire) - LOCK_HOLDER_LOG_TAIL=$(ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" \ - "tail -5 '${LOCK_HOLDER_LOG}' 2>/dev/null" || echo "") - - if echo "${LOCK_HOLDER_LOG_TAIL}" | grep -q "Failed to acquire lock"; then - log_error "Lock acquisition failed - timeout exceeded" - log_error "Lock holder log shows:" - ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" "cat '${LOCK_HOLDER_LOG}'" || true - exit 3 - fi - - # Still waiting, sleep and check again - if [ $((ELAPSED % 30)) -eq 0 ]; then - if [ "${CURRENT_HOLDER}" != "" ]; then - log_info "Still waiting for lock... (currently held by: ${CURRENT_HOLDER})" - else - log_info "Still waiting for lock... (${ELAPSED}s elapsed)" - fi - fi - - sleep ${POLL_INTERVAL} - ELAPSED=$((ELAPSED + POLL_INTERVAL)) -done - -# Verify lock was actually acquired by this job -FINAL_HOLDER=$(ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" \ - "grep 'LOCK_HOLDER_ID' '${LOCK_FILE}.holder' 2>/dev/null | cut -d= -f2" || echo "") - -if [ "${FINAL_HOLDER}" != "${LOCK_HOLDER_ID}" ]; then - log_error "Lock verification failed after ${ELAPSED}s" - log_error "Expected holder: ${LOCK_HOLDER_ID}" - log_error "Actual holder: ${FINAL_HOLDER}" - log_error "" - log_error "Lock holder log:" - ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" "cat '${LOCK_HOLDER_LOG}'" || true - exit 3 -fi - -# Read lock holder info now that we've confirmed it's ours -LOCK_HOLDER_INFO=$(ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" "cat '${LOCK_FILE}.holder'") -LOCK_HOLDER_PID=$(ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" "cat '${LOCK_FILE}.pid'") - -log_success "Lock acquired and verified!" -log_info "Lock holder PID: ${LOCK_HOLDER_PID}" -log_info "Lock holder details:" -echo "${LOCK_HOLDER_INFO}" | while read line; do log_info " $line"; done - -# Save lock information to SHARED_DIR for the cleanup script -LOCK_ACQUIRED_AT_VALUE="$(date -u +'%Y-%m-%d_%H:%M:%S_UTC')" -cat > "${SHARED_DIR}/beaker_lock_info" << LOCKINFO -LOCK_FILE=${LOCK_FILE} -LOCK_HOLDER_ID=${LOCK_HOLDER_ID} -LOCK_HOLDER_PID=${LOCK_HOLDER_PID} -LOCK_HOLDER_LOG=${LOCK_HOLDER_LOG} -LOCK_ACQUIRED=true -LOCK_ACQUIRED_AT=${LOCK_ACQUIRED_AT_VALUE} -LOCKINFO - -log_info "Lock information saved to ${SHARED_DIR}/beaker_lock_info" -log_info "This job now has exclusive access to the Beaker machine" - -# Log Collection Function -collect_deployment_logs() { - log_info "Collecting deployment logs and artifacts..." - - local collection_failed=false - mkdir -p "${ARTIFACT_DIR}/beaker-logs" - - local REMOTE_SUDO="" - if [ "${BEAKER_USER}" != "root" ]; then - REMOTE_SUDO="sudo" - fi - - scp "${SSHOPTS[@]}" \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/kind-deployment-logs/deployment.log" \ - "${ARTIFACT_DIR}/beaker-logs/deployment.log" 2>&1 || { - log_warn "Failed to collect deployment log" - collection_failed=true - } - - ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" \ - "${REMOTE_SUDO} journalctl -u ${CONTAINER_RUNTIME} --no-pager -n 500" \ - > "${ARTIFACT_DIR}/beaker-logs/${CONTAINER_RUNTIME}.log" 2>&1 || { - log_warn "Failed to collect runtime logs" - collection_failed=true - } - - scp "${SSHOPTS[@]}" \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/kind-deployment-logs/*.log" \ - "${ARTIFACT_DIR}/beaker-logs/" 2>&1 || { - log_warn "Failed to collect Kind logs" - collection_failed=true - } - - ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" \ - "uname -a; echo '---'; free -h; echo '---'; df -h; echo '---'; ip addr" \ - > "${ARTIFACT_DIR}/beaker-logs/system-info.log" 2>&1 || { - log_warn "Failed to collect system info" - collection_failed=true - } - - if [ -f "${SHARED_DIR}/kubeconfig" ]; then - export KUBECONFIG="${SHARED_DIR}/kubeconfig" - kubectl get nodes -o yaml > "${ARTIFACT_DIR}/beaker-logs/k8s-nodes.yaml" 2>&1 || true - kubectl get pods -A -o yaml > "${ARTIFACT_DIR}/beaker-logs/k8s-pods.yaml" 2>&1 || true - kubectl version > "${ARTIFACT_DIR}/beaker-logs/k8s-version.log" 2>&1 || true - fi - - if $collection_failed; then - return 1 - else - log_success "All logs collected successfully" - return 0 - fi -} - -# Prepare Beaker Environment with Dependencies and operator Repository -progress "Preparing Beaker environment" - -log_info "Generating environment setup script..." - -cat > /tmp/beaker-setup.sh << 'SETUPSCRIPT' -#!/bin/bash -set -o nounset -set -o pipefail -set -x - -export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH:-}" - -KIND_CLUSTER_NAME="${1:-kind}" -CONTAINER_RUNTIME="${2:-docker}" -BEAKER_IP="${3:-127.0.0.1}" -OPERATOR_REPO="${4:-https://github.com/trusted-execution-clusters/operator.git}" -OPERATOR_BRANCH="${5:-main}" - -echo "==========================================" -echo "Running on Beaker machine: $(hostname)" -echo "Beaker IP: ${BEAKER_IP}" -echo "Runtime: ${CONTAINER_RUNTIME}" -echo "Date: $(date)" -echo "==========================================" - -if [ "$(id -u)" -eq 0 ]; then - SUDO="" -else - SUDO="sudo" -fi - -mkdir -p /tmp/kind-deployment-logs -exec > >(tee -a /tmp/kind-deployment-logs/deployment.log) -exec 2>&1 - -# Install Rust -echo "[INFO] Installing Rust via rustup..." -if ! command -v rustc &> /dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -else - echo "[INFO] Rust is already installed: $(rustc --version)" - source "$HOME/.cargo/env" - rustup update -fi - -RUST_PROFILE_SCRIPT="/etc/profile.d/rust.sh" -if ! [ -f "${RUST_PROFILE_SCRIPT}" ]; then - echo 'export PATH="$HOME/.cargo/bin:$PATH"' | ${SUDO} tee "${RUST_PROFILE_SCRIPT}" -fi -source "${RUST_PROFILE_SCRIPT}" -rustc --version -cargo --version - -echo "[SUCCESS] Rust installed successfully" - -# Start Container Runtime Service -echo "[INFO] Starting Docker service..." - -${SUDO} systemctl enable --now docker -echo "[INFO] Docker service enabled and started" - -docker --version - -echo "[INFO] Verifying ${CONTAINER_RUNTIME} is working..." -${SUDO} systemctl status ${CONTAINER_RUNTIME} --no-pager || true -sleep 5 - -if ! ${SUDO} ${CONTAINER_RUNTIME} version; then - echo "[ERROR] ${CONTAINER_RUNTIME} is not working properly" - JOURNAL_LOG="/tmp/kind-deployment-logs/${CONTAINER_RUNTIME}-journal.log" - ${SUDO} journalctl -u ${CONTAINER_RUNTIME} --no-pager -n 50 > "${JOURNAL_LOG}" - exit 1 -fi - -echo "[SUCCESS] ${CONTAINER_RUNTIME} is running" - -# Ensure containerd directories exist and have correct permissions -echo "[INFO] Ensuring containerd directories exist..." -${SUDO} mkdir -p /var/lib/containerd/io.containerd.content.v1.content/ingest -${SUDO} mkdir -p /var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots -${SUDO} mkdir -p /var/lib/containerd/tmpmounts -${SUDO} chmod -R 755 /var/lib/containerd - -echo "[INFO] Restarting Docker to ensure containerd is properly initialized..." -${SUDO} systemctl restart docker -sleep 10 - -echo "[INFO] Verifying Docker and containerd after restart..." -if ! ${SUDO} docker info > /dev/null 2>&1; then - echo "[ERROR] Docker is not responding after restart" - ${SUDO} journalctl -u docker --no-pager -n 100 > /tmp/kind-deployment-logs/docker-restart-journal.log - exit 1 -fi - -echo "[SUCCESS] Docker and containerd are properly initialized" - -# Install kubectl -echo "[INFO] Installing kubectl..." - -K8S_VERSION="v1.29.0" -INSTALLED_K8S_VERSION="" -KUBECTL_PATH="/usr/local/bin/kubectl" -if command -v kubectl &> /dev/null; then - INSTALLED_K8S_VERSION=$(kubectl version --client -o=json 2>/dev/null | \ - jq -r .clientVersion.gitVersion || echo "unknown") -fi -if [[ "${INSTALLED_K8S_VERSION}" == "${K8S_VERSION}" ]]; then - echo "[INFO] kubectl is already installed at the desired version (${K8S_VERSION})." -else - echo "[INFO] Installing kubectl version ${K8S_VERSION}..." - KUBECTL_URL="https://dl.k8s.io/release/${K8S_VERSION}/bin/linux/amd64/kubectl" - - if ! curl -Lo ./kubectl "${KUBECTL_URL}"; then - echo "[ERROR] Failed to download kubectl from ${KUBECTL_URL}" - exit 1 - fi - - if file ./kubectl | grep -q "ELF.*executable"; then - echo "[INFO] Download verified: ELF executable" - else - echo "[ERROR] Downloaded file is not an ELF executable" - exit 1 - fi - - chmod +x kubectl - ${SUDO} mv kubectl "${KUBECTL_PATH}" -fi - -kubectl version --client || { - echo "[ERROR] kubectl installation failed" - exit 1 -} - -echo "[SUCCESS] kubectl installed" - -# Install Kind -echo "[INFO] Installing Kind..." -KIND_VERSION="v0.30.0" -KIND_PATH="/usr/local/bin/kind" -if [[ "$(kind version -q 2>/dev/null)" == "${KIND_VERSION}" ]]; then - echo "[INFO] kind is already installed at the desired version (${KIND_VERSION})." -else - echo "[INFO] Installing kind version ${KIND_VERSION}..." - KIND_URL="https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-amd64" - - if ! curl -Lo ./kind "${KIND_URL}"; then - echo "[ERROR] Failed to download kind from ${KIND_URL}" - exit 1 - fi - - if file ./kind | grep -q "ELF.*executable"; then - echo "[INFO] Download verified: ELF executable" - else - echo "[ERROR] Downloaded file is not an ELF executable" - exit 1 - fi - - chmod +x kind - ${SUDO} mv kind "${KIND_PATH}" -fi - -kind version || { - echo "[ERROR] Kind installation failed" - exit 1 -} - -echo "[SUCCESS] Kind installed" - -# Install Go -echo "[INFO] Installing Go 1.25.0 or higher..." - -GO_VERSION="1.25.0" -GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" -GO_INSTALL_DIR="/usr/local" -GO_DOWNLOAD_URL="https://go.dev/dl/${GO_TARBALL}" - -# Check if Go is already installed and meets version requirement -CURRENT_GO_VERSION="" -if command -v go &> /dev/null; then - CURRENT_GO_VERSION=$(go version | awk '{print $3}' | sed 's/go//') - echo "[INFO] Current Go version: ${CURRENT_GO_VERSION}" - - # Compare versions (simple comparison, assumes format X.Y.Z) - CURRENT_MAJOR=$(echo "${CURRENT_GO_VERSION}" | cut -d. -f1) - CURRENT_MINOR=$(echo "${CURRENT_GO_VERSION}" | cut -d. -f2) - REQUIRED_MAJOR=$(echo "${GO_VERSION}" | cut -d. -f1) - REQUIRED_MINOR=$(echo "${GO_VERSION}" | cut -d. -f2) - - if [ "${CURRENT_MAJOR}" -gt "${REQUIRED_MAJOR}" ] || \ - { [ "${CURRENT_MAJOR}" -eq "${REQUIRED_MAJOR}" ] && [ "${CURRENT_MINOR}" -ge "${REQUIRED_MINOR}" ]; }; then - echo "[INFO] Go version ${CURRENT_GO_VERSION} meets requirement (>= ${GO_VERSION})" - else - echo "[WARN] Go version ${CURRENT_GO_VERSION} is lower than required ${GO_VERSION}, upgrading..." - NEED_UPGRADE=true - fi -else - echo "[INFO] Go not found, installing version ${GO_VERSION}..." - NEED_UPGRADE=true -fi - -if [ "${NEED_UPGRADE:-false}" = "true" ]; then - echo "[INFO] Downloading Go ${GO_VERSION} from ${GO_DOWNLOAD_URL}..." - - if ! curl -L -f -o "/tmp/${GO_TARBALL}" "${GO_DOWNLOAD_URL}"; then - echo "[ERROR] Failed to download Go from ${GO_DOWNLOAD_URL}" - exit 1 - fi - - echo "[INFO] Extracting Go to ${GO_INSTALL_DIR}..." - ${SUDO} rm -rf "${GO_INSTALL_DIR}/go" - ${SUDO} tar -C "${GO_INSTALL_DIR}" -xzf "/tmp/${GO_TARBALL}" - rm -f "/tmp/${GO_TARBALL}" - - echo "[INFO] Setting up Go environment..." - export PATH="${GO_INSTALL_DIR}/go/bin:${PATH}" - - # Ensure Go is in PATH for all users - GO_PROFILE_SCRIPT="/etc/profile.d/go.sh" - if ! [ -f "${GO_PROFILE_SCRIPT}" ]; then - echo "export PATH=\"${GO_INSTALL_DIR}/go/bin:\${PATH}\"" | ${SUDO} tee "${GO_PROFILE_SCRIPT}" - fi -fi - -# Verify installation -if ! go version; then - echo "[ERROR] Go installation verification failed" - exit 1 -fi - -echo "[SUCCESS] Go installed: $(go version)" - -# Configure Go module proxy -echo "[INFO] Configuring Go module proxy..." -GOPROXY="${GOPROXY:-https://proxy.golang.org,direct}" -GOSUMDB="${GOSUMDB:-sum.golang.org}" - -go env -w GOPROXY="${GOPROXY}" -go env -w GOSUMDB="${GOSUMDB}" - -echo "[SUCCESS] Go proxy configured: ${GOPROXY}" - -# Clone operator Repository -echo "[INFO] Cloning operator repository..." - -WORK_DIR="${HOME}/operator-kind-setup" -rm -rf "${WORK_DIR}" - -echo "[INFO] Repository: ${OPERATOR_REPO}" -echo "[INFO] Branch: ${OPERATOR_BRANCH}" - -if ! git clone --depth 1 --branch "${OPERATOR_BRANCH}" "${OPERATOR_REPO}" "${WORK_DIR}"; then - echo "[ERROR] Failed to clone repository from ${OPERATOR_REPO}" - exit 1 -fi - -cd "${WORK_DIR}" - -CURRENT_COMMIT=$(git rev-parse HEAD) -CURRENT_COMMIT_SHORT=$(git rev-parse --short HEAD) - -echo "[SUCCESS] Repository cloned to ${WORK_DIR}" -echo "[INFO] Current commit: ${CURRENT_COMMIT_SHORT} (${CURRENT_COMMIT})" -# Temporarily disable pipefail to avoid SIGPIPE from head closing the pipe early -set +o pipefail -echo "[INFO] Commit message: $(git log -1 --pretty=%B | head -1)" -set -o pipefail - -# Adapt Kind Config for External Access -echo "[INFO] Adapting kind configuration for external access..." - -cp kind/config.yaml kind/config.yaml.orig - -cat > kind/config.yaml << KINDCONFIG -kind: Cluster -apiVersion: kind.x-k8s.io/v1alpha4 -networking: - apiServerAddress: "${BEAKER_IP}" - apiServerPort: 6443 -containerdConfigPatches: -- |- - [plugins."io.containerd.grpc.v1.cri".registry] - config_path = "/etc/containerd/certs.d" -nodes: -- role: control-plane - extraPortMappings: - - containerPort: 31000 - hostPort: 8080 - protocol: TCP - - containerPort: 31001 - hostPort: 8000 - protocol: TCP -featureGates: - "ImageVolume": true -KINDCONFIG - -echo "[SUCCESS] Kind configuration adapted for external access" - -# Set Runtime Environment -export RUNTIME="${CONTAINER_RUNTIME}" -echo "[SUCCESS] Runtime environment configured: ${RUNTIME}" - -# Verify Tools Installation -echo "[INFO] Verifying installed tools..." -${RUNTIME} version -kubectl version --client -kind version -git --version -rustc --version -go version - -echo "[SUCCESS] All tools and dependencies installed successfully" - -SETUPSCRIPT - -chmod +x /tmp/beaker-setup.sh - -log_success "Deployment script generated ($(wc -l < /tmp/beaker-setup.sh) lines)" - -# Transfer Script to Beaker Machine -log_info "Transferring deployment script to Beaker machine..." - -if ! scp "${SSHOPTS[@]}" /tmp/beaker-setup.sh "${BEAKER_USER}@${BEAKER_IP}:/tmp/beaker-setup.sh"; then - log_error "Failed to transfer deployment script to Beaker machine" - exit 2 -fi - -log_success "Script transferred successfully" - -# Execute Script on Beaker Machine -log_info "Executing environment setup script on Beaker machine..." -log_info "Timeout: ${SETUP_SCRIPT_TIMEOUT} seconds" - -SETUP_CMD="export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:\${PATH:-};" -SETUP_CMD+=" [ -f /etc/profile ] && source /etc/profile 2>/dev/null || true;" -SETUP_CMD+=" bash /tmp/beaker-setup.sh" -SETUP_CMD+=" '${KIND_CLUSTER_NAME}' '${CONTAINER_RUNTIME}' '${BEAKER_IP}'" -SETUP_CMD+=" '${OPERATOR_REPO}' '${OPERATOR_BRANCH}'" - -if ! timeout "${SETUP_SCRIPT_TIMEOUT}" ssh "${SSHOPTS[@]}" \ - "${BEAKER_USER}@${BEAKER_IP}" "${SETUP_CMD}"; then - log_error "Remote deployment script failed or timed out" - CRITICAL_FAILURE=true - DEPLOYMENT_STATUS=2 -fi - -if $CRITICAL_FAILURE; then - log_error "Critical failure during deployment" - collect_deployment_logs || true - exit ${DEPLOYMENT_STATUS} -fi - -log_success "Environment preparation completed successfully" - -# Collect Deployment Logs and Artifacts -progress "Collecting logs and artifacts" - -collect_deployment_logs || log_warn "Log collection encountered errors" - -# Save Deployment Metadata -progress "Saving deployment metadata" - -cat > "${SHARED_DIR}/beaker_info" << EOFINFO -BEAKER_IP=${BEAKER_IP} -BEAKER_USER=${BEAKER_USER} -KIND_CLUSTER_NAME=${KIND_CLUSTER_NAME} -CONTAINER_RUNTIME=${CONTAINER_RUNTIME} -OPERATOR_REPO=${OPERATOR_REPO} -OPERATOR_BRANCH=${OPERATOR_BRANCH} -DEPLOYMENT_DATE="$(date -u +"%Y-%m-%d %H:%M:%S UTC")" -EOFINFO - -log_info "Deployment info saved to ${SHARED_DIR}/beaker_info" - -# Final Status Check -if $CRITICAL_FAILURE; then - echo "" - echo "==========================================" - echo "Beaker Environment Preparation - FAILED" - echo "==========================================" - echo "Exit code: ${DEPLOYMENT_STATUS}" - echo "==========================================" - exit ${DEPLOYMENT_STATUS} -fi - -echo "" -echo "==========================================" -echo "Beaker Environment Preparation - Completed Successfully" -echo "==========================================" -echo "Beaker Machine: ${BEAKER_IP}" -echo "Container Runtime: ${CONTAINER_RUNTIME}" -echo "" -echo "Installed Tools:" -echo " - Docker CE" -echo " - kubectl v1.29.0" -echo " - kind v0.30.0" -echo " - git, Rust, Go" -echo "" -echo "Operator repository:" -echo " Repository: ${OPERATOR_REPO}" -echo " Branch: ${OPERATOR_BRANCH}" -echo "==========================================" -date diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/trusted-execution-clusters-ref-operator-beaker-kind-provision-ref.metadata.json b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/trusted-execution-clusters-ref-operator-beaker-kind-provision-ref.metadata.json deleted file mode 100644 index e5a89cbbba249..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/trusted-execution-clusters-ref-operator-beaker-kind-provision-ref.metadata.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "path": "trusted-execution-clusters/ref/operator/beaker-kind-provision/trusted-execution-clusters-ref-operator-beaker-kind-provision-ref.yaml", - "owners": { - "approvers": [ - "alicefr", - "Jakob-Naucke", - "fangge1212", - "yalzhang" - ], - "reviewers": [ - "alicefr", - "yalzhang", - "Jakob-Naucke", - "fangge1212" - ] - } -} \ No newline at end of file diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/trusted-execution-clusters-ref-operator-beaker-kind-provision-ref.yaml b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/trusted-execution-clusters-ref-operator-beaker-kind-provision-ref.yaml deleted file mode 100644 index f96eb17cd6996..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/beaker-kind-provision/trusted-execution-clusters-ref-operator-beaker-kind-provision-ref.yaml +++ /dev/null @@ -1,103 +0,0 @@ -ref: - as: trusted-execution-clusters-ref-operator-beaker-kind-provision - from_image: - namespace: ci - name: telco-runner - tag: latest - commands: trusted-execution-clusters-ref-operator-beaker-kind-provision-commands.sh - grace_period: 5m - credentials: - - namespace: test-credentials - name: beaker-bm - mount_path: /var/run/beaker-bm - resources: - requests: - cpu: 500m - memory: 500Mi - limits: - memory: 1Gi - env: - - name: BEAKER_IP - default: "" - documentation: |- - IP address of the Beaker provisioned bare metal machine. - If not set, the script will try to read from: - 1. Vault-mounted secret: /var/run/beaker-bm/beaker-ip (if exists) - 2. ${SHARED_DIR}/beaker_ip (fallback) - Can be provided as environment variable to override Vault value. - - name: BEAKER_USER - default: "root" - documentation: |- - Username for SSH access to the Beaker machine. - If not set, the script will try to read from: - 1. Vault-mounted secret: /var/run/beaker-bm/beaker-user (if exists) - 2. Default: root - - name: KIND_CLUSTER_NAME - default: "kind" - documentation: |- - Name of the Kind cluster to create. - Defaults to 'kind' if not specified. - - name: CONTAINER_RUNTIME - default: "docker" - documentation: |- - Container runtime to use for Kind cluster. - Currently only 'docker' is supported. - Defaults to 'docker' if not specified. - - name: OPERATOR_REPO - default: "https://github.com/trusted-execution-clusters/operator.git" - documentation: |- - Git repository URL for the trusted-execution-clusters operator project. - The kind cluster configuration will be downloaded from this repository. - - name: OPERATOR_BRANCH - default: "main" - documentation: |- - Branch of the operator repository to use. - Defaults to 'main' if not specified. - - name: KIND_CREATE_TIMEOUT - default: "900" - documentation: |- - Timeout in seconds for the entire Kind cluster creation process. - Defaults to 900 seconds (15 minutes). - - name: CLUSTER_READY_TIMEOUT - default: "300" - documentation: |- - Timeout in seconds for waiting for cluster to be ready. - Defaults to 300 seconds (5 minutes). - documentation: |- - This step prepares the environment on a pre-provisioned Beaker bare metal - machine running Fedora for deploying Kind clusters with the trusted-execution-clusters operator. - - The step performs the following operations: - 1. Connects to the Beaker machine via SSH using the provided credentials - 2. Installs necessary dependencies (Docker CE, kubectl, kind, git, Rust, Go) - 3. Downloads the operator repository with kind cluster configuration - 4. Adapts the kind configuration for external access (apiServerAddress) - 5. Verifies all tools are installed correctly - - Prerequisites: - - A Beaker machine must be pre-provisioned with Fedora - - SSH public key must be pre-configured in ~/.ssh/authorized_keys on the Beaker machine - - SSH credentials stored in Vault at: secrets/kv/selfservice/confidential-qe/beaker-bm - Required fields: - - beaker-ssh-private-key: SSH private key (corresponding public key must be on Beaker machine) - - beaker-ip: IP address of the Beaker machine - Optional fields: - - beaker-user: SSH username (defaults to 'root' if not provided) - - The Vault secret must be manually synced to K8s secret 'beaker-bm' in 'test-credentials' namespace - - The BEAKER_IP can also be provided via environment variable or ${SHARED_DIR}/beaker_ip - - Outputs: - - ${SHARED_DIR}/beaker_info: Information about the Beaker machine and installed tools - - ${ARTIFACT_DIR}/beaker-logs/: Installation logs and diagnostics - - After this step completes, the Beaker machine will have: - - Docker CE installed and running - - kubectl CLI tool installed (v1.29.0) - - kind installed (v0.30.0) - - Rust and Go development tools - - Operator repository downloaded to ~/operator-kind-setup - - kind configuration prepared at ~/operator-kind-setup/kind/config.yaml - - Note: This step only prepares the environment. To create a Kind cluster, - SSH to the Beaker machine and run: - kind create cluster --name kind --config ~/operator-kind-setup/kind/config.yaml diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/OWNERS deleted file mode 100644 index 79b41a3630fd0..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -reviewers: - - alicefr - - yalzhang - - Jakob-Naucke - - fangge1212 -approvers: - - alicefr - - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/trusted-execution-clusters-ref-operator-kind-cluster-create-commands.sh b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/trusted-execution-clusters-ref-operator-kind-cluster-create-commands.sh deleted file mode 100755 index 9b7e48c421ea1..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/trusted-execution-clusters-ref-operator-kind-cluster-create-commands.sh +++ /dev/null @@ -1,313 +0,0 @@ -#!/bin/bash - -set -o nounset -set -o pipefail - -if [ -z "${SHARED_DIR:-}" ]; then - echo "[ERROR] SHARED_DIR is not set. This script must run in Prow CI environment." - exit 1 -fi - -if [ -z "${ARTIFACT_DIR:-}" ]; then - echo "[ERROR] ARTIFACT_DIR is not set. This script must run in Prow CI environment." - exit 1 -fi - -echo "==========================================" -echo "Kind Cluster Creation - Starting" -echo "==========================================" -date - -# Prow CI User Environment Setup -if ! whoami &> /dev/null; then - if [[ -w /etc/passwd ]]; then - echo "[INFO] Creating user entry for UID $(id -u) in /etc/passwd" - echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd - else - echo "[WARN] Cannot write to /etc/passwd, SSH may encounter issues" - fi -fi - -if whoami &> /dev/null; then - echo "[INFO] Current user: $(whoami) (UID: $(id -u))" -fi - -# Global Variables -DEPLOYMENT_STATUS=0 -CRITICAL_FAILURE=false - -CLUSTER_CREATE_TIMEOUT="${CLUSTER_CREATE_TIMEOUT:-900}" - -TOTAL_STEPS=7 -CURRENT_STEP=0 - -# Helper Functions -progress() { - CURRENT_STEP=$((CURRENT_STEP + 1)) - echo "" - echo "==========================================" - echo "Step ${CURRENT_STEP}/${TOTAL_STEPS}: $1" - echo "==========================================" -} - -log_info() { - echo "[INFO] $1" -} - -log_warn() { - echo "[WARN] $1" -} - -log_error() { - echo "[ERROR] $1" -} - -log_success() { - echo "[SUCCESS] $1" -} - -# Read Configuration from Previous Step -progress "Reading configuration from previous step" - -if [ ! -f "${SHARED_DIR}/beaker_info" ]; then - log_error "beaker_info not found. The beaker-kind-provision step must run first." - exit 1 -fi - -source "${SHARED_DIR}/beaker_info" - -log_info "Beaker machine: ${BEAKER_IP}" -log_info "Beaker user: ${BEAKER_USER}" -log_info "Kind cluster name: ${KIND_CLUSTER_NAME}" -log_info "Container runtime: ${CONTAINER_RUNTIME}" - -# SSH Key Setup -progress "Setting up SSH key" - -SSH_PKEY_PATH_VAULT="/var/run/beaker-bm/beaker-ssh-private-key" - -if [ -f "${SSH_PKEY_PATH_VAULT}" ]; then - SSH_PKEY_PATH="${SSH_PKEY_PATH_VAULT}" - log_info "Using SSH key from Vault: ${SSH_PKEY_PATH_VAULT}" -elif [ -n "${CLUSTER_PROFILE_DIR:-}" ] && [ -f "${CLUSTER_PROFILE_DIR}/ssh-key" ]; then - SSH_PKEY_PATH="${CLUSTER_PROFILE_DIR}/ssh-key" - log_info "Using SSH key from CLUSTER_PROFILE_DIR: ${CLUSTER_PROFILE_DIR}/ssh-key" -else - log_error "SSH key not found" - exit 1 -fi - -SSH_PKEY="${HOME}/.ssh/beaker_key" -mkdir -p "${HOME}/.ssh" -cp "${SSH_PKEY_PATH}" "${SSH_PKEY}" -chmod 600 "${SSH_PKEY}" -log_info "SSH private key configured at ${SSH_PKEY}" - -# SSH Options Configuration -progress "Configuring SSH connection" - -SSHOPTS=( - -o 'ConnectTimeout=120' - -o 'StrictHostKeyChecking=no' - -o 'UserKnownHostsFile=/dev/null' - -o 'ServerAliveInterval=30' - -o 'ServerAliveCountMax=5' - -o 'LogLevel=ERROR' - -i "${SSH_PKEY}" -) - -log_info "SSH connection timeout set to 120 seconds" - -# SSH Connection Test with Retry -progress "Establishing SSH connection to Beaker machine" - -log_info "Testing SSH connection to ${BEAKER_USER}@${BEAKER_IP}..." - -MAX_SSH_ATTEMPTS=15 -BASE_RETRY_DELAY=5 - -for attempt in $(seq 1 $MAX_SSH_ATTEMPTS); do - RETRY_DELAY=$((BASE_RETRY_DELAY * attempt / 3)) - [ $RETRY_DELAY -gt 30 ] && RETRY_DELAY=30 - - if ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" "echo 'SSH test successful'; hostname; uptime"; then - log_success "SSH connection established after ${attempt} attempt(s)" - break - else - if [[ $attempt -eq $MAX_SSH_ATTEMPTS ]]; then - log_error "Failed to establish SSH connection after ${MAX_SSH_ATTEMPTS} attempts" - CRITICAL_FAILURE=true - DEPLOYMENT_STATUS=1 - exit 1 - fi - log_warn "SSH connection failed, attempt ${attempt}/${MAX_SSH_ATTEMPTS}. Retrying in ${RETRY_DELAY} seconds..." - sleep $RETRY_DELAY - fi -done - -# Create Kind Cluster on Beaker Machine -progress "Creating Kind cluster on Beaker machine" - -log_info "Executing 'make cluster-up' on Beaker machine..." - -if ! timeout "${CLUSTER_CREATE_TIMEOUT}" ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" bash -s -- \ - "${KIND_CLUSTER_NAME}" "${CONTAINER_RUNTIME}" "${BEAKER_IP}" << 'EOF' - -set -o nounset -set -o pipefail -set -x - -KIND_CLUSTER_NAME="$1" -CONTAINER_RUNTIME="$2" -BEAKER_IP="$3" - -echo "==========================================" -echo "Running on Beaker machine: $(hostname)" -echo "Beaker IP: ${BEAKER_IP}" -echo "Runtime: ${CONTAINER_RUNTIME}" -echo "Cluster name: ${KIND_CLUSTER_NAME}" -echo "Date: $(date)" -echo "==========================================" - -if [ "$(id -u)" -eq 0 ]; then - SUDO="" -else - if command -v sudo &> /dev/null; then - SUDO="sudo" - else - SUDO="" - fi -fi - -mkdir -p /tmp/kind-cluster-logs -exec > >(tee -a /tmp/kind-cluster-logs/cluster-creation.log) -exec 2>&1 - -WORK_DIR="${HOME}/operator-kind-setup" -if [ ! -d "${WORK_DIR}" ]; then - echo "[ERROR] Operator directory not found: ${WORK_DIR}" - exit 1 -fi - -cd "${WORK_DIR}" - -if [ -f "/etc/profile.d/go.sh" ]; then - source "/etc/profile.d/go.sh" -fi -if [ -f "/etc/profile.d/rust.sh" ]; then - source "/etc/profile.d/rust.sh" -fi - -export IP="192.168.122.1" -export RUNTIME="${CONTAINER_RUNTIME}" - -echo "[INFO] Environment configured:" -echo " IP=${IP}" -echo " RUNTIME=${RUNTIME}" - -echo "[INFO] Cleaning up any existing Kind resources before creation..." -echo "[INFO] This ensures a clean state even if previous deprovision failed" - -# Force delete any existing Kind cluster -if command -v kind &> /dev/null; then - kind delete cluster --name "${KIND_CLUSTER_NAME}" 2>&1 || echo "[INFO] No existing cluster to delete" -fi - -# Force remove Kind containers by name (handles cases where cluster delete failed) -if command -v docker &> /dev/null; then - docker rm -f kind-control-plane 2>&1 || echo "[INFO] No kind-control-plane container" - docker rm -f kind-registry 2>&1 || echo "[INFO] No kind-registry container" - - # Remove any other containers with "kind" in the name - docker ps -aq --filter "name=kind" | xargs -r docker rm -f 2>&1 || echo "[INFO] No other kind-named containers" - - # Remove Kind network (critical - prevents "network already exists" errors) - docker network rm kind 2>&1 || echo "[INFO] No kind network to remove" -fi - -echo "[SUCCESS] Pre-creation cleanup completed" - -echo "[INFO] Executing: make cluster-up RUNTIME=${RUNTIME}" -if ! make cluster-up RUNTIME="${RUNTIME}"; then - echo "[ERROR] 'make cluster-up' failed" - exit 1 -fi - -echo "[SUCCESS] Kind cluster created successfully" - -echo "[INFO] Verifying cluster access..." -export KUBECONFIG="${HOME}/.kube/config" - -if ! kubectl cluster-info; then - echo "[ERROR] Cannot access cluster" - exit 1 -fi - -echo "[INFO] Checking node status..." -kubectl get nodes -o wide - -echo "[SUCCESS] Cluster is ready and accessible" - -EOF -then - log_error "Cluster creation failed or timed out" - CRITICAL_FAILURE=true - DEPLOYMENT_STATUS=1 -fi - -if $CRITICAL_FAILURE; then - log_error "Critical failure during cluster creation" - - mkdir -p "${ARTIFACT_DIR}/kind-cluster-logs" - scp "${SSHOPTS[@]}" \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/kind-cluster-logs/*.log" \ - "${ARTIFACT_DIR}/kind-cluster-logs/" 2>&1 || log_warn "Failed to collect cluster creation logs" - - exit ${DEPLOYMENT_STATUS} -fi - -log_success "Kind cluster created successfully on Beaker machine" - -# Retrieve Kubeconfig from Beaker Machine -progress "Retrieving kubeconfig from Beaker machine" - -log_info "Copying kubeconfig from ${BEAKER_USER}@${BEAKER_IP}..." - -if ! scp "${SSHOPTS[@]}" \ - "${BEAKER_USER}@${BEAKER_IP}:.kube/config" \ - "${SHARED_DIR}/kubeconfig"; then - log_error "Failed to retrieve kubeconfig from Beaker machine" - CRITICAL_FAILURE=true - DEPLOYMENT_STATUS=1 - exit ${DEPLOYMENT_STATUS} -fi - -log_success "Kubeconfig saved to ${SHARED_DIR}/kubeconfig" - -log_info "Note: Cluster is only accessible from the Beaker machine, not from CI pod" -log_info "The kubeconfig is saved for use by subsequent steps running on Beaker machine" - -# Collect Cluster Logs -progress "Collecting cluster creation logs" - -mkdir -p "${ARTIFACT_DIR}/kind-cluster-logs" - -scp "${SSHOPTS[@]}" \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/kind-cluster-logs/*.log" \ - "${ARTIFACT_DIR}/kind-cluster-logs/" 2>&1 || log_warn "Failed to collect some logs" - -log_success "Logs collected to ${ARTIFACT_DIR}/kind-cluster-logs/" -log_info "Cluster resources will be collected by subsequent steps running on Beaker machine" - -# Final Status -echo "" -echo "==========================================" -echo "Kind Cluster Creation - Completed Successfully" -echo "==========================================" -echo "Cluster Name: ${KIND_CLUSTER_NAME}" -echo "Beaker Machine: ${BEAKER_IP}" -echo "Container Runtime: ${CONTAINER_RUNTIME}" -echo "" -echo "Kubeconfig: ${SHARED_DIR}/kubeconfig" -echo "==========================================" -date diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/trusted-execution-clusters-ref-operator-kind-cluster-create-ref.metadata.json b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/trusted-execution-clusters-ref-operator-kind-cluster-create-ref.metadata.json deleted file mode 100644 index d6e313eefdccb..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/trusted-execution-clusters-ref-operator-kind-cluster-create-ref.metadata.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "path": "trusted-execution-clusters/ref/operator/kind-cluster-create/trusted-execution-clusters-ref-operator-kind-cluster-create-ref.yaml", - "owners": { - "approvers": [ - "alicefr", - "Jakob-Naucke", - "fangge1212", - "yalzhang" - ], - "reviewers": [ - "alicefr", - "yalzhang", - "Jakob-Naucke", - "fangge1212" - ] - } -} \ No newline at end of file diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/trusted-execution-clusters-ref-operator-kind-cluster-create-ref.yaml b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/trusted-execution-clusters-ref-operator-kind-cluster-create-ref.yaml deleted file mode 100644 index 55a84d3bb3cbb..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/kind-cluster-create/trusted-execution-clusters-ref-operator-kind-cluster-create-ref.yaml +++ /dev/null @@ -1,69 +0,0 @@ -ref: - as: trusted-execution-clusters-ref-operator-kind-cluster-create - from_image: - namespace: ci - name: telco-runner - tag: latest - commands: trusted-execution-clusters-ref-operator-kind-cluster-create-commands.sh - credentials: - - namespace: test-credentials - name: beaker-bm - mount_path: /var/run/beaker-bm - resources: - requests: - cpu: 500m - memory: 500Mi - limits: - memory: 1Gi - env: - - name: BEAKER_IP - default: "" - documentation: |- - IP address of the Beaker provisioned bare metal machine. - If not set, will read from Vault or ${SHARED_DIR}/beaker_ip - - name: BEAKER_USER - default: "root" - documentation: |- - Username for SSH access to the Beaker machine. - If not set, will read from Vault or default to 'root' - - name: KIND_CLUSTER_NAME - default: "kind" - documentation: |- - Name of the Kind cluster to create. - Defaults to 'kind' if not specified. - - name: CONTAINER_RUNTIME - default: "docker" - documentation: |- - Container runtime to use for Kind cluster (docker only). - Defaults to 'docker' if not specified. - - name: CLUSTER_CREATE_TIMEOUT - default: "900" - documentation: |- - Timeout in seconds for the kind cluster creation process. - Defaults to 900 seconds (15 minutes). - documentation: |- - This step creates a Kind (Kubernetes in Docker) cluster on a pre-provisioned - Beaker bare metal machine that has been prepared by the beaker-kind-provision step. - - The step performs the following operations: - 1. Connects to the Beaker machine via SSH using credentials from Vault - 2. Navigates to the operator repository directory - 3. Executes 'make cluster-up' to create the Kind cluster on the Beaker machine - 4. Retrieves the kubeconfig and saves it to ${SHARED_DIR}/kubeconfig - 5. Collects cluster creation logs from the Beaker machine - - Prerequisites: - - The beaker-kind-provision step must have completed successfully - - Beaker machine must have operator repository downloaded - - kind configuration must be ready at ~/operator-kind-setup/kind/config.yaml - - SSH credentials must be available in Vault - - Outputs: - - ${SHARED_DIR}/kubeconfig: Kubernetes configuration file for accessing the cluster - - ${ARTIFACT_DIR}/kind-cluster-logs/: Cluster creation logs and diagnostics - - Important Notes: - - The Kind cluster is configured to listen on the Beaker machine's external IP address - - The cluster is NOT directly accessible from the CI pod due to network isolation - - The kubeconfig is saved for use by subsequent steps that run commands on the Beaker machine via SSH - - Cluster verification (kubectl cluster-info) is performed on the Beaker machine, not from the CI pod diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/OWNERS b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/OWNERS deleted file mode 100644 index 79b41a3630fd0..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -reviewers: - - alicefr - - yalzhang - - Jakob-Naucke - - fangge1212 -approvers: - - alicefr - - Jakob-Naucke - - fangge1212 - - yalzhang diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/trusted-execution-clusters-ref-operator-test-commands.sh b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/trusted-execution-clusters-ref-operator-test-commands.sh deleted file mode 100755 index 165c83e5feb7a..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/trusted-execution-clusters-ref-operator-test-commands.sh +++ /dev/null @@ -1,715 +0,0 @@ -#!/bin/bash - -set -o nounset -set -o pipefail - -if [ -z "${SHARED_DIR:-}" ]; then - echo "[ERROR] SHARED_DIR is not set. This script must run in Prow CI environment." - exit 1 -fi - -if [ -z "${ARTIFACT_DIR:-}" ]; then - echo "[ERROR] ARTIFACT_DIR is not set. This script must run in Prow CI environment." - exit 1 -fi - -echo "==========================================" -echo "Operator Integration Tests - Starting" -echo "==========================================" -date - -# Prow CI User Environment Setup -if ! whoami &> /dev/null; then - if [[ -w /etc/passwd ]]; then - echo "[INFO] Creating user entry for UID $(id -u) in /etc/passwd" - echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd - else - echo "[WARN] Cannot write to /etc/passwd, SSH may encounter issues" - fi -fi - -if whoami &> /dev/null; then - echo "[INFO] Current user: $(whoami) (UID: $(id -u))" -fi - -# Global Variables -DEPLOYMENT_STATUS=0 -CRITICAL_FAILURE=false - -POD_READY_TIMEOUT="${POD_READY_TIMEOUT:-900}" - -OPERATOR_REPO="${OPERATOR_REPO:-https://github.com/trusted-execution-clusters/operator.git}" -OPERATOR_BRANCH="${OPERATOR_BRANCH:-main}" - -OPERATOR_NAMESPACE="${OPERATOR_NAMESPACE:-}" - -TOTAL_STEPS=7 -CURRENT_STEP=0 - -# Helper Functions -progress() { - CURRENT_STEP=$((CURRENT_STEP + 1)) - echo "" - echo "==========================================" - echo "Step ${CURRENT_STEP}/${TOTAL_STEPS}: $1" - echo "==========================================" -} - -log_info() { - echo "[INFO] $1" -} - -log_warn() { - echo "[WARN] $1" -} - -log_error() { - echo "[ERROR] $1" -} - -log_success() { - echo "[SUCCESS] $1" -} - -# Read Configuration from Previous Step -progress "Reading configuration from previous step" - -if [ ! -f "${SHARED_DIR}/beaker_info" ]; then - log_error "beaker_info not found. The beaker-kind-provision step must run first." - exit 1 -fi - -source "${SHARED_DIR}/beaker_info" - -log_info "Beaker machine: ${BEAKER_IP}" -log_info "Beaker user: ${BEAKER_USER}" -log_info "Container runtime: ${CONTAINER_RUNTIME}" -log_info "Operator repository: ${OPERATOR_REPO}" -log_info "Operator branch: ${OPERATOR_BRANCH}" - -# SSH Key Setup -progress "Setting up SSH key" - -SSH_PKEY_PATH_VAULT="/var/run/beaker-bm/beaker-ssh-private-key" - -if [ -f "${SSH_PKEY_PATH_VAULT}" ]; then - SSH_PKEY_PATH="${SSH_PKEY_PATH_VAULT}" - log_info "Using SSH key from Vault: ${SSH_PKEY_PATH_VAULT}" -elif [ -n "${CLUSTER_PROFILE_DIR:-}" ] && [ -f "${CLUSTER_PROFILE_DIR}/ssh-key" ]; then - SSH_PKEY_PATH="${CLUSTER_PROFILE_DIR}/ssh-key" - log_info "Using SSH key from CLUSTER_PROFILE_DIR: ${CLUSTER_PROFILE_DIR}/ssh-key" -else - log_error "SSH key not found" - exit 1 -fi - -SSH_PKEY="${HOME}/.ssh/beaker_key" -mkdir -p "${HOME}/.ssh" -cp "${SSH_PKEY_PATH}" "${SSH_PKEY}" -chmod 600 "${SSH_PKEY}" -log_info "SSH private key configured at ${SSH_PKEY}" - -# SSH Options Configuration -progress "Configuring SSH connection" - -SSHOPTS=( - -o 'ConnectTimeout=120' - -o 'StrictHostKeyChecking=no' - -o 'UserKnownHostsFile=/dev/null' - -o 'ServerAliveInterval=30' - -o 'ServerAliveCountMax=5' - -o 'LogLevel=ERROR' - -i "${SSH_PKEY}" -) - -log_info "SSH connection timeout set to 120 seconds" - -# SSH Connection Test with Retry -progress "Establishing SSH connection to Beaker machine" - -log_info "Testing SSH connection to ${BEAKER_USER}@${BEAKER_IP}..." - -MAX_SSH_ATTEMPTS=15 -BASE_RETRY_DELAY=5 - -for attempt in $(seq 1 $MAX_SSH_ATTEMPTS); do - RETRY_DELAY=$((BASE_RETRY_DELAY * attempt / 3)) - [ $RETRY_DELAY -gt 30 ] && RETRY_DELAY=30 - - if ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" "echo 'SSH test successful'; hostname; uptime"; then - log_success "SSH connection established after ${attempt} attempt(s)" - break - else - if [[ $attempt -eq $MAX_SSH_ATTEMPTS ]]; then - log_error "Failed to establish SSH connection after ${MAX_SSH_ATTEMPTS} attempts" - CRITICAL_FAILURE=true - DEPLOYMENT_STATUS=1 - exit 1 - fi - log_warn "SSH connection failed, attempt ${attempt}/${MAX_SSH_ATTEMPTS}. Retrying in ${RETRY_DELAY} seconds..." - sleep $RETRY_DELAY - fi -done - -# Transfer PR Source Code to Beaker Machine -progress "Transferring PR code to Beaker machine" - -# Determine if this is a real operator PR or a rehearsal -# In rehearsals, PULL_NUMBER is set but refers to the release repo PR, not operator PR -REPO_OWNER="${REPO_OWNER:-}" -REPO_NAME="${REPO_NAME:-}" - -log_info "Job context: REPO_OWNER=${REPO_OWNER}, REPO_NAME=${REPO_NAME}, PULL_NUMBER=${PULL_NUMBER:-unset}" - -# Check if this is a presubmit for the actual operator repo -if [ -n "${PULL_NUMBER:-}" ] && [ "${REPO_OWNER}" = "trusted-execution-clusters" ] && [ "${REPO_NAME}" = "operator" ]; then - # This is a real operator PR - transfer PR code from test pod to Beaker - log_info "Detected operator repo presubmit - PR #${PULL_NUMBER}" - log_info "Transferring PR code from test pod to Beaker machine" - - # In OpenShift CI, the PR code is already checked out in the test pod - # Standard location: /go/src/github.com// - PR_CODE_PATH="/go/src/github.com/${REPO_OWNER}/${REPO_NAME}" - - log_info "PR code location in test pod: ${PR_CODE_PATH}" - - if [ ! -d "${PR_CODE_PATH}" ]; then - log_error "PR code not found at ${PR_CODE_PATH}" - log_info "Checking current directory as fallback..." - if [ -f "Makefile" ] && [ -f "Cargo.toml" ]; then - PR_CODE_PATH="$(pwd)" - log_info "Found operator code in current directory: ${PR_CODE_PATH}" - else - log_error "Could not locate operator repository code in test pod" - exit 1 - fi - fi - - # Create tarball of PR code - log_info "Creating tarball of PR code..." - PR_TARBALL="/tmp/operator-pr-code.tar.gz" - if ! tar -czf "${PR_TARBALL}" -C "$(dirname ${PR_CODE_PATH})" "$(basename ${PR_CODE_PATH})"; then - log_error "Failed to create tarball of PR code" - exit 1 - fi - - TARBALL_SIZE=$(du -h "${PR_TARBALL}" | cut -f1) - log_success "Tarball created: ${PR_TARBALL} (${TARBALL_SIZE})" - - # Transfer tarball to Beaker machine - log_info "Transferring tarball to Beaker machine..." - if ! scp "${SSHOPTS[@]}" "${PR_TARBALL}" "${BEAKER_USER}@${BEAKER_IP}:/tmp/operator-pr-code.tar.gz"; then - log_error "Failed to transfer PR code tarball to Beaker machine" - exit 1 - fi - - log_success "Tarball transferred successfully" - - # Cleanup tarball from test pod - rm -f "${PR_TARBALL}" - log_info "Cleaned up tarball from test pod" - - # Extract on Beaker machine to separate directory - log_info "Extracting PR code on Beaker machine..." - - EXTRACT_OUTPUT=$(ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" bash -s << 'EXTRACT_EOF' -set -euo pipefail - -# Separate directories: -# - ~/operator-kind-setup: Infrastructure (cluster, customized kind/config.yaml) - from provision step -# - ~/operator-pr-code: PR code to test - extracted from test pod -PR_CODE_DIR="${HOME}/operator-pr-code" - -echo "[INFO] Extracting PR code to ${PR_CODE_DIR}" - -# Remove old directory if exists -rm -rf "${PR_CODE_DIR}" - -# Extract tarball -if ! tar -xzf /tmp/operator-pr-code.tar.gz -C "${HOME}"; then - echo "[ERROR] Failed to extract tarball" - exit 1 -fi - -# Rename extracted directory to operator-pr-code (in case it has a different name) -# Temporarily disable pipefail to avoid SIGPIPE from head closing the pipe early -set +o pipefail -EXTRACTED_DIR=$(tar -tzf /tmp/operator-pr-code.tar.gz | head -1 | cut -d/ -f1) -set -o pipefail -if [ "${EXTRACTED_DIR}" != "operator-pr-code" ] && [ -d "${HOME}/${EXTRACTED_DIR}" ]; then - mv "${HOME}/${EXTRACTED_DIR}" "${PR_CODE_DIR}" -fi - -# Verify extraction -if [ ! -d "${PR_CODE_DIR}" ]; then - echo "[ERROR] PR code directory not found after extraction" - exit 1 -fi - -cd "${PR_CODE_DIR}" - -echo "[SUCCESS] PR code extracted successfully" -echo "" -echo "[INFO] PR code directory: ${PR_CODE_DIR}" -echo "[INFO] Repository structure:" -[ -f "Makefile" ] && echo " ✓ Makefile" || echo " ✗ Makefile missing" -[ -f "Cargo.toml" ] && echo " ✓ Cargo.toml" || echo " ✗ Cargo.toml missing" -[ -d "src" ] && echo " ✓ src/" || echo " ✗ src/ missing" -echo "" - -# Show git info if available -if [ -d ".git" ]; then - echo "[INFO] Git information:" - git log -1 --pretty=format:" Commit: %h - %s%n Author: %an%n Date: %ad%n" --date=short 2>/dev/null || echo " (git info unavailable)" - echo "" -fi - -# Cleanup tarball after successful extraction -echo "[INFO] Cleaning up tarball..." -rm -f /tmp/operator-pr-code.tar.gz - -echo "==========================================" -echo "PR Code Ready for Testing" -echo "==========================================" -echo "Infrastructure: ~/operator-kind-setup (cluster, config)" -echo "Test code: ~/operator-pr-code (PR code)" -echo "==========================================" - -exit 0 -EXTRACT_EOF - ) - - EXTRACT_STATUS=$? - echo "${EXTRACT_OUTPUT}" - - if [ ${EXTRACT_STATUS} -eq 0 ]; then - log_success "PR code ready on Beaker machine at ~/operator-pr-code" - TEST_DIR="operator-pr-code" - else - log_error "Failed to extract PR code on Beaker machine" - exit 1 - fi -else - # Not an operator PR - this is a rehearsal, periodic, or postsubmit - if [ -n "${PULL_NUMBER:-}" ]; then - log_info "PULL_NUMBER is set to ${PULL_NUMBER}, but REPO is ${REPO_OWNER}/${REPO_NAME}" - log_info "This is a rehearsal (testing release repo PR against operator main branch)" - else - log_info "PULL_NUMBER not set - periodic or postsubmit job" - fi - - log_info "Will use existing code on Beaker machine (main branch from provision step)" - log_info "Skipping PR checkout - proceeding with main branch testing" - - # Show what code will be tested - CURRENT_CODE=$(ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" \ - "cd ~/operator-kind-setup && git log -1 --pretty=format:'%h - %s (%an, %ad)' --date=short 2>/dev/null || echo 'Git info unavailable'") - - log_info "==========================================" - log_info "CODE TO BE TESTED (main branch):" - log_info " ${CURRENT_CODE}" - log_info "==========================================" - - TEST_DIR="operator-kind-setup" -fi # End of operator PR check - -# Install Operator on Beaker Machine -progress "Running operator integration tests on Beaker machine" - -log_info "Executing operator integration tests on Beaker machine..." -log_info "Test directory: ${TEST_DIR}" - -if ! ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" bash -s -- \ - "${CONTAINER_RUNTIME}" "${POD_READY_TIMEOUT}" "${TEST_DIR}" << 'EOF' - -set -euo pipefail -set -x - -CONTAINER_RUNTIME="$1" -POD_READY_TIMEOUT="$2" -TEST_DIR="$3" - -echo "==========================================" -echo "Running on Beaker machine: $(hostname)" -echo "Date: $(date)" -echo "==========================================" - -mkdir -p /tmp/operator-install-logs -exec > >(tee -a /tmp/operator-install-logs/installation.log) -exec 2>&1 - -if [ -f "/etc/profile.d/go.sh" ]; then - source "/etc/profile.d/go.sh" -fi -if [ -f "/etc/profile.d/rust.sh" ]; then - source "/etc/profile.d/rust.sh" -fi - -if [ "$(id -u)" -eq 0 ]; then - SUDO="" -else - SUDO="sudo" -fi - -# Use Test Directory (either operator-pr-code for PRs or operator-kind-setup for main) -WORK_DIR="${HOME}/${TEST_DIR}" - -echo "[INFO] Using test code from: ${WORK_DIR}" - -if [ ! -d "${WORK_DIR}" ]; then - echo "[ERROR] Test directory not found at ${WORK_DIR}" - exit 1 -fi - -cd "${WORK_DIR}" - -echo "[SUCCESS] Using operator code at ${WORK_DIR}" -echo "[INFO] Working directory contents:" -# Temporarily disable pipefail to avoid SIGPIPE from head closing the pipe early -set +o pipefail -ls -la | head -15 -set -o pipefail - -# Verify Local Registry -echo "[INFO] Verifying local registry is accessible..." - -REG_PORT="5000" -if curl -s http://localhost:${REG_PORT}/v2/_catalog >/dev/null 2>&1; then - echo "[SUCCESS] Registry is accessible at localhost:${REG_PORT}" -else - echo "[ERROR] Registry is not accessible at localhost:${REG_PORT}" - exit 1 -fi - -# Deploy Operator and Run Integration Tests -export RUNTIME="${CONTAINER_RUNTIME}" -export CONTAINER_CLI="${CONTAINER_RUNTIME}" -export REGISTRY=localhost:5000/trusted-execution-clusters - -export GOPROXY="${GOPROXY:-https://proxy.golang.org,direct}" -export GOSUMDB="${GOSUMDB:-sum.golang.org}" - -echo "[INFO] Environment configured:" -echo " CONTAINER_CLI=${CONTAINER_CLI}" -echo " RUNTIME=${RUNTIME}" -echo " REGISTRY=${REGISTRY}" - -echo "[INFO] Verifying Kind cluster is running..." -if ! kubectl cluster-info; then - echo "[ERROR] Kind cluster is not accessible" - echo "[INFO] Attempting to create cluster..." - if ! make cluster-up; then - echo "[ERROR] 'make cluster-up' failed" - exit 1 - fi -fi - -echo "[SUCCESS] Kind cluster is ready" - -echo "[INFO] Building and pushing container images to ${REGISTRY}..." -if ! make push; then - echo "[ERROR] 'make push' failed" - exit 1 -fi - -echo "[SUCCESS] Images built and pushed" - -echo "[INFO] Installing KubeVirt..." -if ! make install-kubevirt; then - echo "[ERROR] 'make install-kubevirt' failed" - exit 1 -fi - -echo "[SUCCESS] KubeVirt installed" - -echo "[INFO] Installing virtctl CLI..." -KUBEVIRT_VERSION=$(kubectl get kubevirt.kubevirt.io/kubevirt -n kubevirt -o jsonpath="{.status.observedKubeVirtVersion}" 2>/dev/null || echo "v1.1.1") -echo "[INFO] Detected KubeVirt version: ${KUBEVIRT_VERSION}" - -VIRTCTL_URL="https://github.com/kubevirt/kubevirt/releases/download/${KUBEVIRT_VERSION}/virtctl-${KUBEVIRT_VERSION}-linux-amd64" -if ! curl -L -o /tmp/virtctl "${VIRTCTL_URL}"; then - echo "[ERROR] Failed to download virtctl from ${VIRTCTL_URL}" - exit 1 -fi - -chmod +x /tmp/virtctl -${SUDO} mv /tmp/virtctl /usr/local/bin/virtctl - -if virtctl version --client; then - echo "[SUCCESS] virtctl installed successfully" -else - echo "[ERROR] virtctl installation verification failed" - exit 1 -fi - -echo "[INFO] Installing additional dependencies for Rust build..." -if ! sudo dnf install -y gcc-c++ openssl-devel pkg-config; then - echo "[ERROR] Failed to install build dependencies" - exit 1 -fi - -echo "[SUCCESS] Build dependencies installed" - -echo "[INFO] Generating CRDs for Rust..." -if ! make crds-rs; then - echo "[ERROR] 'make crds-rs' failed" - exit 1 -fi - -echo "[SUCCESS] Rust CRDs generated" - -echo "[INFO] Setting up SSH agent for integration tests..." - -# Kill all existing ssh-agents to ensure clean state (important for CI) -# This prevents inheriting broken agents from previous failed jobs -echo "[INFO] Cleaning up any existing ssh-agent processes..." -${SUDO} pkill -u $(whoami) ssh-agent 2>/dev/null || echo "[INFO] No existing ssh-agent processes found" - -# Wait for processes to terminate -sleep 1 - -# Start fresh ssh-agent -echo "[INFO] Starting new ssh-agent..." -eval "$(ssh-agent -s)" - -# Verify ssh-agent started successfully -if [ -z "${SSH_AGENT_PID:-}" ]; then - echo "[ERROR] Failed to start ssh-agent (SSH_AGENT_PID not set)" - exit 1 -fi - -echo "[SUCCESS] SSH agent started (PID: ${SSH_AGENT_PID})" - -# Add default SSH keys to agent -echo "[INFO] Adding SSH keys to agent..." -if ssh-add &1; then - echo "[SUCCESS] SSH keys added successfully" - echo "[INFO] Loaded keys:" - # Temporarily disable pipefail to avoid SIGPIPE from head closing the pipe early - set +o pipefail - ssh-add -l 2>&1 | head -3 | sed 's/^/ /' - set -o pipefail -else - # ssh-add might fail if no default keys exist, which is OK - # The agent will still work for SSH connections using key files directly - echo "[WARN] ssh-add had no default keys to add (this may be expected)" -fi - -# Final verification -echo "[INFO] SSH_AUTH_SOCK: ${SSH_AUTH_SOCK}" -echo "[INFO] SSH agent ready for integration tests" - -echo "[INFO] Pre-loading test images to Kind node (non-critical optimization step)..." -# IMPORTANT: This entire section is non-critical and must not fail the test -# Load the images using 'docker exec kind-control-plane crictl pull ' as a workaround, -# since loading image and using them as image volumes doesn't work. See https://github.com/kubernetes-sigs/kind/issues/4099 - -# Extract test images from Makefile -TEST_IMAGE="" -APPROVED_IMAGE="" - -if [ -f "Makefile" ]; then - TEST_IMAGE=$(grep -oP '^TEST_IMAGE\s*\?=\s*\K.*' Makefile 2>/dev/null | tr -d '[:space:]' || true) - if [ -n "${TEST_IMAGE}" ]; then - echo "[SUCCESS] Found TEST_IMAGE from Makefile: ${TEST_IMAGE}" - else - echo "[WARN] Could not extract TEST_IMAGE from Makefile" - fi - - APPROVED_IMAGE=$(grep -oP '^APPROVED_IMAGE\s*\?=\s*\K.*' Makefile 2>/dev/null | tr -d '[:space:]' || true) - if [ -n "${APPROVED_IMAGE}" ]; then - echo "[SUCCESS] Found APPROVED_IMAGE from Makefile: ${APPROVED_IMAGE}" - else - echo "[WARN] Could not extract APPROVED_IMAGE from Makefile" - fi -else - echo "[WARN] Makefile not found" -fi - -if [ -n "${TEST_IMAGE}" ]; then - echo "[INFO] Pre-loading TEST_IMAGE to Kind node..." - if docker exec kind-control-plane crictl pull "${TEST_IMAGE}" 2>/dev/null || true; then - echo "[SUCCESS] TEST_IMAGE pre-loaded: ${TEST_IMAGE}" - else - echo "[WARN] Failed to pre-load TEST_IMAGE (non-critical, test will pull it later)" - fi -fi - -if [ -n "${APPROVED_IMAGE}" ]; then - echo "[INFO] Pre-loading APPROVED_IMAGE to Kind node..." - if docker exec kind-control-plane crictl pull "${APPROVED_IMAGE}" 2>/dev/null || true; then - echo "[SUCCESS] APPROVED_IMAGE pre-loaded: ${APPROVED_IMAGE}" - else - echo "[WARN] Failed to pre-load APPROVED_IMAGE (non-critical, test will pull it later)" - fi -fi - -echo "[INFO] Image pre-loading completed (failures here do not affect test outcome)" - -echo "[INFO] Running integration tests..." -TEST_EXIT_CODE=0 -make integration-tests || TEST_EXIT_CODE=$? - -# ============================================================ -# Collect diagnostics from remaining test namespaces (failed tests) -# Passed tests cleanup their namespaces, so only failed test namespaces remain -# ============================================================ -echo "[INFO] Checking for remaining test namespaces (failed tests)..." - -TEST_NAMESPACES=$(kubectl get namespaces -o name 2>/dev/null | grep "namespace/test-" | cut -d/ -f2 || true) - -if [ -n "${TEST_NAMESPACES}" ]; then - NAMESPACE_COUNT=$(echo "${TEST_NAMESPACES}" | wc -l) - echo "[INFO] Found ${NAMESPACE_COUNT} test namespace(s) - these tests failed:" - echo "${TEST_NAMESPACES}" | sed 's/^/ - /' - - echo "[INFO] Collecting diagnostics using must-gather..." - MUST_GATHER_DIR="/tmp/must-gather-failed-tests-$(date +%Y%m%d-%H%M%S)" - mkdir -p "${MUST_GATHER_DIR}" - - if [ -f "./must-gather/gather" ]; then - # Run the existing gather script - it will collect from all namespaces - # Since only failed test namespaces remain, it will only collect those - echo "[INFO] Running must-gather/gather script..." - KUBECTL=kubectl COLLECTION_PATH="${MUST_GATHER_DIR}" ./must-gather/gather 2>&1 | tee "${MUST_GATHER_DIR}/gather.log" - - if [ $? -eq 0 ]; then - echo "[SUCCESS] Diagnostics collected to ${MUST_GATHER_DIR}" - - # Show what was collected - echo "[INFO] Collected diagnostics:" - # Temporarily disable pipefail to avoid SIGPIPE from head closing the pipe early - set +o pipefail - find "${MUST_GATHER_DIR}" -type f 2>/dev/null | sed 's/^/ /' | head -20 - set -o pipefail - FILE_COUNT=$(find "${MUST_GATHER_DIR}" -type f 2>/dev/null | wc -l) - [ ${FILE_COUNT} -gt 20 ] && echo " ... and $((FILE_COUNT - 20)) more files" - else - echo "[WARN] must-gather encountered some errors (check ${MUST_GATHER_DIR}/gather.log)" - fi - else - echo "[WARN] must-gather/gather script not found, using fallback kubectl collection..." - - # Fallback: simple kubectl collection - for ns in ${TEST_NAMESPACES}; do - echo "[INFO] Collecting diagnostics from namespace: ${ns}" - kubectl get all,trustedexecutioncluster,approvedimage,machine,virtualmachine,virtualmachineinstance \ - -n "${ns}" -o yaml > "${MUST_GATHER_DIR}/resources-${ns}.yaml" 2>&1 || true - kubectl get events -n "${ns}" --sort-by='.lastTimestamp' \ - > "${MUST_GATHER_DIR}/events-${ns}.txt" 2>&1 || true - kubectl logs -n "${ns}" --all-containers=true --prefix=true --tail=-1 \ - > "${MUST_GATHER_DIR}/logs-${ns}.txt" 2>&1 || true - done - echo "[SUCCESS] Basic diagnostics collected to ${MUST_GATHER_DIR}" - fi - - # Summary - echo "" - echo "==========================================" - echo "Diagnostic Collection Summary" - echo "==========================================" - echo "Failed test namespaces: ${NAMESPACE_COUNT}" - echo "Diagnostics location: ${MUST_GATHER_DIR}" - echo "==========================================" - -else - echo "[SUCCESS] No remaining test namespaces - all tests passed!" -fi - -# Exit with test result if tests failed -if [ ${TEST_EXIT_CODE} -ne 0 ]; then - echo "[ERROR] Integration tests failed with exit code ${TEST_EXIT_CODE}" - exit ${TEST_EXIT_CODE} -fi - -echo "[SUCCESS] Integration tests completed successfully" - -echo "[INFO] Collecting test results and cluster state..." -kubectl get all -A > /tmp/operator-install-logs/cluster-all-resources.yaml 2>&1 || true -kubectl get nodes -o wide > /tmp/operator-install-logs/nodes.yaml 2>&1 || true - -# Cleanup ssh-agent we started -if [ -n "${SSH_AGENT_PID:-}" ]; then - echo "[INFO] Cleaning up ssh-agent (PID: ${SSH_AGENT_PID})..." - kill "${SSH_AGENT_PID}" 2>/dev/null || echo "[WARN] Could not kill ssh-agent" -else - echo "[WARN] SSH_AGENT_PID not set, cannot cleanup ssh-agent" -fi - -echo "[SUCCESS] Operator integration tests completed" - -EOF -then - log_error "Operator integration tests failed" - CRITICAL_FAILURE=true - DEPLOYMENT_STATUS=1 -fi - -if $CRITICAL_FAILURE; then - log_error "Critical failure during operator integration tests" - - mkdir -p "${ARTIFACT_DIR}/operator-test-logs" - mkdir -p "${ARTIFACT_DIR}/must-gather" - - # Collect test logs - log_info "Collecting test logs from Beaker machine..." - scp "${SSHOPTS[@]}" \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/operator-install-logs/*" \ - "${ARTIFACT_DIR}/operator-test-logs/" 2>&1 || log_warn "Failed to collect test logs" - - # Collect must-gather diagnostics from failed tests - log_info "Collecting must-gather diagnostics from Beaker machine..." - - if ssh "${SSHOPTS[@]}" "${BEAKER_USER}@${BEAKER_IP}" "ls -d /tmp/must-gather-failed-tests-* 2>/dev/null" | grep -q must-gather; then - scp "${SSHOPTS[@]}" -r \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/must-gather-failed-tests-*" \ - "${ARTIFACT_DIR}/must-gather/" 2>&1 || log_warn "Failed to collect diagnostics" - - log_success "Must-gather diagnostics collected to ${ARTIFACT_DIR}/must-gather/" - - # Show summary of collected diagnostics - log_info "Collected diagnostic directories:" - ls -lh "${ARTIFACT_DIR}/must-gather/" 2>/dev/null | tail -n +2 | while read -r line; do - dir_name=$(echo "$line" | awk '{print $9}') - file_count=$(find "${ARTIFACT_DIR}/must-gather/${dir_name}" -type f 2>/dev/null | wc -l) - log_info " ${dir_name}: ${file_count} files" - done - else - log_info "No must-gather diagnostics found (all tests may have passed before failure)" - fi - - exit ${DEPLOYMENT_STATUS} -fi - -log_success "Operator integration tests completed successfully" - -# Collect Test Results and Logs -progress "Collecting test results and logs" - -mkdir -p "${ARTIFACT_DIR}/operator-test-logs" - -scp "${SSHOPTS[@]}" -r \ - "${BEAKER_USER}@${BEAKER_IP}:/tmp/operator-install-logs/*" \ - "${ARTIFACT_DIR}/operator-test-logs/" 2>&1 || log_warn "Failed to collect some test logs" - -log_success "Test results collected to ${ARTIFACT_DIR}/operator-test-logs/" - -# Final Status -echo "" -echo "==========================================" -echo "Operator Integration Tests - Completed Successfully" -echo "==========================================" -echo "Operator Repository: ${OPERATOR_REPO}" -echo "Operator Branch: ${OPERATOR_BRANCH}" -echo "Beaker Machine: ${BEAKER_IP}" -echo "Container Runtime: ${CONTAINER_RUNTIME}" -echo "" -echo "Integration tests passed:" -echo " - cluster-up: Kind cluster created" -echo " - push: Container images built and pushed" -echo " - install-kubevirt: KubeVirt installed" -echo " - integration-tests: All tests passed" -echo "" -echo "Test results: ${ARTIFACT_DIR}/operator-test-logs/" -echo "==========================================" -date diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/trusted-execution-clusters-ref-operator-test-ref.metadata.json b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/trusted-execution-clusters-ref-operator-test-ref.metadata.json deleted file mode 100644 index c623377a44bde..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/trusted-execution-clusters-ref-operator-test-ref.metadata.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "path": "trusted-execution-clusters/ref/operator/test/trusted-execution-clusters-ref-operator-test-ref.yaml", - "owners": { - "approvers": [ - "alicefr", - "Jakob-Naucke", - "fangge1212", - "yalzhang" - ], - "reviewers": [ - "alicefr", - "yalzhang", - "Jakob-Naucke", - "fangge1212" - ] - } -} \ No newline at end of file diff --git a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/trusted-execution-clusters-ref-operator-test-ref.yaml b/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/trusted-execution-clusters-ref-operator-test-ref.yaml deleted file mode 100644 index eee4318a7a208..0000000000000 --- a/ci-operator/step-registry/trusted-execution-clusters/ref/operator/test/trusted-execution-clusters-ref-operator-test-ref.yaml +++ /dev/null @@ -1,78 +0,0 @@ -ref: - as: trusted-execution-clusters-ref-operator-test - timeout: 3h0m0s - from: src - commands: trusted-execution-clusters-ref-operator-test-commands.sh - credentials: - - namespace: test-credentials - name: beaker-bm - mount_path: /var/run/beaker-bm - resources: - requests: - cpu: 500m - memory: 500Mi - limits: - memory: 1Gi - env: - - name: BEAKER_IP - default: "" - documentation: |- - IP address of the Beaker provisioned bare metal machine. - If not set, will read from Vault or ${SHARED_DIR}/beaker_ip - - name: BEAKER_USER - default: "root" - documentation: |- - Username for SSH access to the Beaker machine. - If not set, will read from Vault or default to 'root' - - name: POD_READY_TIMEOUT - default: "900" - documentation: |- - Timeout in seconds for waiting for all pods to be ready. - Defaults to 900 seconds (15 minutes). - documentation: |- - This step runs the integration tests for the trusted-execution-clusters operator. - - The step performs the following operations: - 1. Connects to the Beaker machine via SSH using credentials from Vault - 2. For presubmit jobs: Transfers PR code from test pod to Beaker machine - - PR code is already checked out in test pod at /go/src/github.com/trusted-execution-clusters/operator - - Creates tarball and transfers to Beaker - - Extracts to ~/operator-pr-code (separate from infrastructure directory) - 3. For periodic/rehearsal jobs: Uses main branch code from ~/operator-kind-setup - 4. Verifies Kind cluster is running (uses cluster from beaker-kind-provision step) - 5. Builds and pushes container images to local registry (localhost:5000/trusted-execution-clusters) - 6. Installs KubeVirt with 'make install-kubevirt' - 7. Runs integration tests with 'make integration-tests' - - Test images are pulled on-demand during test execution - 8. Collects test results and cluster state - - Directory Structure on Beaker: - - ~/operator-kind-setup: Infrastructure directory (Kind cluster, customized kind/config.yaml with BEAKER_IP) - - ~/operator-pr-code: PR code directory (created for presubmit jobs only) - - This two-directory approach keeps infrastructure config separate from test code, avoiding - the need for complex git operations or conflict resolution on the Beaker machine. - - Prerequisites: - - beaker-kind-provision step must have run (creates cluster in ~/operator-kind-setup) - - SSH credentials must be available in Vault - - Local container registry must be running at localhost:5000 - - Outputs: - - ${ARTIFACT_DIR}/operator-test-logs/: Test logs and diagnostics - - ${ARTIFACT_DIR}/operator-test-logs/installation.log: Complete test execution log - - ${ARTIFACT_DIR}/operator-test-logs/cluster-all-resources.yaml: All cluster resources - - ${ARTIFACT_DIR}/operator-test-logs/nodes.yaml: Node information - - After this step completes: - - Integration tests have been executed against PR code (presubmit) or main branch (periodic) - - Test results are collected for analysis - - Cluster state is available for debugging - - Exit Codes: - - 0: Integration tests passed successfully - - 1: Critical failure (SSH issues, cluster creation failed, tests failed) - - Note: OpenShift CI automatically checks out PR code in the test pod for presubmit jobs. - This step transfers that code to Beaker rather than cloning/fetching from GitHub, - ensuring the exact code being tested matches what CI checked out.