diff --git a/discovery/roles/ome_discovery/tasks/generate_discovery_report.yml b/discovery/roles/ome_discovery/tasks/generate_discovery_report.yml index f003ea40fb..686f5fee2c 100644 --- a/discovery/roles/ome_discovery/tasks/generate_discovery_report.yml +++ b/discovery/roles/ome_discovery/tasks/generate_discovery_report.yml @@ -56,10 +56,7 @@ - "" - "3. Update HOSTNAME, FUNCTIONAL_GROUP_NAME, GROUP_NAME as needed." - "" - - "4. Update the following parameter in provision_config.yml:" - - " pxe_mapping_file_path: {{ pxe_mapping_output_file }}" - - "" - - "5. Run:" + - "4. Run:" - " ansible-playbook provision/provision.yml" - "============================================================" @@ -84,12 +81,12 @@ - "" - "3. Update HOSTNAME, FUNCTIONAL_GROUP_NAME, GROUP_NAME as needed." - "" - - "4. Update the following parameter in provision_config.yml:" - - " pxe_mapping_file_path: {{ pxe_mapping_output_file }}" - - "" - - "5. If using BuildStream, manually copy the PXE mapping file to GitLab:" - - " input/pxe_mapping_file.csv" + - "4. If GitLab server is not yet up, copy the generated file to" + - " /opt/omnia/input/project_default/pxe_mapping_file.csv in the omnia_core container" - "" - - "6. Run:" - - " ansible-playbook provision/provision.yml" + - "5. If the GitLab server is up and running, copy the file to" + - " input/pxe_mapping_file.csv in the GitLab project and commit the changes" + - " after building the images using the build pipeline. Committing the PXE mapping" + - " file will automatically trigger the deploy pipeline and deploy the images on" + - " the nodes listed in the newly committed PXE mapping file." 
- "============================================================" diff --git a/omnia.sh b/omnia.sh index 353aa9b5ca..21e6f7ce0b 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1657,6 +1657,123 @@ phase2_approval() { return 0 } +# ═══════════════════════════════════════════════════════════════════════════ +# validate_backup_disk_space: Pre-upgrade disk space validation +# Ensures sufficient space exists before backup creation to prevent partial +# backups due to disk full conditions. +# ═══════════════════════════════════════════════════════════════════════════ +validate_backup_disk_space() { + local backup_base="$1" + local safety_multiplier=2 # Require 2× the estimated backup size + + echo "[INFO] [ORCHESTRATOR] Validating disk space for backup..." + + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Cannot validate disk space: omnia_core container not running" + return 1 + fi + + # Calculate size of data to be backed up (in KB) + local input_size=0 + local openchami_size=0 + local metadata_size=0 + + # Get input directory size + input_size=$(podman exec -u root omnia_core bash -c " + if [ -d '$CONTAINER_INPUT_DIR' ]; then + du -sk '$CONTAINER_INPUT_DIR' 2>/dev/null | cut -f1 + else + echo 0 + fi + " 2>/dev/null || echo 0) + + # Get OpenCHAMI directory size + openchami_size=$(podman exec -u root omnia_core bash -c " + if [ -d '/opt/omnia/openchami' ]; then + du -sk '/opt/omnia/openchami' 2>/dev/null | cut -f1 + else + echo 0 + fi + " 2>/dev/null || echo 0) + + # Get metadata file size (small, but include for completeness) + metadata_size=$(podman exec -u root omnia_core bash -c " + if [ -f '$CONTAINER_METADATA_FILE' ]; then + du -sk '$CONTAINER_METADATA_FILE' 2>/dev/null | cut -f1 + else + echo 0 + fi + " 2>/dev/null || echo 0) + + # Ensure values are numeric + input_size=${input_size:-0} + openchami_size=${openchami_size:-0} + metadata_size=${metadata_size:-0} + + # Calculate total estimated backup size + local 
total_backup_size_kb=$((input_size + openchami_size + metadata_size)) + + # Add buffer for quadlet files from host (~100KB typical) + total_backup_size_kb=$((total_backup_size_kb + 100)) + + # Calculate required space with safety multiplier + local required_space_kb=$((total_backup_size_kb * safety_multiplier)) + + # Convert to human-readable for display + local total_backup_size_mb=$((total_backup_size_kb / 1024)) + local required_space_mb=$((required_space_kb / 1024)) + + echo "[INFO] [ORCHESTRATOR] Estimated backup size: ${total_backup_size_mb}MB" + echo "[INFO] [ORCHESTRATOR] Required space (${safety_multiplier}× safety margin): ${required_space_mb}MB" + + # Get available space on backup destination filesystem + # The backup path is inside the container, which maps to the omnia share + local backup_parent_dir + backup_parent_dir=$(dirname "$backup_base") + + local available_space_kb + available_space_kb=$(podman exec -u root omnia_core bash -c " + # Ensure parent directory exists for df check + mkdir -p '$backup_parent_dir' 2>/dev/null || true + df -k '$backup_parent_dir' 2>/dev/null | tail -1 | awk '{print \$4}' + " 2>/dev/null) + + if [ -z "$available_space_kb" ] || ! 
[[ "$available_space_kb" =~ ^[0-9]+$ ]]; then + echo "[WARN] [ORCHESTRATOR] Could not determine available disk space; proceeding with backup" + return 0 + fi + + local available_space_mb=$((available_space_kb / 1024)) + echo "[INFO] [ORCHESTRATOR] Available space on backup filesystem: ${available_space_mb}MB" + + # Check if sufficient space is available + if [ "$available_space_kb" -lt "$required_space_kb" ]; then + echo "" + echo -e "${RED}═══════════════════════════════════════════════════════════════════════════${NC}" + echo -e "${RED} INSUFFICIENT DISK SPACE FOR BACKUP${NC}" + echo -e "${RED}═══════════════════════════════════════════════════════════════════════════${NC}" + echo "" + echo -e "${RED}ERROR: Not enough disk space to safely create upgrade backup.${NC}" + echo "" + echo -e "${YELLOW}Disk Space Summary:${NC}" + echo -e "${YELLOW} - Estimated backup size: ${total_backup_size_mb}MB${NC}" + echo -e "${YELLOW} - Required space (${safety_multiplier}×): ${required_space_mb}MB${NC}" + echo -e "${YELLOW} - Available space: ${available_space_mb}MB${NC}" + echo -e "${YELLOW} - Shortfall: $((required_space_mb - available_space_mb))MB${NC}" + echo "" + echo -e "${YELLOW}Backup destination: $backup_base${NC}" + echo "" + echo -e "${GREEN}Required Action:${NC}" + echo -e "${GREEN} 1. Free up at least $((required_space_mb - available_space_mb))MB on the Omnia share${NC}" + echo -e "${GREEN} 2. Re-run 'omnia.sh --upgrade' after freeing space${NC}" + echo "" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Disk space validation passed" + return 0 +} + backup_openchami_data() { local backup_base="$1" @@ -1668,11 +1785,14 @@ backup_openchami_data() { return 0 fi - # Create openchami backup directory structure + # Create openchami backup directory structure with secure permissions if ! podman exec -u root omnia_core bash -c " set -e mkdir -p '${backup_base%/}/openchami/openchami_data' + chmod 0700 '${backup_base%/}/openchami' cp -a /opt/omnia/openchami/. 
'${backup_base%/}/openchami/openchami_data/' 2>&1 + find '${backup_base%/}/openchami/openchami_data' -type f -exec chmod 0600 {} \; + find '${backup_base%/}/openchami/openchami_data' -type d -exec chmod 0700 {} \; "; then echo "[WARN] [ORCHESTRATOR] Failed to backup OpenCHAMI data — upgrade will continue" return 0 @@ -1689,19 +1809,33 @@ backup_openchami_data() { if [ -f "/etc/systemd/system/openchami.target" ]; then podman cp "/etc/systemd/system/openchami.target" \ "omnia_core:${backup_base%/}/openchami/openchami.target" >/dev/null 2>&1 || true + podman exec -u root omnia_core chmod 0600 "${backup_base%/}/openchami/openchami.target" 2>/dev/null || true echo "[INFO] [ORCHESTRATOR] openchami.target backed up" fi # Backup quadlet .container files from host (if they exist) if ls /etc/containers/systemd/*.container >/dev/null 2>&1; then - podman exec -u root omnia_core mkdir -p "${backup_base%/}/openchami/quadlets" 2>/dev/null || true + podman exec -u root omnia_core bash -c "mkdir -p '${backup_base%/}/openchami/quadlets' && chmod 0700 '${backup_base%/}/openchami/quadlets'" 2>/dev/null || true for qfile in /etc/containers/systemd/*.container; do podman cp "$qfile" \ "omnia_core:${backup_base%/}/openchami/quadlets/$(basename "$qfile")" >/dev/null 2>&1 || true + podman exec -u root omnia_core chmod 0600 "${backup_base%/}/openchami/quadlets/$(basename "$qfile")" 2>/dev/null || true done echo "[INFO] [ORCHESTRATOR] Quadlet .container files backed up" fi + # Backup quadlet .network files from host (if they exist) + # These define Podman networks that enable DNS resolution between containers + if ls /etc/containers/systemd/*.network >/dev/null 2>&1; then + podman exec -u root omnia_core bash -c "mkdir -p '${backup_base%/}/openchami/quadlets' && chmod 0700 '${backup_base%/}/openchami/quadlets'" 2>/dev/null || true + for nfile in /etc/containers/systemd/*.network; do + podman cp "$nfile" \ + "omnia_core:${backup_base%/}/openchami/quadlets/$(basename "$nfile")" >/dev/null 2>&1 || true + podman
exec -u root omnia_core chmod 0600 "${backup_base%/}/openchami/quadlets/$(basename "$nfile")" 2>/dev/null || true + done + echo "[INFO] [ORCHESTRATOR] Quadlet .network files backed up" + fi + echo "[INFO] [ORCHESTRATOR] OpenCHAMI data backup completed: ${backup_base}/openchami/" return 0 } @@ -1725,13 +1859,17 @@ phase3_backup_creation() { set -e rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs' mkdir -p '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs' + chmod 0700 '${backup_base%/}' '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs' if [ -f '$CONTAINER_INPUT_DIR/default.yml' ]; then cp -a '$CONTAINER_INPUT_DIR/default.yml' '${backup_base%/}/input/' + chmod 0600 '${backup_base%/}/input/default.yml' fi if [ -d '$CONTAINER_INPUT_DIR/project_default' ]; then cp -a '$CONTAINER_INPUT_DIR/project_default' '${backup_base%/}/input/' + find '${backup_base%/}/input/project_default' -type f -exec chmod 0600 {} \; + find '${backup_base%/}/input/project_default' -type d -exec chmod 0700 {} \; fi if [ !
-f '$CONTAINER_METADATA_FILE' ]; then @@ -1739,6 +1877,7 @@ phase3_backup_creation() { exit 1 fi cp -a '$CONTAINER_METADATA_FILE' '${backup_base%/}/metadata/oim_metadata.yml' + chmod 0600 '${backup_base%/}/metadata/oim_metadata.yml' "; then echo "[ERROR] [ORCHESTRATOR] Backup failed; cleaning up partial backup" podman exec -u root omnia_core bash -c "rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'" >/dev/null 2>&1 || true @@ -1751,6 +1890,7 @@ phase3_backup_creation() { podman exec -u root omnia_core bash -c "rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'" >/dev/null 2>&1 || true return 1 fi + podman exec -u root omnia_core chmod 0600 "${backup_base%/}/configs/omnia_core.container" 2>/dev/null || true fi echo "[INFO] [ORCHESTRATOR] Backup created at: $backup_base" @@ -2104,6 +2244,12 @@ upgrade_omnia_core() { exit 1 fi + # Validate disk space before backup creation + if ! validate_backup_disk_space "$backup_base"; then + echo "[ERROR] [ORCHESTRATOR] Upgrade aborted: Insufficient disk space for backup" + exit 1 + fi + if ! phase3_backup_creation "$backup_base"; then echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 3" exit 1 @@ -2302,6 +2448,94 @@ rollback_omnia_core() { echo -e "${RED}ERROR: Omnia core container is not running.${NC}" exit 1 fi + + # ═══════════════════════════════════════════════════════════════════════════ + # SAFETY CHECK: Prevent core container rollback if upgrade.yml was run but + # rollback.yml has not completed successfully inside the container. + # This prevents inconsistent state where core is 2.1 but other components are 2.2. 
+ # ═══════════════════════════════════════════════════════════════════════════ + local upgrade_manifest_path="/opt/omnia/.data/upgrade_manifest.yml" + local rollback_manifest_path="/opt/omnia/.data/rollback_manifest.yml" + + # Check if upgrade_manifest.yml exists (indicates upgrade process was started) + if podman exec -u root omnia_core test -f "$upgrade_manifest_path" 2>/dev/null; then + echo "[INFO] [ROLLBACK] Checking upgrade state before proceeding..." + + # Read component statuses from upgrade_manifest.yml + local component_statuses + component_statuses=$(podman exec -u root omnia_core grep -A20 'component_status:' "$upgrade_manifest_path" 2>/dev/null | grep -E '^\s+\w+:' | head -8) + + # Check if any component has been upgraded (status is not "pending") + local has_upgraded_components=false + if echo "$component_statuses" | grep -qvE ':\s*"?pending"?\s*$'; then + has_upgraded_components=true + fi + + if [ "$has_upgraded_components" = true ]; then + echo "[INFO] [ROLLBACK] Detected upgraded components. Checking rollback.yml completion status..." 
+ + # Components have been upgraded - check if rollback.yml completed successfully + if podman exec -u root omnia_core test -f "$rollback_manifest_path" 2>/dev/null; then + local rollback_status + rollback_status=$(podman exec -u root omnia_core grep '^rollback_status:' "$rollback_manifest_path" 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r"') + + if [ "$rollback_status" != "completed" ]; then + echo "" + echo -e "${RED}═══════════════════════════════════════════════════════════════════════════${NC}" + echo -e "${RED} CORE CONTAINER ROLLBACK BLOCKED${NC}" + echo -e "${RED}═══════════════════════════════════════════════════════════════════════════${NC}" + echo "" + echo -e "${RED}ERROR: Cannot rollback core container at this time.${NC}" + echo "" + echo -e "${YELLOW}Reason: upgrade.yml has upgraded components, but rollback.yml has not${NC}" + echo -e "${YELLOW} completed successfully inside the container.${NC}" + echo "" + echo -e "${YELLOW}Current rollback status: ${rollback_status:-'unknown'}${NC}" + echo "" + echo -e "${YELLOW}Rolling back the core container now would leave your cluster in an${NC}" + echo -e "${YELLOW}inconsistent state where:${NC}" + echo -e "${YELLOW} - Core container: 2.1 (rolled back)${NC}" + echo -e "${YELLOW} - Other components: 2.2 (not rolled back)${NC}" + echo "" + echo -e "${GREEN}Required Action:${NC}" + echo -e "${GREEN} 1. First run rollback.yml inside the container to rollback all components${NC}" + echo -e "${GREEN} 2. Wait for rollback.yml to complete successfully${NC}" + echo -e "${GREEN} 3. Then run 'omnia.sh --rollback' to rollback the core container${NC}" + echo "" + exit 1 + fi + echo "[INFO] [ROLLBACK] Rollback playbook completed successfully. Proceeding with core container rollback." 
+ else + # Rollback manifest doesn't exist but components were upgraded + echo "" + echo -e "${RED}═══════════════════════════════════════════════════════════════════════════${NC}" + echo -e "${RED} CORE CONTAINER ROLLBACK BLOCKED${NC}" + echo -e "${RED}═══════════════════════════════════════════════════════════════════════════${NC}" + echo "" + echo -e "${RED}ERROR: Cannot rollback core container at this time.${NC}" + echo "" + echo -e "${YELLOW}Reason: upgrade.yml has upgraded components, but rollback.yml has not${NC}" + echo -e "${YELLOW} been executed inside the container.${NC}" + echo "" + echo -e "${YELLOW}Rolling back the core container now would leave your cluster in an${NC}" + echo -e "${YELLOW}inconsistent state where:${NC}" + echo -e "${YELLOW} - Core container: 2.1 (rolled back)${NC}" + echo -e "${YELLOW} - Other components: 2.2 (not rolled back)${NC}" + echo "" + echo -e "${GREEN}Required Action:${NC}" + echo -e "${GREEN} 1. First run rollback.yml inside the container to rollback all components${NC}" + echo -e "${GREEN} 2. Wait for rollback.yml to complete successfully${NC}" + echo -e "${GREEN} 3. Then run 'omnia.sh --rollback' to rollback the core container${NC}" + echo "" + exit 1 + fi + else + echo "[INFO] [ROLLBACK] No components upgraded yet. Core container rollback is safe to proceed." + fi + else + echo "[INFO] [ROLLBACK] No upgrade manifest found. Core container rollback is safe to proceed." 
+ fi + # ═══════════════════════════════════════════════════════════════════════════ # Create lock file to prevent concurrent rollbacks local lock_file="/tmp/omnia_rollback.lock" diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 index 158e089805..a1e768d9bf 100644 --- a/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 +++ b/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 @@ -31,18 +31,12 @@ echo "=====================================================" # Detect CUDA major version for DCGM package selection echo "[INFO] Detecting CUDA version for DCGM package compatibility..." # Try to get CUDA version from nvidia-smi -CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1) +CUDA_VERSION=$(nvidia-smi | sed -nE 's/.*CUDA( UMD)? Version: *([0-9]+).*/\2/p') -# Fallback: Try to get CUDA version from nvcc if available if [ -z "$CUDA_VERSION" ]; then - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1) - echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION" - else - echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc." - echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup." + echo "[ERROR] Could not detect CUDA version from nvidia-smi" + echo "[ERROR] CUDA driver is required for DCGM package version detection. Skipping DCGM setup." 
exit 1 - fi else echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION" fi diff --git a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 index b0c3dd8b3c..7d56e91d56 100644 --- a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 @@ -72,7 +72,7 @@ spec: - ip: "127.0.0.1" hostnames: - "mysqldb" - terminationGracePeriodSeconds: 10 + terminationGracePeriodSeconds: 120 tolerations: - effect: NoExecute key: node.kubernetes.io/not-ready diff --git a/rollback/playbooks/rollback_slurm.yml b/rollback/playbooks/rollback_slurm.yml index 14a67eaa85..abec6bbbe6 100644 --- a/rollback/playbooks/rollback_slurm.yml +++ b/rollback/playbooks/rollback_slurm.yml @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- + +# ============================================================================ +# Play 1: Pre-flight — manifest gating, BuildStream terminal gate +# ============================================================================ - name: Rollback Slurm feature updates hosts: localhost connection: local @@ -36,66 +40,83 @@ - name: Read rollback_manifest.yml ansible.builtin.include_vars: file: "{{ rollback_manifest_path }}" - name: rollback_manifest + name: manifest - - name: Skip if slurm already rolled back - ansible.builtin.meta: end_play + - name: Initialize slurm_skip + ansible.builtin.set_fact: + slurm_skip: false + + - name: Set slurm_skip when already completed + ansible.builtin.set_fact: + slurm_skip: true when: - - rollback_manifest.component_status[component_name] | default('pending') == 'completed' + - manifest.component_status[component_name] | default('pending') == 'completed' - name: "Mark as skipped — BuildStream terminal gate active (C-24)" ansible.builtin.copy: content: >- - {{ rollback_manifest | combine({ - 'component_status': rollback_manifest.component_status | combine({ + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ component_name: 'skipped' }) }) | to_nice_yaml }} dest: "{{ rollback_manifest_path }}" mode: '0644' + when: + - not slurm_skip + - hostvars['localhost']['build_stream_terminal'] | default(false) | bool + - manifest.component_status.build_stream | default('pending') == 'completed' + + - name: "Set slurm_skip — BuildStream terminal gate active (C-24)" + ansible.builtin.set_fact: + slurm_skip: true when: - hostvars['localhost']['build_stream_terminal'] | default(false) | bool + - manifest.component_status.build_stream | default('pending') == 'completed' - name: "Skip — BuildStream terminal gate active (C-24)" ansible.builtin.meta: end_play when: - hostvars['localhost']['build_stream_terminal'] | default(false) | bool - - name: Set slurm rollback status to in-progress - ansible.builtin.copy: - 
content: >- - {{ rollback_manifest | combine({ - 'component_status': rollback_manifest.component_status | combine({ - component_name: 'in-progress' - }) - }) | to_nice_yaml }} - dest: "{{ rollback_manifest_path }}" - mode: '0644' + - name: Block when slurm is already completed + when: not slurm_skip + block: + - name: Set slurm rollback status to in-progress + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' - - name: "Display rollback status in-progress — {{ component_name }}" - ansible.builtin.debug: - msg: "[ROLLBACK] Component '{{ component_name }}' — status changed to: in-progress" + - name: "Display rollback status in-progress — {{ component_name }}" + ansible.builtin.debug: + msg: "[ROLLBACK] Component '{{ component_name }}' — status changed to: in-progress" - - name: Check for existing reboot state file - ansible.builtin.stat: - path: /opt/omnia/.data/slurm_rollback_reboot_state.yml - register: _reboot_state_stat + - name: Check for existing reboot state file + ansible.builtin.stat: + path: /opt/omnia/.data/slurm_rollback_reboot_state.yml + register: _reboot_state_stat - - name: Load reboot state from previous run - ansible.builtin.include_vars: - file: /opt/omnia/.data/slurm_rollback_reboot_state.yml - name: _reboot_state - when: _reboot_state_stat.stat.exists | default(false) + - name: Load reboot state from previous run + ansible.builtin.include_vars: + file: /opt/omnia/.data/slurm_rollback_reboot_state.yml + name: _reboot_state + when: _reboot_state_stat.stat.exists | default(false) - - name: Set previously successful reboot list - ansible.builtin.set_fact: - slurm_previously_rebooted: "{{ _reboot_state.successfully_rebooted | default([]) }}" - when: _reboot_state_stat.stat.exists | default(false) + - name: Set previously successful reboot list + 
ansible.builtin.set_fact: + slurm_previously_rebooted: "{{ _reboot_state.successfully_rebooted | default([]) }}" + when: _reboot_state_stat.stat.exists | default(false) - - name: Initialize previously rebooted list (no prior state) - ansible.builtin.set_fact: - slurm_previously_rebooted: [] - when: not (_reboot_state_stat.stat.exists | default(false)) + - name: Initialize previously rebooted list (no prior state) + ansible.builtin.set_fact: + slurm_previously_rebooted: [] + when: not (_reboot_state_stat.stat.exists | default(false)) - name: Create OIM host group for cloud-init/BSS update ansible.builtin.import_playbook: ../../utils/create_container_group.yml @@ -111,8 +132,7 @@ tasks: - name: Skip if slurm upgrade not needed ansible.builtin.meta: end_play - when: - - hostvars['localhost']['slurm_skip'] | default(false) | bool + when: hostvars['localhost']['slurm_skip'] | default(false) | bool - name: Read rollback_manifest.yml ansible.builtin.include_vars: @@ -142,6 +162,7 @@ }) | to_nice_yaml }} dest: "{{ rollback_manifest_path }}" mode: '0644' + delegate_to: localhost - name: End play ansible.builtin.meta: end_play @@ -249,13 +270,13 @@ tasks: - name: Skip if slurm upgrade not needed ansible.builtin.meta: end_play - when: slurm_skip | default(false) | bool + when: hostvars['localhost']['slurm_skip'] | default(false) | bool - name: Initialize state ansible.builtin.set_fact: node_status: hostname: "{{ inventory_hostname }}" - reboot: false # TODO: rename as reboot_failed + reboot: false ssh: false sinfo: false unreachable: false @@ -356,7 +377,7 @@ tasks: - name: Skip if slurm rollback not needed ansible.builtin.meta: end_play - when: slurm_skip | default(false) | bool + when: hostvars['localhost']['slurm_skip'] | default(false) | bool - name: Set slurm nodes from inventory ansible.builtin.set_fact: diff --git a/rollback/roles/rollback_openchami/tasks/restore_quadlets_and_configs.yml b/rollback/roles/rollback_openchami/tasks/restore_quadlets_and_configs.yml index 
5259a56546..092c7b576e 100644 --- a/rollback/roles/rollback_openchami/tasks/restore_quadlets_and_configs.yml +++ b/rollback/roles/rollback_openchami/tasks/restore_quadlets_and_configs.yml @@ -54,7 +54,7 @@ rollback_oim_host_quadlets: >- {{ rollback_oim_host_backup_dir }}/{{ backup_quadlets_subpath }} - - name: List backed-up quadlet files + - name: List backed-up quadlet container files ansible.builtin.find: paths: "{{ rollback_oim_host_quadlets }}" patterns: "*.container" @@ -63,14 +63,43 @@ delegate_facts: true connection: ssh + - name: List backed-up quadlet network files + ansible.builtin.find: + paths: "{{ rollback_oim_host_quadlets }}" + patterns: "*.network" + register: rollback_network_files + delegate_to: oim + delegate_facts: true + connection: ssh + failed_when: false + - name: Display backed-up quadlet files found ansible.builtin.debug: verbosity: 1 msg: >- - Found {{ rollback_quadlet_files.files | length }} quadlet files in backup: - {{ rollback_quadlet_files.files | map(attribute='path') | map('basename') | list }} + Found {{ rollback_quadlet_files.files | length }} container files and + {{ rollback_network_files.files | default([]) | length }} network files in backup: + Containers: {{ rollback_quadlet_files.files | map(attribute='path') | map('basename') | list }} + Networks: {{ rollback_network_files.files | default([]) | map(attribute='path') | map('basename') | list }} + + # Restore network files FIRST - these must exist before containers can use them + - name: Restore each v2.1 network quadlet file to systemd quadlet directory + ansible.builtin.copy: + src: "{{ item.path }}" + dest: "{{ systemd_quadlet_dir }}/{{ item.path | basename }}" + remote_src: true + owner: root + group: root + mode: "{{ file_permissions_644 }}" + loop: "{{ rollback_network_files.files | default([]) }}" + loop_control: + label: "{{ item.path | basename }}" + delegate_to: oim + delegate_facts: true + connection: ssh + when: rollback_network_files.files | default([]) | 
length > 0 - - name: Restore each v2.1 quadlet file to systemd quadlet directory + - name: Restore each v2.1 container quadlet file to systemd quadlet directory ansible.builtin.copy: src: "{{ item.path }}" dest: "{{ systemd_quadlet_dir }}/{{ item.path | basename }}" @@ -216,7 +245,8 @@ ansible.builtin.debug: msg: - "{{ rollback_messages.restore.quadlets_success }}" - - "Quadlet files restored: {{ rollback_quadlet_files.files | length }}" + - "Container quadlet files restored: {{ rollback_quadlet_files.files | length }}" + - "Network quadlet files restored: {{ rollback_network_files.files | default([]) | length }}" - "v2.2-only quadlets removed: {{ rollback_v22_only_quadlets | join(', ') }}" - "openchami.target: restored from backup (references coresmd.service)" - "/etc/openchami: {{ 'restored from backup' if rollback_etc_openchami_backup_stat.stat.exists | default(false) else 'backup NOT found' }}" diff --git a/rollback/roles/rollback_openchami/tasks/start_v21_containers.yml b/rollback/roles/rollback_openchami/tasks/start_v21_containers.yml index 61e8505fcb..250208cb3a 100644 --- a/rollback/roles/rollback_openchami/tasks/start_v21_containers.yml +++ b/rollback/roles/rollback_openchami/tasks/start_v21_containers.yml @@ -30,6 +30,104 @@ - name: Start v2.1 containers block: + # ── Create Podman networks if they don't exist ────────────────────── + # These networks enable DNS resolution between containers. + # Without these networks, containers fail with "Could not resolve host" errors. + # We create them directly with podman as a fallback if network quadlet + # files weren't in the backup or the network services don't exist. 
+ - name: Ensure ochami-internal Podman network exists + ansible.builtin.command: > + podman network create --ignore ochami-internal + register: create_internal_net + changed_when: "'ochami-internal' in create_internal_net.stdout" + failed_when: false + delegate_to: oim + delegate_facts: true + connection: ssh + + - name: Ensure ochami-jwt-internal Podman network exists + ansible.builtin.command: > + podman network create --ignore ochami-jwt-internal + register: create_jwt_net + changed_when: "'ochami-jwt-internal' in create_jwt_net.stdout" + failed_when: false + delegate_to: oim + delegate_facts: true + connection: ssh + + - name: Ensure ochami-cert-internal Podman network exists + ansible.builtin.command: > + podman network create --ignore ochami-cert-internal + register: create_cert_net + changed_when: "'ochami-cert-internal' in create_cert_net.stdout" + failed_when: false + delegate_to: oim + delegate_facts: true + connection: ssh + + - name: Ensure ochami-external Podman network exists + ansible.builtin.command: > + podman network create --ignore ochami-external + register: create_ext_net + changed_when: "'ochami-external' in create_ext_net.stdout" + failed_when: false + delegate_to: oim + delegate_facts: true + connection: ssh + + - name: Display network creation status + ansible.builtin.debug: + verbosity: 1 + msg: >- + Podman networks ensured: ochami-internal, ochami-jwt-internal, + ochami-cert-internal, ochami-external + + # ── Start Podman network services if they exist ───────────────────── + # These systemd services may exist from the openchami RPM installation + - name: Start OpenCHAMI internal network service + ansible.builtin.systemd: + name: openchami-internal-network.service + state: started + enabled: true + delegate_to: oim + delegate_facts: true + connection: ssh + failed_when: false + + - name: Start OpenCHAMI external network service + ansible.builtin.systemd: + name: openchami-external-network.service + state: started + enabled: true + 
delegate_to: oim + delegate_facts: true + connection: ssh + failed_when: false + + - name: Start OpenCHAMI cert internal network service + ansible.builtin.systemd: + name: openchami-cert-internal-network.service + state: started + enabled: true + delegate_to: oim + delegate_facts: true + connection: ssh + failed_when: false + + - name: Start OpenCHAMI JWT internal network service + ansible.builtin.systemd: + name: openchami-jwt-internal-network.service + state: started + enabled: true + delegate_to: oim + delegate_facts: true + connection: ssh + failed_when: false + + - name: Wait for network services to initialize + ansible.builtin.pause: + seconds: 5 + # ── Start all OpenCHAMI services ──────────────────────────────────── - name: Start openchami.target with v2.1 containers ansible.builtin.systemd: diff --git a/upgrade/playbooks/upgrade_slurm.yml b/upgrade/playbooks/upgrade_slurm.yml index 62375da04c..086ab2edcf 100644 --- a/upgrade/playbooks/upgrade_slurm.yml +++ b/upgrade/playbooks/upgrade_slurm.yml @@ -52,10 +52,6 @@ when: - manifest.component_status[component_name] | default('pending') == 'completed' - - name: Set slurm upgrade directory - ansible.builtin.set_fact: - slurm_2_1_backup_dir: "{{ manifest.backup_dir }}" - - name: "Mark as skipped — BuildStream terminal gate active (C-24)" ansible.builtin.copy: content: >- @@ -78,50 +74,50 @@ - hostvars['localhost']['build_stream_terminal'] | default(false) | bool - manifest.component_status.build_stream | default('pending') == 'completed' - - name: Set slurm upgrade status to in-progress - ansible.builtin.copy: - content: >- - {{ manifest | combine({ - 'component_status': manifest.component_status | combine({ - component_name: 'in-progress' - }) - }) | to_nice_yaml }} - dest: "{{ manifest_path }}" - mode: '0644' - when: not slurm_skip + - name: "Skip — BuildStream terminal gate active (C-24)" + ansible.builtin.meta: end_play + when: + - hostvars['localhost']['build_stream_terminal'] | default(false) | bool - - 
name: "Display upgrade status in-progress — {{ component_name }}" - ansible.builtin.debug: - msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: in-progress" + - name: Prepare slurm upgrade state (runs only when slurm upgrade is not skipped) when: not slurm_skip + block: + - name: Set slurm upgrade status to in-progress + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' - - name: Check for existing reboot state file - ansible.builtin.stat: - path: /opt/omnia/.data/slurm_upgrade_reboot_state.yml - register: _reboot_state_stat - when: not slurm_skip + - name: "Display upgrade status in-progress — {{ component_name }}" + ansible.builtin.debug: + msg: "[UPGRADE] Component '{{ component_name }}' — status changed to: in-progress" - - name: Load reboot state from previous run - ansible.builtin.include_vars: - file: /opt/omnia/.data/slurm_upgrade_reboot_state.yml - name: _reboot_state - when: - - not slurm_skip - - _reboot_state_stat.stat.exists | default(false) + - name: Check for existing reboot state file + ansible.builtin.stat: + path: /opt/omnia/.data/slurm_upgrade_reboot_state.yml + register: _reboot_state_stat - - name: Set previously successful reboot list - ansible.builtin.set_fact: - slurm_previously_rebooted: "{{ _reboot_state.successfully_rebooted | default([]) }}" - when: - - not slurm_skip - - _reboot_state_stat.stat.exists | default(false) + - name: Load reboot state from previous run + ansible.builtin.include_vars: + file: /opt/omnia/.data/slurm_upgrade_reboot_state.yml + name: _reboot_state + when: + - _reboot_state_stat.stat.exists | default(false) - - name: Initialize previously rebooted list (no prior state) - ansible.builtin.set_fact: - slurm_previously_rebooted: [] - when: - - not slurm_skip - - not (_reboot_state_stat.stat.exists | default(false)) + - name: Set previously successful reboot
list + ansible.builtin.set_fact: + slurm_previously_rebooted: "{{ _reboot_state.successfully_rebooted | default([]) }}" + when: _reboot_state_stat.stat.exists | default(false) + + - name: Initialize previously rebooted list (no prior state) + ansible.builtin.set_fact: + slurm_previously_rebooted: [] + when: not (_reboot_state_stat.stat.exists | default(false)) # ============================================================================ # Create OIM host group (needed for cloud-init/BSS update on OIM) @@ -181,6 +177,7 @@ }) | to_nice_yaml }} dest: "{{ manifest_path }}" mode: '0644' + delegate_to: localhost - name: End play ansible.builtin.meta: end_play @@ -216,23 +213,19 @@ when: slurm_host_group_map | default({}) | length == 0 - name: SLURM UPGRADE WARNING - ansible.builtin.debug: - msg: "{{ slurm_upgrade_banner }}" - vars: - slurm_upgrade_banner: - - "[UPGRADE] SLURM CLUSTER — PRE-UPGRADE NOTICE" - - "" - - "1. NODE REBOOT — All Slurm/login nodes will reboot. Ensure no critical jobs are running." - - "2. PXE MAPPING — Do not modify Slurm node entries until upgrade completes." - - "3. NFS MOUNTS — Omnia 2.1 mount points are preserved. Do not modify during upgrade." - - "4. VAST STORAGE — Vast storage is not supported during upgrade. Please remove it from omnia_config.yml." - - "5. ROLLBACK SCOPE — New NFS mounts added during upgrade will NOT be retained on rollback." - - "6. POST-UPGRADE — Rollback is NOT recommended once all nodes boot with cloud-init complete." - - - name: Pause to display warning ansible.builtin.pause: seconds: 10 - prompt: "SLURM UPGRADE - Proceeding in 10 seconds..." + prompt: "{{ slurm_upgrade_banner }}" + vars: + slurm_upgrade_banner: | + [UPGRADE] SLURM CLUSTER — PRE-UPGRADE NOTICE + ============================================ + 1. NODE REBOOT — All Slurm/login nodes will reboot. Ensure no critical jobs are running. + 2. PXE MAPPING — Do not modify Slurm node entries until upgrade completes. + 3. 
NFS MOUNTS — Omnia 2.1 mount points are preserved. Do not modify during upgrade. + 4. VAST STORAGE — Vast storage is not supported during upgrade. Please remove it from omnia_config.yml. + 5. ROLLBACK SCOPE — New NFS mounts added during upgrade will NOT be retained on rollback. + 6. POST-UPGRADE — Rollback is NOT recommended once all nodes boot with cloud-init complete. - name: Read oim_metadata for oim_node_name (standalone fallback) ansible.builtin.include_vars: @@ -287,13 +280,13 @@ tasks: - name: Skip if slurm upgrade not needed ansible.builtin.meta: end_play - when: slurm_skip | default(false) | bool + when: hostvars['localhost']['slurm_skip'] | default(false) | bool - name: Initialize state ansible.builtin.set_fact: node_status: hostname: "{{ inventory_hostname }}" - reboot: false # TODO: rename as reboot_failed + reboot: false ssh: false sinfo: false unreachable: false @@ -433,7 +426,7 @@ tasks: - name: Skip if slurm upgrade not needed ansible.builtin.meta: end_play - when: slurm_skip | default(false) | bool + when: hostvars['localhost']['slurm_skip'] | default(false) | bool - name: Set slurm nodes from inventory ansible.builtin.set_fact: diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index a78f0de5d3..5bf72a106d 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -142,6 +142,10 @@ ome_password: "{{ credentials_dict.ome_password | default('') }}" ufm_username: "{{ credentials_dict.ufm_username | default('') }}" ufm_password: "{{ credentials_dict.ufm_password | default('') }}" + vast_username: "{{ credentials_dict.vast_username | default('') }}" + vast_password: "{{ credentials_dict.vast_password | default('') }}" + postgres_user: "{{ credentials_dict.postgres_user | default('') }}" + 
postgres_password: "{{ credentials_dict.postgres_password | default('') }}" no_log: true - name: Write updated content using template diff --git a/upgrade/roles/import_input_parameters/tasks/transform_powerscale_values.yml b/upgrade/roles/import_input_parameters/tasks/transform_powerscale_values.yml index ac2477aecf..d1aeb4e253 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_powerscale_values.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_powerscale_values.yml @@ -121,19 +121,46 @@ | select('ne', '') | first | default('') }} + - name: Fetch PowerScale secret_path from backup omnia_config + ansible.builtin.set_fact: + powerscale_secret_path: >- + {{ backup_omnia_config.service_k8s_cluster + | selectattr('csi_powerscale_driver_secret_file_path', 'defined') + | map(attribute='csi_powerscale_driver_secret_file_path') + | select('ne', '') + | first | default('') }} + - name: Display PowerScale values_path from backup omnia_config ansible.builtin.debug: msg: "PowerScale values_path from backup omnia_config: {{ powerscale_values_path }}" + - name: Display PowerScale secret_path from backup omnia_config + ansible.builtin.debug: + msg: "PowerScale secret_path from backup omnia_config: {{ powerscale_secret_path }}" + - name: Extract values file name from backup omnia_config powerscale_values_path ansible.builtin.set_fact: powerscale_values_filename: "{{ powerscale_values_path | basename | default('values.yaml') }}" when: powerscale_values_path | length > 0 + - name: Extract secret file name from backup omnia_config powerscale_secret_path + ansible.builtin.set_fact: + powerscale_secret_filename: "{{ powerscale_secret_path | basename | default('secret.yaml') }}" + when: powerscale_secret_path | length > 0 + + - name: Set default secret filename if path not configured + ansible.builtin.set_fact: + powerscale_secret_filename: "secret.yaml" + when: powerscale_secret_path | default('') | length == 0 + - name: Display extracted PowerScale values 
file name ansible.builtin.debug: msg: "PowerScale values file name from backup omnia_config: {{ powerscale_values_filename }}" + - name: Display extracted PowerScale secret file name + ansible.builtin.debug: + msg: "PowerScale secret file name from backup omnia_config: {{ powerscale_secret_filename }}" + - name: Build dynamic GitHub URL for target version values.yaml ansible.builtin.set_fact: powerscale_target_values_url: "{{ powerscale_values_github_url_template | replace('{version}', powerscale_v22_version | regex_replace('^v', '')) }}" @@ -170,12 +197,17 @@ msg: "{{ merge_values_result.stderr_lines | default([]) }}" when: merge_values_result.stderr_lines | default([]) | length > 0 - - name: Copy secret.yaml from v2.1 backup + - name: Check if v2.1 secret file exists in backup + ansible.builtin.stat: + path: "{{ backup_location }}/{{ powerscale_secret_filename }}" + register: v21_secret_stat + + - name: Copy secret file from v2.1 backup ansible.builtin.copy: - src: "{{ backup_location }}/secret.yaml" - dest: "{{ input_project_dir }}/secret.yaml" + src: "{{ backup_location }}/{{ powerscale_secret_filename }}" + dest: "{{ input_project_dir }}/{{ powerscale_secret_filename }}" mode: "0600" - when: v21_values_stat.stat.exists + when: v21_secret_stat.stat.exists - name: Display PowerScale values.yaml transformation summary ansible.builtin.debug: diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 index 9ca40114aa..80699f1cc2 100644 --- a/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 +++ b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 @@ -58,3 +58,7 @@ ome_password: "{{ ome_password | default('') }}" # UFM telemetry credentials ufm_username: "{{ ufm_username | default('') }}" ufm_password: "{{ ufm_password | default('') }}" + +# VAST telemetry credentials +vast_username: "{{ 
vast_username | default('') }}" +vast_password: "{{ vast_password | default('') }}" diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 3bac72ac30..4423147336 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -353,13 +353,13 @@ msg_powerscale_v22_version_missing: | Please check the software_config.json file. msg_powerscale_values_transform_summary: | PowerScale CSI driver values.yaml transformed: {{ powerscale_v21_version }} to {{ powerscale_v22_version }}. - Backup preserved at: {{ backup_location }}/values.yaml - Target: {{ input_project_dir }}/values.yaml + Backup preserved at: {{ backup_location }}/{{ powerscale_values_filename | default('values.yaml') }} + Target: {{ input_project_dir }}/{{ powerscale_values_filename | default('values.yaml') }} Changes: - Downloaded {{ powerscale_v22_version }} values.yaml template from GitHub - Preserved v2.1 settings: isiPath, isiAccessZone, controllerCount, custom configurations - Updated to {{ powerscale_v22_version }} structure with new parameters - Secret file copied: {{ input_project_dir }}/secret.yaml + Secret file copied: {{ input_project_dir }}/{{ powerscale_secret_filename | default('secret.yaml') }} # PowerScale GitHub URL template for values.yaml powerscale_values_github_url_template: "https://raw.githubusercontent.com/dell/helm-charts/csi-isilon-{version}/charts/csi-isilon/values.yaml" diff --git a/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml b/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml index d930690682..33b51ae6b6 100644 --- a/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml +++ b/upgrade/roles/upgrade_k8s/tasks/load_version_vars.yml @@ -126,8 +126,3 @@ | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'helm') | map(attribute='package') | join }} - -# ── Set OIM host 
─────────────────────────────────────────────────── -- name: Set oim_host to NFS server IP - ansible.builtin.set_fact: - oim_host: "{{ k8s_nfs_server_ip }}" diff --git a/upgrade/roles/upgrade_openchami/tasks/backup_openchami.yml b/upgrade/roles/upgrade_openchami/tasks/backup_openchami.yml index e646f78066..2e0889e048 100644 --- a/upgrade/roles/upgrade_openchami/tasks/backup_openchami.yml +++ b/upgrade/roles/upgrade_openchami/tasks/backup_openchami.yml @@ -92,13 +92,13 @@ ansible.builtin.file: path: "{{ openchami_backup_dir }}/openchami/postgresql_backup" state: directory - mode: "{{ dir_permissions_755 }}" + mode: "{{ dir_permissions_700 }}" - name: Create PostgreSQL backup directory (OIM host — shared path) ansible.builtin.file: path: "{{ oim_host_backup_dir }}/openchami/postgresql_backup" state: directory - mode: "{{ dir_permissions_755 }}" + mode: "{{ dir_permissions_700 }}" delegate_to: oim delegate_facts: true connection: ssh @@ -181,6 +181,15 @@ delegate_facts: true connection: ssh + - name: Set PostgreSQL backup file permissions to 0600 + ansible.builtin.file: + path: "{{ oim_host_backup_dir }}/openchami/postgresql_backup/openchami.sql" + mode: "0600" + when: pgdump_result.rc == 0 + delegate_to: oim + delegate_facts: true + connection: ssh + - name: Create empty backup marker if pg_dump failed ansible.builtin.copy: content: | @@ -188,7 +197,7 @@ -- Database may be empty (prepare_oim-only scenario) -- stderr: {{ pgdump_result.stderr | default('') | trim }} dest: "{{ openchami_backup_dir }}/openchami/postgresql_backup/openchami.sql" - mode: "{{ file_permissions_644 }}" + mode: "{{ file_permissions_600 }}" when: pgdump_result.rc | default(1) != 0 - name: Display pg_dump warning if it failed @@ -232,7 +241,7 @@ ansible.builtin.file: path: "{{ oim_host_backup_dir }}/{{ backup_etc_openchami_subpath }}" state: directory - mode: "{{ dir_permissions_755 }}" + mode: "{{ dir_permissions_700 }}" delegate_to: oim delegate_facts: true connection: ssh @@ -249,6 +258,8 @@ 
ansible.builtin.shell: | set -o pipefail cp -a {{ openchami_etc_dir }}/. {{ oim_host_backup_dir }}/{{ backup_etc_openchami_subpath }}/ + find {{ oim_host_backup_dir }}/{{ backup_etc_openchami_subpath }} -type f -exec chmod 0600 {} \; + find {{ oim_host_backup_dir }}/{{ backup_etc_openchami_subpath }} -type d -exec chmod 0700 {} \; register: etc_openchami_backup_result changed_when: etc_openchami_backup_result.rc == 0 when: etc_openchami_stat.stat.exists | default(false) @@ -318,7 +329,7 @@ path: "{{ backup_etc_openchami_subpath }}" present: {{ etc_openchami_backup_stat.stat.exists | default(false) }} dest: "{{ openchami_backup_dir }}/openchami_backup_manifest.yml" - mode: "{{ file_permissions_644 }}" + mode: "{{ file_permissions_600 }}" - name: Display backup completion summary ansible.builtin.debug: diff --git a/upgrade/roles/upgrade_openchami/tasks/upgrade_openchami_containers.yml b/upgrade/roles/upgrade_openchami/tasks/upgrade_openchami_containers.yml index e2ff8bd7fa..9430b711ac 100644 --- a/upgrade/roles/upgrade_openchami/tasks/upgrade_openchami_containers.yml +++ b/upgrade/roles/upgrade_openchami/tasks/upgrade_openchami_containers.yml @@ -525,6 +525,93 @@ connection: ssh # --- 9.
Start services and recover any failed services --- + # Create Podman networks if they don't exist - these enable DNS resolution + # between containers (e.g., hydra, postgres hostname resolution) + - name: Ensure ochami-internal Podman network exists + ansible.builtin.command: > + podman network create --ignore ochami-internal + register: create_internal_net + changed_when: "'ochami-internal' in create_internal_net.stdout" + failed_when: false + delegate_to: oim + delegate_facts: true + connection: ssh + + - name: Ensure ochami-jwt-internal Podman network exists + ansible.builtin.command: > + podman network create --ignore ochami-jwt-internal + register: create_jwt_net + changed_when: "'ochami-jwt-internal' in create_jwt_net.stdout" + failed_when: false + delegate_to: oim + delegate_facts: true + connection: ssh + + - name: Ensure ochami-cert-internal Podman network exists + ansible.builtin.command: > + podman network create --ignore ochami-cert-internal + register: create_cert_net + changed_when: "'ochami-cert-internal' in create_cert_net.stdout" + failed_when: false + delegate_to: oim + delegate_facts: true + connection: ssh + + - name: Ensure ochami-external Podman network exists + ansible.builtin.command: > + podman network create --ignore ochami-external + register: create_ext_net + changed_when: "'ochami-external' in create_ext_net.stdout" + failed_when: false + delegate_to: oim + delegate_facts: true + connection: ssh + + # Start Podman network services if they exist (from openchami RPM) + - name: Start OpenCHAMI internal network service + ansible.builtin.systemd: + name: openchami-internal-network.service + state: started + enabled: true + delegate_to: oim + delegate_facts: true + connection: ssh + failed_when: false + + - name: Start OpenCHAMI external network service + ansible.builtin.systemd: + name: openchami-external-network.service + state: started + enabled: true + delegate_to: oim + delegate_facts: true + connection: ssh + failed_when: false + + - 
name: Start OpenCHAMI cert internal network service + ansible.builtin.systemd: + name: openchami-cert-internal-network.service + state: started + enabled: true + delegate_to: oim + delegate_facts: true + connection: ssh + failed_when: false + + - name: Start OpenCHAMI JWT internal network service + ansible.builtin.systemd: + name: openchami-jwt-internal-network.service + state: started + enabled: true + delegate_to: oim + delegate_facts: true + connection: ssh + failed_when: false + + - name: Wait for network services to initialize + ansible.builtin.pause: + seconds: 5 + - name: Start OpenCHAMI services with new images ansible.builtin.systemd: name: openchami.target diff --git a/upgrade/roles/upgrade_openchami/vars/main.yml b/upgrade/roles/upgrade_openchami/vars/main.yml index 5f0b9bca95..b122cae2de 100644 --- a/upgrade/roles/upgrade_openchami/vars/main.yml +++ b/upgrade/roles/upgrade_openchami/vars/main.yml @@ -14,7 +14,9 @@ --- # File permissions +dir_permissions_700: "0700" dir_permissions_755: "0755" +file_permissions_600: "0600" file_permissions_644: "0644" # Manifest path for upgrade state tracking diff --git a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml index d051385353..9655f55130 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/include_required_input.yml @@ -60,6 +60,7 @@ when: - omnia_config is defined - omnia_config.service_k8s_cluster is defined + - omnia_config.service_k8s_cluster | length > 0 tags: always - name: Set k8s_client_mount_path @@ -70,13 +71,22 @@ | first).mount_point }} when: - storage_config is defined + - storage_config.mounts is defined - k8s_nfs_storage_name is defined + - storage_config.mounts | selectattr('name', 'equalto', k8s_nfs_storage_name) | list | length > 0 tags: always + # ── Load high_availability_config.yml ── +- name: Check if high_availability_config.yml exists + 
ansible.builtin.stat: + path: "{{ input_project_dir }}/high_availability_config.yml" + register: ha_config_stat + - name: Read high_availability_config.yml for kube_vip ansible.builtin.include_vars: file: "{{ input_project_dir }}/high_availability_config.yml" name: ha_config + when: ha_config_stat.stat.exists - name: Debug high_availability_config.yml content ansible.builtin.debug: @@ -90,6 +100,7 @@ kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" cacheable: true when: + - ha_config is defined - ha_config.service_k8s_cluster_ha is defined - ha_config.service_k8s_cluster_ha | length > 0 diff --git a/upgrade/roles/upgrade_telemetry/tasks/main.yml b/upgrade/roles/upgrade_telemetry/tasks/main.yml index ee5fd1d282..68c087306c 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/main.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/main.yml @@ -54,9 +54,21 @@ # ── Phase 3: Execute telemetry.sh to redeploy telemetry stack ── - name: Phase 3 - Execute telemetry.sh to redeploy telemetry stack ansible.builtin.include_tasks: execute_telemetry_sh.yml + when: + - k8s_client_mount_path is defined + - kube_vip is defined + - kube_vip | length > 0 + +- name: Skip telemetry.sh (k8s not configured) + ansible.builtin.debug: + msg: "Skipping telemetry.sh execution — service_k8s not configured (Slurm-only deployment)." 
+ when: k8s_client_mount_path is not defined or kube_vip is not defined or kube_vip | length == 0 # ── Phase 4: Verify all telemetry pods and set upgrade status ── - name: Phase 4 - Verify all telemetry pods and set upgrade status + when: + - kube_vip is defined + - kube_vip | length > 0 block: - name: Get all telemetry pods status ansible.builtin.shell: diff --git a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml index 847ad36af4..e99d7bf80c 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/migrate_statefulset.yml @@ -168,3 +168,43 @@ when: orphaned_pods.stdout_lines | default([]) | length > 0 delegate_to: "{{ kube_vip }}" connection: ssh + + # ── Cleanup old pre-operator services and deployments ── + # The operator creates new services with different names (e.g. vminsert-victoria-cluster), + # so the old standalone services become stale and waste LoadBalancer IPs. + - name: Find old pre-operator services + ansible.builtin.shell: | + set -o pipefail + kubectl -n {{ telemetry_namespace }} get svc --no-headers 2>/dev/null \ + | awk '{print $1}' \ + | grep -xE 'vminsert|vmselect|vmstorage|vmagent' || true + register: old_services + changed_when: false + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Delete old pre-operator services + ansible.builtin.command: + cmd: kubectl -n {{ telemetry_namespace }} delete svc {{ item }} --timeout=30s + loop: "{{ old_services.stdout_lines | default([]) | select() | list }}" + changed_when: true + failed_when: false + when: old_services.stdout_lines | default([]) | select() | list | length > 0 + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Delete old vmagent deployment (replaced by operator-managed VMAgent) + ansible.builtin.shell: | + kubectl -n {{ telemetry_namespace }} get deployment {{ old_vmagent_deployment }} --no-headers 2>/dev/null && \ + kubectl -n {{
telemetry_namespace }} delete deployment {{ old_vmagent_deployment }} --timeout=60s || true + changed_when: true + failed_when: false + delegate_to: "{{ kube_vip }}" + connection: ssh + + - name: Display old resource cleanup summary + ansible.builtin.debug: + msg: + - "Old services deleted: {{ old_services.stdout_lines | default([]) | select() | list }}" + - "Old vmagent deployment cleanup attempted: {{ old_vmagent_deployment }}" diff --git a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml index 6116afb37f..76755b45b9 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/patch_idrac_termination_grace_period.yml @@ -48,7 +48,7 @@ msg: "idrac-telemetry current replica count: {{ idrac_replica_count.stdout }}" when: idrac_sts_check.rc == 0 -- name: Patch terminationGracePeriodSeconds to 120s for graceful MySQL shutdown +- name: Patch terminationGracePeriodSeconds for graceful MySQL shutdown ansible.builtin.command: cmd: > kubectl patch statefulset idrac-telemetry -n {{ telemetry_namespace }} diff --git a/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml b/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml index 4fa40ba520..40cd32336a 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/upgrade_operator.yml @@ -16,15 +16,29 @@ # Install / upgrade VictoriaMetrics operator via Helm # ============================================================================ +- name: Remove finalizers from VictoriaMetrics CRDs (prevents delete hang) + ansible.builtin.shell: | + set -o pipefail + for crd in $(kubectl get crd 2>/dev/null | grep victoriametrics | awk '{print $1}'); do + kubectl patch crd "$crd" --type=merge -p '{"metadata":{"finalizers":[]}}' 2>/dev/null || true + done + changed_when: false + failed_when: 
false + delegate_to: "{{ kube_vip }}" + connection: ssh + - name: Delete existing VictoriaMetrics CRDs (to fix Helm ownership issues) ansible.builtin.shell: | set -o pipefail - kubectl get crd | grep victoriametrics | awk '{print $1}' | xargs kubectl delete crd 2>/dev/null || true + for crd in $(kubectl get crd 2>/dev/null | grep victoriametrics | awk '{print $1}'); do + timeout 30 kubectl delete crd "$crd" --timeout=30s 2>/dev/null || true + done register: crd_delete_result changed_when: true failed_when: false delegate_to: "{{ kube_vip }}" connection: ssh + timeout: 120 - name: Install VictoriaMetrics operator from tarball ansible.builtin.command: diff --git a/utils/credential_utility/roles/create_config/templates/omnia_credential.j2 b/utils/credential_utility/roles/create_config/templates/omnia_credential.j2 index d9ffd938b4..f1ae1091f6 100644 --- a/utils/credential_utility/roles/create_config/templates/omnia_credential.j2 +++ b/utils/credential_utility/roles/create_config/templates/omnia_credential.j2 @@ -58,3 +58,7 @@ ome_password: "" # UFM telemetry credentials ufm_username: "" ufm_password: "" + +# VAST telemetry credentials +vast_username: "" +vast_password: ""