diff --git a/build_image_aarch64/build_image_aarch64.yml b/build_image_aarch64/build_image_aarch64.yml index 08ee0b4ad8..d5dc76a82d 100644 --- a/build_image_aarch64/build_image_aarch64.yml +++ b/build_image_aarch64/build_image_aarch64.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml b/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml index e5bc523294..40c6b1092c 100644 --- a/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml +++ b/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml @@ -24,9 +24,9 @@ software_config_path: "{{ software_config_file_path }}" register: base_image_output - - name: Set x86_64_base_image_packages + - name: Set aarch_64_base_image_packages ansible.builtin.set_fact: - x86_64_base_image_packages: "{{ base_image_output.base_image_packages }}" + aarch64_base_image_packages: "{{ base_image_output.base_image_packages }}" - name: Debug package aarch64_base_image_packages ansible.builtin.debug: diff --git a/build_image_aarch64/roles/image_creation/vars/main.yml b/build_image_aarch64/roles/image_creation/vars/main.yml index 67d11422ef..984f2497d8 100644 --- a/build_image_aarch64/roles/image_creation/vars/main.yml +++ b/build_image_aarch64/roles/image_creation/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" omnia_metadata_file: "/opt/omnia/.data/oim_metadata.yml" dir_permissions_644: "0644" dir_permissions_755: "0755" +aarch64_local_tag: "aarch64-image-builder/ochami" openchami_dir: "/opt/omnia/openchami" openchami_clone_path: /opt/omnia/openchami/deployment-recipes job_retry: "120" @@ -32,7 +33,7 @@ ochami_compute_mounts: - -v {{ openchami_work_dir }}/images/rhel-{{ item.key }}-{{ rhel_tag }}.yaml:/home/builder/config.yaml:z ochami_aarch64_image: - --entrypoint /bin/bash - - localhost/arm-image/ochami + - "localhost/{{ aarch64_local_tag }}" ochami_base_command: - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG' diff --git a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml index 1801448611..4a9d150850 100644 --- a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml @@ -167,32 +167,42 @@ - name: Build full Podman image path ansible.builtin.set_fact: - pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:1.1" - -- name: Pull aarch64 image using Podman - ansible.builtin.command: - cmd: "podman pull {{ pulp_aarch_image }}" - register: podman_pull_result - ignore_errors: true - changed_when: false + pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/{{ pulp_aarch64_image_name }}" + +- name: Pull and tag aarch64 image + block: + - name: Pull aarch64 image using Podman + containers.podman.podman_image: + name: "{{ pulp_aarch_image }}" + state: present + register: podman_pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: podman_pull_result is not failed + changed_when: false + + - name: Tag pulled image + containers.podman.podman_tag: + image: "{{ pulp_aarch_image }}" + target_names: + - "{{ 
aarch64_local_tag }}" + changed_when: false + + rescue: + - name: Fail if Podman pull failed + ansible.builtin.fail: + msg: "Failed to pull image {{ pulp_aarch_image }}" + +- name: Check if regctl binary exists + ansible.builtin.stat: + path: "{{ ochami_aarch_64_dir }}/regctl" + register: regctl_stat + delegate_to: localhost -- name: Fail if Podman pull failed +- name: Fail if regctl binary not found ansible.builtin.fail: - msg: "{{ aarch64_image_fail_msg }}" - when: podman_pull_result.rc != 0 - -- name: Tag pulled image - ansible.builtin.command: - cmd: "podman tag {{ pulp_aarch_image }} arm-image/ochami" - when: podman_pull_result.rc == 0 - changed_when: false - -- name: Download regctl binary to NFS shared path - ansible.builtin.get_url: - url: "{{ aarch64_regctl_url }}" - dest: "{{ ochami_aarch_64_dir }}/regctl" - mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" - delegate_to: localhost + msg: "{{ regctl_not_found_msg }}" + when: not regctl_stat.stat.exists - name: Copy regctl binary to /usr/local/bin on target host ansible.builtin.copy: diff --git a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml index d240f27de4..c0ce2868aa 100644 --- a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,10 +15,13 @@ # input files input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" +pulp_aarch64_image_name: "dellhpcomniaaisolution/image-build-aarch64:1.1" +aarch64_local_tag: "aarch64-image-builder/ochami" +pull_image_retries: "3" +pull_image_delay: "10" network_spec: "{{ input_project_dir }}/network_spec.yml" ochami_aarch_64_dir: "/opt/omnia/openchami/aarch64" pulp_repo_store_path: "{{ ochami_aarch_64_dir }}/pulp.repo" -aarch64_regctl_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" pulp_repo_file_path: "/etc/yum.repos.d/pulp.repo" pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" @@ -39,3 +42,6 @@ aarch64_image_fail_msg: > Unable to pull the Ochami aarch64 image builder image. Make sure you have added the default package for aarch64 in the software_config.json file and ran local_repo.yml. If not, add that package and rerun local_repo.yml. +regctl_not_found_msg: > + regctl binary not found at {{ ochami_aarch_64_dir }}/regctl. + Please run prepare_oim.yml playbook to download the regctl binary. diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml index 85ecaf93cd..8f56b86ef6 100644 --- a/build_image_x86_64/build_image_x86_64.yml +++ b/build_image_x86_64/build_image_x86_64.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,9 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local @@ -80,7 +83,7 @@ - name: Tag OpenCHAMI image ansible.builtin.include_role: name: image_creation - tasks_from: build_image_tag.yml + tasks_from: prepare_pulp_image.yml - name: OpenCHAMI build image for x86_64 hosts: localhost diff --git a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml deleted file mode 100644 index 0b7a56072d..0000000000 --- a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Pull image-build image - ansible.builtin.command: - cmd: "podman pull {{ image_build_el10 }}" - register: pull_result - retries: "{{ pull_image_retries }}" - delay: "{{ pull_image_delay }}" - until: pull_result.rc == 0 - changed_when: "'Image is up to date' not in pull_result.stdout" - -- name: Fail if image not pulled successfully - ansible.builtin.fail: - msg: "{{ pull_result.stdout }}" - when: pull_result.rc != 0 diff --git a/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml b/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml new file mode 100644 index 0000000000..22f336b849 --- /dev/null +++ b/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml @@ -0,0 +1,79 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +# Load network specification +- name: Load network spec file + ansible.builtin.include_vars: + file: "{{ network_spec }}" + register: include_network_spec + no_log: true + +- name: Fail if network spec cannot be loaded + ansible.builtin.fail: + msg: "{{ network_spec_syntax_fail_msg }} Error: {{ include_network_spec.message }}" + when: include_network_spec is failed + +# Parse network spec data +- name: Parse network spec + ansible.builtin.set_fact: + network_data: "{{ network_data | default({}) | combine({item.key: item.value}) }}" + with_dict: "{{ Networks }}" + +# Set PXE IP fact +- name: Set PXE IP fact + ansible.builtin.set_fact: + oim_pxe_ip: "{{ network_data.admin_network.primary_oim_admin_ip }}" + cacheable: true + +# Copy pulp certificate and update CA trust +- name: Copy pulp webserver certificate to anchors + ansible.builtin.copy: + src: "{{ pulp_webserver_cert_path }}" + dest: "{{ anchors_path }}" + mode: "{{ dir_permissions_644 }}" + become: true + +- name: Update CA trust + ansible.builtin.command: update-ca-trust + register: update_ca + changed_when: false + +- name: Build full Podman image path for x86_64 + ansible.builtin.set_fact: + pulp_x86_image: "{{ oim_pxe_ip }}:2225/{{ pulp_x86_64_image_name }}" + +- name: Pull and tag x86_64 image + block: + - name: Pull x86_64 image using Podman + containers.podman.podman_image: + name: "{{ pulp_x86_image }}" + state: present + register: pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: pull_result is not failed + changed_when: false + + - name: Tag pulled image for x86_64 build + containers.podman.podman_tag: + image: "{{ pulp_x86_image }}" + target_names: + - "{{ x86_64_local_tag }}" + changed_when: false + + rescue: + - name: Fail if Podman pull failed + ansible.builtin.fail: + msg: "Failed to pull image {{ pulp_x86_image }}." 
diff --git a/build_image_x86_64/roles/image_creation/vars/main.yml b/build_image_x86_64/roles/image_creation/vars/main.yml index a05a39d37d..60dcf0bc6f 100644 --- a/build_image_x86_64/roles/image_creation/vars/main.yml +++ b/build_image_x86_64/roles/image_creation/vars/main.yml @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -image_build_el10: "docker.io/dellhpcomniaaisolution/image-build-el10:1.0" +pulp_x86_64_image_name: "dellhpcomniaaisolution/image-build-el10:1.0" +x86_64_local_tag: "x86_64-image-builder/ochami" pull_image_retries: "3" pull_image_delay: "10" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" @@ -23,6 +24,9 @@ openchami_dir: "/opt/omnia/openchami" openchami_clone_path: /opt/omnia/openchami/deployment-recipes job_retry: "120" job_delay: "30" +network_spec: "{{ input_project_dir }}/network_spec.yml" +pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" +anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir" ochami_mounts: - --user 0 --privileged @@ -35,7 +39,7 @@ ochami_compute_mounts: ochami_x86_64_image: - --entrypoint /bin/bash - - docker.io/dellhpcomniaaisolution/image-build-el10:1.0 + - "localhost/{{ x86_64_local_tag }}" ochami_base_command: - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG' @@ -54,3 +58,5 @@ compute_image_failure_msg: | # build_compute_image.yml openchami_compute_image_vars_template: "{{ role_path }}/templates/compute_images_templates.j2" openchami_compute_image_vars_path: "/opt/omnia/openchami/compute_images_template.yaml" + +network_spec_syntax_fail_msg: "Failed to load network_spec.yml due to syntax error" diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 
4de8aafa88..0f369f3950 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -26,8 +26,12 @@ # log path for input validator INPUT_VALIDATOR_LOG_PATH = '/opt/omnia/log/core/playbooks/' -ENTITLEMENT_PEM = '/opt/omnia/rhel_repo_certs/*.pem' -REDHAT_REPO_FILE = '/opt/omnia/rhel_repo_certs/redhat.repo' +# Subscription checking paths - checked in order of priority +SYSTEM_ENTITLEMENT_PATH = '/etc/pki/entitlement/*.pem' +SYSTEM_REDHAT_REPO = '/etc/yum.repos.d/redhat.repo' + +OMNIA_ENTITLEMENT_PATH = '/opt/omnia/rhel_repo_certs/*.pem' +OMNIA_REDHAT_REPO = '/opt/omnia/rhel_repo_certs/redhat.repo' # dict to hold the file names. If any file's name changes just change it here. files = { @@ -76,6 +80,7 @@ "storage": [files["storage_config"]], "prepare_oim": [ files["network_spec"], + files["software_config"] ], # "high_availability": [files["high_availability_config"]], # "additional_software": [files["additional_software"]], @@ -141,6 +146,8 @@ TYPE_REQUIREMENTS = { "rpm": ["package", "repo_name"], "rpm_list": ["package_list", "repo_name"], + "rpm_file": ["package", "url"], + "rpm_repo": ["package", "repo_name"], "ansible_galaxy_collection": ["package", "version"], "git": ["package", "version", "url"], "image": ["package", ["tag", "digest"]], # Special: one of tag or digest diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 0e59272815..a8c50266a0 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -14,7 +14,9 @@ # These are the slurm options for version - 25.11 import re +import os from enum import Enum +from collections import OrderedDict class SlurmParserEnum(str, Enum): @@ -59,13 +61,15 @@ class SlurmParserEnum(str, Enum): 
S_P_LIST = SlurmParserEnum.S_P_LIST -downnodes_options = { +slurm_downnodes_options = { + "DownNodes": S_P_STRING, "Reason": S_P_STRING, "State": S_P_STRING, } -nodename_options = { +slurm_nodename_options = { + "NodeName": S_P_STRING, "BcastAddr": S_P_STRING, "Boards": S_P_UINT16, "CoreSpecCount": S_P_UINT16, @@ -97,13 +101,15 @@ class SlurmParserEnum(str, Enum): } -nodeset_options = { +slurm_nodeset_options = { + "NodeSet": S_P_STRING, "Feature": S_P_STRING, "Nodes": S_P_STRING } -partition_options = { +slurm_partitionname_options = { + "PartitionName": S_P_STRING, "AllocNodes": S_P_CSV, "AllowAccounts": S_P_CSV, "AllowGroups": S_P_CSV, @@ -152,7 +158,8 @@ class SlurmParserEnum(str, Enum): "TRESBillingWeights": S_P_CSV } -# From https://github.com/SchedMD/slurm/blob/slurm-/src/common/read_config.c +# From +# https://github.com/SchedMD/slurm/blob/slurm-/src/common/read_config.c slurm_options = { "AccountingStorageBackupHost": S_P_STRING, "AccountingStorageEnforce": S_P_CSV, @@ -397,7 +404,8 @@ class SlurmParserEnum(str, Enum): "SlurmctldHost": S_P_LIST } -# From https://github.com/SchedMD/slurm/blob/slurm-/src/slurmdbd/read_config.c +# From +# https://github.com/SchedMD/slurm/blob/slurm-/src/slurmdbd/read_config.c slurmdbd_options = { "AllowNoDefAcct": S_P_BOOLEAN, "AllResourcesAbsolute": S_P_BOOLEAN, @@ -468,7 +476,8 @@ class SlurmParserEnum(str, Enum): "TrackSlurmctldDown": S_P_BOOLEAN } -# From https://github.com/SchedMD/slurm/blob/slurm-/src/interfaces/cgroup.c#L332 +# From +# https://github.com/SchedMD/slurm/blob/slurm-/src/interfaces/cgroup.c#L332 cgroup_options = { "CgroupAutomount": S_P_BOOLEAN, "CgroupMountpoint": S_P_STRING, @@ -495,7 +504,41 @@ class SlurmParserEnum(str, Enum): "SystemdTimeout": S_P_UINT64 } -# From https://github.com/SchedMD/slurm/blob/slurm-/src/plugins/mpi/pmix/mpi_pmix.c#L83 +# From +# https://github.com/SchedMD/slurm/blob/slurm-s/src/interfaces/gres.c#L101C40-L116C2 +_gres_options = { + "AutoDetect": S_P_STRING, + "Count": 
S_P_STRING, # Number of Gres available + "CPUs": S_P_STRING, # CPUs to bind to Gres resource + "Cores": S_P_CSV, # Cores to bind to Gres resource + "File": S_P_STRING, # Path to Gres device + "Files": S_P_STRING, # Path to Gres device + "Flags": S_P_STRING, # GRES Flags + "Link": S_P_STRING, # Communication link IDs + "Links": S_P_CSV, # Communication link IDs + "MultipleFiles": S_P_CSV, # list of GRES device files + "Type": S_P_STRING +} + +gres_options = _gres_options.copy() +gres_options.update({ + "Name": S_P_ARRAY, + "NodeName": S_P_ARRAY +}) + +gres_nodename_options = _gres_options.copy() +gres_nodename_options.update({ + "NodeName": S_P_STRING, + "Name": S_P_STRING +}) + +gres_name_options = _gres_options.copy() +gres_name_options.update({ + "Name": S_P_STRING +}) + +# From +# https://github.com/SchedMD/slurm/blob/slurm-/src/plugins/mpi/pmix/mpi_pmix.c#L83 mpi_options = { "PMIxCliTmpDirBase": S_P_STRING, "PMIxCollFence": S_P_STRING, @@ -512,20 +555,177 @@ class SlurmParserEnum(str, Enum): "PMIxTlsUCX": S_P_CSV } -# From https://github.com/SchedMD/slurm/blob/slurm-s/src/interfaces/gres.c#L101C40-L116C2 -gres_options = { - "AutoDetect": S_P_STRING, - "Count": S_P_STRING, # Number of Gres available - "CPUs": S_P_STRING, # CPUs to bind to Gres resource - "Cores": S_P_CSV, # Cores to bind to Gres resource - "File": S_P_STRING, # Path to Gres device - "Files": S_P_STRING, # Path to Gres device - "Flags": S_P_STRING, # GRES Flags - "Link": S_P_STRING, # Communication link IDs - "Links": S_P_CSV, # Communication link IDs - "MultipleFiles": S_P_CSV, # list of GRES device files - "Name": S_P_STRING, # Gres name - "Type": S_P_STRING # Gres type (e.g. 
model name) +# src/common/oci_config.c +oci_options = { + "ContainerPath": S_P_STRING, + "CreateEnvFile": S_P_STRING, + "DisableHooks": S_P_STRING, + "EnvExclude": S_P_STRING, + "MountSpoolDir": S_P_STRING, + "RunTimeCreate": S_P_STRING, + "RunTimeDelete": S_P_STRING, + "RunTimeKill": S_P_STRING, + "RunTimeEnvExclude": S_P_STRING, + "RunTimeQuery": S_P_STRING, + "RunTimeRun": S_P_STRING, + "RunTimeStart": S_P_STRING, + "SrunPath": S_P_STRING, + "SrunArgs": S_P_LIST, + "DisableCleanup": S_P_BOOLEAN, + "StdIODebug": S_P_STRING, + "SyslogDebug": S_P_STRING, + "FileDebug": S_P_STRING, + "DebugFlags": S_P_STRING, + "IgnoreFileConfigJson": S_P_BOOLEAN +} + +# From +# src/plugins/acct_gather_*/* +acct_gather_options = { + "EnergyIPMIDriverType": S_P_UINT32, + "EnergyIPMIDisableAutoProbe": S_P_UINT32, + "EnergyIPMIDriverAddress": S_P_UINT32, + "EnergyIPMIRegisterSpacing": S_P_UINT32, + "EnergyIPMIDriverDevice": S_P_STRING, + "EnergyIPMIProtocolVersion": S_P_UINT32, + "EnergyIPMIUsername": S_P_STRING, + "EnergyIPMIPassword": S_P_STRING, + "EnergyIPMIPrivilegeLevel": S_P_UINT32, + "EnergyIPMIAuthenticationType": S_P_UINT32, + "EnergyIPMICipherSuiteId": S_P_UINT32, + "EnergyIPMISessionTimeout": S_P_UINT32, + "EnergyIPMIRetransmissionTimeout": S_P_UINT32, + "EnergyIPMIWorkaroundFlags": S_P_UINT32, + "EnergyIPMIRereadSdrCache": S_P_BOOLEAN, + "EnergyIPMIIgnoreNonInterpretableSensors": S_P_BOOLEAN, + "EnergyIPMIBridgeSensors": S_P_BOOLEAN, + "EnergyIPMIInterpretOemData": S_P_BOOLEAN, + "EnergyIPMISharedSensors": S_P_BOOLEAN, + "EnergyIPMIDiscreteReading": S_P_BOOLEAN, + "EnergyIPMIIgnoreScanningDisabled": S_P_BOOLEAN, + "EnergyIPMIAssumeBmcOwner": S_P_BOOLEAN, + "EnergyIPMIEntitySensorNames": S_P_BOOLEAN, + "EnergyIPMIFrequency": S_P_UINT32, + "EnergyIPMICalcAdjustment": S_P_BOOLEAN, + "EnergyIPMIPowerSensors": S_P_STRING, + "EnergyIPMITimeout": S_P_UINT32, + "EnergyIPMIVariable": S_P_STRING, + "ProfileHDF5Dir": S_P_STRING, + "ProfileHDF5Default": S_P_STRING, + 
"ProfileInfluxDBDatabase": S_P_STRING, + "ProfileInfluxDBDefault": S_P_STRING, + "ProfileInfluxDBFrequency": S_P_UINT32, + "ProfileInfluxDBHost": S_P_STRING, + "ProfileInfluxDBPass": S_P_STRING, + "ProfileInfluxDBRTPolicy": S_P_STRING, + "ProfileInfluxDBTimeout": S_P_UINT32, + "ProfileInfluxDBUser": S_P_STRING, + "InterconnectOFEDPort": S_P_UINT32, + "InfinibandOFEDPort": S_P_UINT32, + "SysfsInterfaces": S_P_STRING +} + +# src/plugins/burst_buffer/common/burst_buffer_common.c +burst_buffer_options = { + "AllowUsers": S_P_STRING, + "CreateBuffer": S_P_STRING, + "DefaultPool": S_P_STRING, + "DenyUsers": S_P_STRING, + "DestroyBuffer": S_P_STRING, + "Directive": S_P_STRING, + "Flags": S_P_STRING, + "GetSysState": S_P_STRING, + "GetSysStatus": S_P_STRING, + "Granularity": S_P_STRING, + "OtherTimeout": S_P_UINT32, + "PollInterval": S_P_UINT32, + "Pools": S_P_STRING, + "StageInTimeout": S_P_UINT32, + "StageOutTimeout": S_P_UINT32, + "StartStageIn": S_P_STRING, + "StartStageOut": S_P_STRING, + "StopStageIn": S_P_STRING, + "StopStageOut": S_P_STRING, + "ValidateTimeout": S_P_UINT32 +} + +# src/plugins/node_features/helpers/node_features_helpers.c +helpers_options = { + "AllowUserBoot": S_P_STRING, + "BootTime": S_P_UINT32, + "ExecTime": S_P_UINT32, + "Feature": S_P_ARRAY, + "MutuallyExclusive": S_P_LIST, + "NodeName": S_P_ARRAY +} + +helpers_nodename_options = { + "AllowUserBoot": S_P_STRING, + "BootTime": S_P_UINT32, + "ExecTime": S_P_UINT32, + "Feature": S_P_CSV, + "MutuallyExclusive": S_P_LIST +} + +helpers_feature_options = { + "Feature": S_P_CSV, + "Helper": S_P_STRING, + "Flags": S_P_STRING +} + +# src/plugins/namespace/tmpfs/read_jcconf.c +job_container_options = { + "AutoBasePath": S_P_BOOLEAN, + "InitScript": S_P_STRING, + "BasePath": S_P_ARRAY, + "EntireStepInNS": S_P_BOOLEAN, + "NodeName": S_P_ARRAY, + "Shared": S_P_BOOLEAN, + "CloneNSScript": S_P_STRING, + "CloneNSEpilog": S_P_STRING, + "CloneNSScript_Wait": S_P_UINT32, + "CloneNSEpilog_Wait": S_P_UINT32 +} + 
+job_container_nodename_options = { + "AutoBasePath": S_P_BOOLEAN, + "BasePath": S_P_STRING, + "Dirs": S_P_STRING, + "EntireStepInNS": S_P_BOOLEAN, + "NodeName": S_P_STRING, + "Shared": S_P_BOOLEAN, + "CloneNSScript": S_P_STRING, + "CloneNSEpilog": S_P_STRING, + "CloneNSScript_Wait": S_P_UINT32, + "CloneNSEpilog_Wait": S_P_UINT32 +} + +job_container_basename_options = { + "BasePath": S_P_STRING, + "Dirs": S_P_STRING +} + +# src/plugins/topology/tree/switch_record.c +topology_options = { + "SwitchName": S_P_ARRAY, + "LinkSpeed": S_P_UINT32, + "Nodes": S_P_STRING, + "Switches": S_P_STRING, + "BlockName": S_P_ARRAY, + "BlockSizes": S_P_STRING +} + +topology_switchname_options = { + "SwitchName": S_P_STRING, + "LinkSpeed": S_P_UINT32, + "Nodes": S_P_STRING, + "Switches": S_P_STRING +} + +topology_blockname_options = { + "BlockName": S_P_STRING, + "BlockSizes": S_P_STRING, + "Nodes": S_P_STRING } all_confs = { @@ -533,19 +733,160 @@ class SlurmParserEnum(str, Enum): "slurmdbd": slurmdbd_options, "cgroup": cgroup_options, "mpi": mpi_options, + "oci": oci_options, + "acct_gather": acct_gather_options, + "burst_buffer": burst_buffer_options, + "helpers": helpers_options, + "job_container": job_container_options, + "topology": topology_options, "gres": gres_options, # TOD: GRES can have different combinations, NodeName and Name # https://slurm.schedmd.com/gres.conf.html#SECTION_EXAMPLES - "PartitionName": partition_options, - "NodeName": nodename_options, - "DownNodes": downnodes_options, - "NodeSet": nodeset_options + "slurm->PartitionName": slurm_partitionname_options, + "slurm->NodeName": slurm_nodename_options, + "slurm->DownNodes": slurm_downnodes_options, + "slurm->NodeSet": slurm_nodeset_options, + "gres->Name": gres_name_options, + "gres->NodeName": gres_nodename_options, + "job_container->NodeName": job_container_nodename_options, + "job_container->BaseName": job_container_basename_options, + "topology->SwitchName": topology_switchname_options, + 
"topology->BlockName": topology_blockname_options, + "helpers->NodeName": helpers_nodename_options, + "helpers->Feature": helpers_feature_options } _HOSTLIST_RE = re.compile( r'^(?P[^\[\]]*)\[(?P[^\[\]]+)\](?P.*)$') +def validate_config_types(conf_dict, conf_name, module): + """Validate configuration keys and value types based on SlurmParserEnum.""" + current_conf = all_confs.get(conf_name, {}) + if not current_conf: + return {'invalid_keys': [], 'type_errors': []} + invalid_keys = list( + set(conf_dict.keys()).difference(set(current_conf.keys()))) + type_errors = [] + + for key, value in conf_dict.items(): + if key in current_conf: + expected_type_enum = current_conf[key] + expected_type = expected_type_enum.value + error = None + + if expected_type == "int": + if not isinstance(value, int): + try: + int(str(value)) + except (ValueError, TypeError): + error = f"Expected integer, got {type(value).__name__}" + + elif expected_type == "float": + if not isinstance(value, (int, float)): + try: + float(str(value)) + except (ValueError, TypeError): + error = f"Expected float, got {type(value).__name__}" + + elif expected_type == "bool": + if not isinstance(value, bool): + if str(value).lower() not in [ + 'yes', 'no', 'true', 'false', '0', '1']: + error = f"Expected boolean, got {type(value).__name__}" + + elif expected_type == "str": + if not isinstance(value, str): + error = f"Expected string, got {type(value).__name__}" + + elif expected_type == "csv": + if not isinstance(value, str): + error = f"Expected CSV string, got {type(value).__name__}" + + elif expected_type == "list": + if not isinstance(value, list): + error = f"Expected list, got {type(value).__name__}" + + elif expected_type == "array": + if not isinstance(value, list): + error = f"Expected array (list), got {type(value).__name__}" + elif value: + if not all(isinstance(item, dict) for item in value): + error = "Expected array of dicts, got mixed types" + else: + # Recursively validate each dict item in the 
array + for item in value: + item_result = validate_config_types( + item, f"{conf_name}->{key}", module) + type_errors.extend(item_result['type_errors']) + invalid_keys.extend(item_result['invalid_keys']) + elif expected_type == "object": + if not isinstance(value, (dict, object)): + error = f"Expected object, got {type(value).__name__}" + + if error: + type_errors.append({ # format for error message in input validator + "error_key": "omnia_config.yml", + "error_msg": f"{conf_name}.conf: '{key}': {error} -> '{value}'", + "error_value": "slurm_cluster->config_sources" + }) + return { + 'invalid_keys': list(invalid_keys), + 'type_errors': type_errors + } + + +def parse_slurm_conf(file_path, conf_name, validate): + """Parses the slurm.conf file and returns it as a dictionary.""" + current_conf = all_confs.get(conf_name, {}) + slurm_dict = OrderedDict() + dup_keys = [] + + if not os.path.exists(file_path): + raise FileNotFoundError(f"{file_path} not found.") + + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + # handles any comment after the data + line = line.split('#')[0].strip() + if not line: + continue + # Split the line by one or more spaces + items = line.split() + tmp_dict = OrderedDict() + for item in items: + # Split only on the first '=' to allow '=' inside the value + key, value = item.split('=', 1) + tmp_dict[key.strip()] = value.strip() + skey = list(tmp_dict.keys())[0] + if validate and skey not in current_conf: + raise ValueError( + f"Invalid key while parsing {file_path}: {skey}") + if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY or len(tmp_dict) > 1: + slurm_dict[list(tmp_dict.keys())[0]] = list( + slurm_dict.get(list(tmp_dict.keys())[0], [])) + [tmp_dict] + elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV: + existing_values = [ + v.strip() for v in slurm_dict.get( + skey, "").split(',') if v.strip()] + new_values = [v.strip() + for v in tmp_dict[skey].split(',') if v.strip()] + slurm_dict[skey] = ",".join( + list( + 
dict.fromkeys( + existing_values + + new_values))) + elif current_conf.get(skey) == SlurmParserEnum.S_P_LIST: + slurm_dict[skey] = list(slurm_dict.get( + skey, [])) + list(tmp_dict.values()) + else: + if skey in slurm_dict: + dup_keys.append(skey) + else: + slurm_dict.update(tmp_dict) + return slurm_dict, dup_keys + + def expand_hostlist(expr): """ Expand simple Slurm-style hostlist expressions, e.g.: diff --git a/common/library/module_utils/input_validation/schema/local_repo_config.json b/common/library/module_utils/input_validation/schema/local_repo_config.json index 664b02b20c..e44cf44df7 100644 --- a/common/library/module_utils/input_validation/schema/local_repo_config.json +++ b/common/library/module_utils/input_validation/schema/local_repo_config.json @@ -2,6 +2,67 @@ "$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "properties": { + "user_registry": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "host": { + "type": "string", + "minLength": 1, + "pattern": "^[a-zA-Z0-9.-]+:[0-9]+$" + }, + "cert_path": { + "type": "string", + "pattern": "^$|^[a-zA-Z0-9/\\._-]*\\.crt$" + }, + "key_path": { + "type": "string", + "pattern": "^$|^[a-zA-Z0-9/\\._-]*\\.key$" + } + }, + "required": [ + "host" + ], + "allOf": [ + { + "if": { + "properties": { + "cert_path": { + "minLength": 1 + } + } + }, + "then": { + "properties": { + "cert_path": { + "pattern": "^[a-zA-Z0-9/\\._-]*\\.crt$" + } + } + } + }, + { + "if": { + "properties": { + "key_path": { + "minLength": 1 + } + } + }, + "then": { + "properties": { + "key_path": { + "pattern": "^[a-zA-Z0-9/\\._-]*\\.key$" + } + } + } + } + ] + } + }, "user_repo_url_x86_64": { "type": [ "array", @@ -1082,4 +1143,4 @@ "omnia_repo_url_rhel_x86_64" ], "additionalProperties": false -} \ No newline at end of file +} diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index 
f53485770f..ca7266124c 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -19,6 +19,10 @@ "minLength": 1, "description": "Name of the nfs storage in storage_config.yml" }, + "skip_merge": { + "type": "boolean", + "description": "Variable indicates whether a specific configuration file path under config_sources should be used as-is without merging" + }, "config_sources": { "type": "object", "description": "Config can be a file path or inline mapping", diff --git a/common/library/module_utils/input_validation/schema/slurm_config_parameters.json b/common/library/module_utils/input_validation/schema/slurm_config_parameters.json new file mode 100644 index 0000000000..19480de228 --- /dev/null +++ b/common/library/module_utils/input_validation/schema/slurm_config_parameters.json @@ -0,0 +1,501 @@ +{ + "slurm.conf": { + "AccountingStorageBackupHost": "S_P_STRING", + "AccountingStorageEnforce": "S_P_STRING", + "AccountingStorageExternalHost": "S_P_STRING", + "AccountingStorageHost": "S_P_STRING", + "AccountingStorageParameters": "S_P_STRING", + "AccountingStoragePass": "S_P_STRING", + "AccountingStoragePort": "S_P_UINT16", + "AccountingStorageTRES": "S_P_STRING", + "AccountingStorageType": "S_P_STRING", + "AccountingStorageUser": "S_P_STRING", + "AccountingStoreFlags": "S_P_STRING", + "AccountingStoreJobComment": "S_P_BOOLEAN", + "AcctGatherEnergyType": "S_P_STRING", + "AcctGatherFilesystemType": "S_P_STRING", + "AcctGatherInfinibandType": "S_P_STRING", + "AcctGatherInterconnectType": "S_P_STRING", + "AcctGatherNodeFreq": "S_P_UINT16", + "AcctGatherProfileType": "S_P_STRING", + "AllowSpecResourcesUsage": "S_P_BOOLEAN", + "AuthAltParameters": "S_P_STRING", + "AuthAltTypes": "S_P_STRING", + "AuthInfo": "S_P_STRING", + "AuthType": "S_P_STRING", + "BackupAddr": "S_P_STRING", + "BackupController": "S_P_STRING", + "BatchStartTimeout": "S_P_UINT16", + "BcastExclude": 
"S_P_STRING", + "BcastParameters": "S_P_STRING", + "BurstBufferParameters": "S_P_STRING", + "BurstBufferType": "S_P_STRING", + "CertgenType": "S_P_STRING", + "CertgenParameters": "S_P_STRING", + "CertmgrType": "S_P_STRING", + "CertmgrParameters": "S_P_STRING", + "CliFilterParameters": "S_P_STRING", + "CliFilterPlugins": "S_P_STRING", + "ClusterName": "S_P_STRING", + "CommunicationParameters": "S_P_STRING", + "CompleteWait": "S_P_UINT16", + "ControlAddr": "S_P_STRING", + "ControlMachine": "S_P_STRING", + "CoreSpecPlugin": "S_P_STRING", + "CpuFreqDef": "S_P_STRING", + "CpuFreqGovernors": "S_P_STRING", + "CredType": "S_P_STRING", + "CryptoType": "S_P_STRING", + "DataParserParameters": "S_P_STRING", + "DebugFlags": "S_P_STRING", + "DefCPUPerGPU": "S_P_UINT64", + "DefMemPerCPU": "S_P_UINT64", + "DefMemPerGPU": "S_P_UINT64", + "DefMemPerNode": "S_P_UINT64", + "DependencyParameters": "S_P_STRING", + "DisableRootJobs": "S_P_BOOLEAN", + "EioTimeout": "S_P_UINT16", + "EnforcePartLimits": "S_P_STRING", + "Epilog": "S_P_ARRAY", + "EpilogMsgTime": "S_P_UINT32", + "EpilogSlurmctld": "S_P_ARRAY", + "EpilogTimeout": "S_P_UINT16", + "ExtSensorsFreq": "S_P_UINT16", + "ExtSensorsType": "S_P_STRING", + "FairShareDampeningFactor": "S_P_UINT16", + "FastSchedule": "S_P_UINT16", + "FederationParameters": "S_P_STRING", + "FirstJobId": "S_P_UINT32", + "GetEnvTimeout": "S_P_UINT16", + "GpuFreqDef": "S_P_STRING", + "GresTypes": "S_P_STRING", + "GroupUpdateForce": "S_P_UINT16", + "GroupUpdateTime": "S_P_UINT16", + "HashPlugin": "S_P_STRING", + "HealthCheckInterval": "S_P_UINT16", + "HealthCheckNodeState": "S_P_STRING", + "HealthCheckProgram": "S_P_STRING", + "HttpParserType": "S_P_STRING", + "InactiveLimit": "S_P_UINT16", + "InteractiveStepOptions": "S_P_STRING", + "JobAcctGatherFrequency": "S_P_STRING", + "JobAcctGatherParams": "S_P_STRING", + "JobAcctGatherType": "S_P_STRING", + "JobCompHost": "S_P_STRING", + "JobCompLoc": "S_P_STRING", + "JobCompParams": "S_P_STRING", + "JobCompPass": 
"S_P_STRING", + "JobCompPassScript": "S_P_STRING", + "JobCompPort": "S_P_UINT32", + "JobCompType": "S_P_STRING", + "JobCompUser": "S_P_STRING", + "JobContainerType": "S_P_STRING", + "JobCredentialPrivateKey": "S_P_STRING", + "JobCredentialPublicCertificate": "S_P_STRING", + "JobFileAppend": "S_P_UINT16", + "JobRequeue": "S_P_UINT16", + "JobSubmitPlugins": "S_P_STRING", + "KeepAliveTime": "S_P_UINT32", + "KillOnBadExit": "S_P_UINT16", + "KillWait": "S_P_UINT16", + "LaunchParameters": "S_P_STRING", + "LaunchType": "S_P_STRING", + "Licenses": "S_P_STRING", + "LogTimeFormat": "S_P_STRING", + "MailDomain": "S_P_STRING", + "MailProg": "S_P_STRING", + "MaxArraySize": "S_P_UINT32", + "MaxBatchRequeue": "S_P_UINT32", + "MaxDBDMsgs": "S_P_UINT32", + "MaxJobCount": "S_P_UINT32", + "MaxJobId": "S_P_UINT32", + "MaxMemPerCPU": "S_P_UINT64", + "MaxMemPerNode": "S_P_UINT64", + "MaxNodeCount": "S_P_UINT32", + "MaxStepCount": "S_P_UINT32", + "MaxTasksPerNode": "S_P_UINT16", + "MCSParameters": "S_P_STRING", + "MCSPlugin": "S_P_STRING", + "MessageTimeout": "S_P_UINT16", + "MetricsType": "S_P_STRING", + "MinJobAge": "S_P_UINT32", + "MpiDefault": "S_P_STRING", + "MpiParams": "S_P_STRING", + "NamespaceType": "S_P_STRING", + "NodeFeaturesPlugins": "S_P_STRING", + "OverTimeLimit": "S_P_UINT16", + "PluginDir": "S_P_STRING", + "PlugStackConfig": "S_P_STRING", + "PowerParameters": "S_P_STRING", + "PowerPlugin": "S_P_STRING", + "PreemptExemptTime": "S_P_STRING", + "PreemptMode": "S_P_STRING", + "PreemptParameters": "S_P_STRING", + "PreemptType": "S_P_STRING", + "PrEpParameters": "S_P_STRING", + "PrEpPlugins": "S_P_STRING", + "PriorityCalcPeriod": "S_P_STRING", + "PriorityDecayHalfLife": "S_P_STRING", + "PriorityFavorSmall": "S_P_BOOLEAN", + "PriorityFlags": "S_P_STRING", + "PriorityMaxAge": "S_P_STRING", + "PriorityParameters": "S_P_STRING", + "PrioritySiteFactorParameters": "S_P_STRING", + "PrioritySiteFactorPlugin": "S_P_STRING", + "PriorityType": "S_P_STRING", + "PriorityUsageResetPeriod": 
"S_P_STRING", + "PriorityWeightAge": "S_P_UINT32", + "PriorityWeightAssoc": "S_P_UINT32", + "PriorityWeightFairshare": "S_P_UINT32", + "PriorityWeightJobSize": "S_P_UINT32", + "PriorityWeightPartition": "S_P_UINT32", + "PriorityWeightQOS": "S_P_UINT32", + "PriorityWeightTRES": "S_P_STRING", + "PrivateData": "S_P_STRING", + "ProctrackType": "S_P_STRING", + "Prolog": "S_P_ARRAY", + "PrologEpilogTimeout": "S_P_UINT16", + "PrologFlags": "S_P_STRING", + "PrologSlurmctld": "S_P_ARRAY", + "PrologTimeout": "S_P_UINT16", + "PropagatePrioProcess": "S_P_UINT16", + "PropagateResourceLimits": "S_P_STRING", + "PropagateResourceLimitsExcept": "S_P_STRING", + "RebootProgram": "S_P_STRING", + "ReconfigFlags": "S_P_STRING", + "RequeueExit": "S_P_STRING", + "RequeueExitHold": "S_P_STRING", + "ResumeFailProgram": "S_P_STRING", + "ResumeProgram": "S_P_STRING", + "ResumeRate": "S_P_UINT16", + "ResumeTimeout": "S_P_UINT16", + "ResvEpilog": "S_P_STRING", + "ResvOverRun": "S_P_UINT16", + "ResvProlog": "S_P_STRING", + "ReturnToService": "S_P_UINT16", + "RoutePlugin": "S_P_STRING", + "SallocDefaultCommand": "S_P_STRING", + "SbcastParameters": "S_P_STRING", + "SchedulerParameters": "S_P_STRING", + "SchedulerTimeSlice": "S_P_UINT16", + "SchedulerType": "S_P_STRING", + "ScronParameters": "S_P_STRING", + "SelectType": "S_P_STRING", + "SelectTypeParameters": "S_P_STRING", + "SlurmctldAddr": "S_P_STRING", + "SlurmctldDebug": "S_P_STRING", + "SlurmctldLogFile": "S_P_STRING", + "SlurmctldParameters": "S_P_STRING", + "SlurmctldPidFile": "S_P_STRING", + "SlurmctldPort": "S_P_STRING", + "SlurmctldPrimaryOffProg": "S_P_STRING", + "SlurmctldPrimaryOnProg": "S_P_STRING", + "SlurmctldSyslogDebug": "S_P_STRING", + "SlurmctldTimeout": "S_P_UINT16", + "SlurmdDebug": "S_P_STRING", + "SlurmdLogFile": "S_P_STRING", + "SlurmdParameters": "S_P_STRING", + "SlurmdPidFile": "S_P_STRING", + "SlurmdPort": "S_P_UINT32", + "SlurmdSpoolDir": "S_P_STRING", + "SlurmdSyslogDebug": "S_P_STRING", + "SlurmdTimeout": 
"S_P_UINT16", + "SlurmdUser": "S_P_STRING", + "SlurmSchedLogFile": "S_P_STRING", + "SlurmSchedLogLevel": "S_P_UINT16", + "SlurmUser": "S_P_STRING", + "SrunEpilog": "S_P_STRING", + "SrunPortRange": "S_P_STRING", + "SrunProlog": "S_P_STRING", + "StateSaveLocation": "S_P_STRING", + "SuspendExcNodes": "S_P_STRING", + "SuspendExcParts": "S_P_STRING", + "SuspendExcStates": "S_P_STRING", + "SuspendProgram": "S_P_STRING", + "SuspendRate": "S_P_UINT16", + "SuspendTime": "S_P_STRING", + "SuspendTimeout": "S_P_UINT16", + "SwitchParameters": "S_P_STRING", + "SwitchType": "S_P_STRING", + "TaskEpilog": "S_P_STRING", + "TaskPlugin": "S_P_STRING", + "TaskPluginParam": "S_P_STRING", + "TaskProlog": "S_P_STRING", + "TCPTimeout": "S_P_UINT16", + "TLSParameters": "S_P_STRING", + "TLSType": "S_P_STRING", + "TmpFS": "S_P_STRING", + "TopologyParam": "S_P_STRING", + "TopologyPlugin": "S_P_STRING", + "TrackWCKey": "S_P_BOOLEAN", + "TreeWidth": "S_P_UINT16", + "UnkillableStepProgram": "S_P_STRING", + "UnkillableStepTimeout": "S_P_UINT16", + "UrlParserType": "S_P_STRING", + "UsePAM": "S_P_BOOLEAN", + "VSizeFactor": "S_P_UINT16", + "WaitTime": "S_P_UINT16", + "X11Parameters": "S_P_STRING", + "DownNodes": "S_P_ARRAY", + "NodeName": "S_P_ARRAY", + "NodeSet": "S_P_ARRAY", + "PartitionName": "S_P_ARRAY", + "SlurmctldHost": "S_P_ARRAY" + }, + "slurmdbd.conf": { + "AllowNoDefAcct": "S_P_BOOLEAN", + "AllResourcesAbsolute": "S_P_BOOLEAN", + "ArchiveDir": "S_P_STRING", + "ArchiveEvents": "S_P_BOOLEAN", + "ArchiveJobs": "S_P_BOOLEAN", + "ArchiveResvs": "S_P_BOOLEAN", + "ArchiveScript": "S_P_STRING", + "ArchiveSteps": "S_P_BOOLEAN", + "ArchiveSuspend": "S_P_BOOLEAN", + "ArchiveTXN": "S_P_BOOLEAN", + "ArchiveUsage": "S_P_BOOLEAN", + "AuthAltTypes": "S_P_STRING", + "AuthAltParameters": "S_P_STRING", + "AuthInfo": "S_P_STRING", + "AuthType": "S_P_STRING", + "CommitDelay": "S_P_UINT16", + "CommunicationParameters": "S_P_STRING", + "DbdAddr": "S_P_STRING", + "DbdBackupHost": "S_P_STRING", + "DbdHost": 
"S_P_STRING", + "DbdPort": "S_P_UINT16", + "DebugFlags": "S_P_STRING", + "DebugLevel": "S_P_STRING", + "DebugLevelSyslog": "S_P_STRING", + "DefaultQOS": "S_P_STRING", + "DisableCoordDBD": "S_P_BOOLEAN", + "DisableArchiveCommands": "S_P_BOOLEAN", + "HashPlugin": "S_P_STRING", + "JobPurge": "S_P_UINT32", + "LogFile": "S_P_STRING", + "LogTimeFormat": "S_P_STRING", + "MaxPurgeLimit": "S_P_UINT32", + "MaxQueryTimeRange": "S_P_STRING", + "MessageTimeout": "S_P_UINT16", + "Parameters": "S_P_STRING", + "PidFile": "S_P_STRING", + "PluginDir": "S_P_STRING", + "PrivateData": "S_P_STRING", + "PurgeEventAfter": "S_P_STRING", + "PurgeJobAfter": "S_P_STRING", + "PurgeResvAfter": "S_P_STRING", + "PurgeStepAfter": "S_P_STRING", + "PurgeSuspendAfter": "S_P_STRING", + "PurgeTXNAfter": "S_P_STRING", + "PurgeUsageAfter": "S_P_STRING", + "PurgeEventMonths": "S_P_UINT32", + "PurgeJobMonths": "S_P_UINT32", + "PurgeStepMonths": "S_P_UINT32", + "PurgeSuspendMonths": "S_P_UINT32", + "PurgeTXNMonths": "S_P_UINT32", + "PurgeUsageMonths": "S_P_UINT32", + "SlurmUser": "S_P_STRING", + "StepPurge": "S_P_UINT32", + "StorageBackupHost": "S_P_STRING", + "StorageHost": "S_P_STRING", + "StorageLoc": "S_P_STRING", + "StorageParameters": "S_P_STRING", + "StoragePass": "S_P_STRING", + "StoragePassScript": "S_P_STRING", + "StoragePort": "S_P_UINT16", + "StorageType": "S_P_STRING", + "StorageUser": "S_P_STRING", + "TCPTimeout": "S_P_UINT16", + "TLSParameters": "S_P_STRING", + "TLSType": "S_P_STRING", + "TrackWCKey": "S_P_BOOLEAN", + "TrackSlurmctldDown": "S_P_BOOLEAN" + }, + "cgroup.conf": { + "CgroupAutomount": "S_P_BOOLEAN", + "CgroupMountpoint": "S_P_STRING", + "CgroupSlice": "S_P_STRING", + "ConstrainCores": "S_P_BOOLEAN", + "ConstrainRAMSpace": "S_P_BOOLEAN", + "AllowedRAMSpace": "S_P_FLOAT", + "MaxRAMPercent": "S_P_FLOAT", + "MinRAMSpace": "S_P_UINT64", + "ConstrainSwapSpace": "S_P_BOOLEAN", + "AllowedSwapSpace": "S_P_FLOAT", + "MaxSwapPercent": "S_P_FLOAT", + "MemoryLimitEnforcement": "S_P_BOOLEAN", 
+ "MemoryLimitThreshold": "S_P_FLOAT", + "ConstrainDevices": "S_P_BOOLEAN", + "AllowedDevicesFile": "S_P_STRING", + "MemorySwappiness": "S_P_UINT64", + "CgroupPlugin": "S_P_STRING", + "IgnoreSystemd": "S_P_BOOLEAN", + "IgnoreSystemdOnFailure": "S_P_BOOLEAN", + "EnableControllers": "S_P_BOOLEAN", + "EnableExtraControllers": "S_P_STRING", + "SignalChildrenProcesses": "S_P_BOOLEAN", + "SystemdTimeout": "S_P_UINT64" + }, + "gres.conf": { + "AutoDetect": "S_P_STRING", + "Count": "S_P_STRING", + "CPUs": "S_P_STRING", + "Cores": "S_P_STRING", + "File": "S_P_STRING", + "Files": "S_P_STRING", + "Flags": "S_P_STRING", + "Link": "S_P_STRING", + "Links": "S_P_STRING", + "MultipleFiles": "S_P_STRING", + "Name": "S_P_STRING", + "Type": "S_P_STRING" + }, + "oci.conf": { + "ContainerPath": "S_P_STRING", + "CreateEnvFile": "S_P_STRING", + "DisableHooks": "S_P_STRING", + "EnvExclude": "S_P_STRING", + "MountSpoolDir": "S_P_STRING", + "RunTimeCreate": "S_P_STRING", + "RunTimeDelete": "S_P_STRING", + "RunTimeKill": "S_P_STRING", + "RunTimeEnvExclude": "S_P_STRING", + "RunTimeQuery": "S_P_STRING", + "RunTimeRun": "S_P_STRING", + "RunTimeStart": "S_P_STRING", + "SrunPath": "S_P_STRING", + "SrunArgs": "S_P_ARRAY", + "DisableCleanup": "S_P_BOOLEAN", + "StdIODebug": "S_P_STRING", + "SyslogDebug": "S_P_STRING", + "FileDebug": "S_P_STRING", + "DebugFlags": "S_P_STRING", + "IgnoreFileConfigJson": "S_P_BOOLEAN" + }, + "acct_gather.conf": { + "EnergyIPMIDriverType": "S_P_UINT32", + "EnergyIPMIDisableAutoProbe": "S_P_UINT32", + "EnergyIPMIDriverAddress": "S_P_UINT32", + "EnergyIPMIRegisterSpacing": "S_P_UINT32", + "EnergyIPMIDriverDevice": "S_P_STRING", + "EnergyIPMIProtocolVersion": "S_P_UINT32", + "EnergyIPMIUsername": "S_P_STRING", + "EnergyIPMIPassword": "S_P_STRING", + "EnergyIPMIPrivilegeLevel": "S_P_UINT32", + "EnergyIPMIAuthenticationType": "S_P_UINT32", + "EnergyIPMICipherSuiteId": "S_P_UINT32", + "EnergyIPMISessionTimeout": "S_P_UINT32", + "EnergyIPMIRetransmissionTimeout": 
"S_P_UINT32", + "EnergyIPMIWorkaroundFlags": "S_P_UINT32", + "EnergyIPMIRereadSdrCache": "S_P_BOOLEAN", + "EnergyIPMIIgnoreNonInterpretableSensors": "S_P_BOOLEAN", + "EnergyIPMIBridgeSensors": "S_P_BOOLEAN", + "EnergyIPMIInterpretOemData": "S_P_BOOLEAN", + "EnergyIPMISharedSensors": "S_P_BOOLEAN", + "EnergyIPMIDiscreteReading": "S_P_BOOLEAN", + "EnergyIPMIIgnoreScanningDisabled": "S_P_BOOLEAN", + "EnergyIPMIAssumeBmcOwner": "S_P_BOOLEAN", + "EnergyIPMIEntitySensorNames": "S_P_BOOLEAN", + "EnergyIPMIFrequency": "S_P_UINT32", + "EnergyIPMICalcAdjustment": "S_P_BOOLEAN", + "EnergyIPMIPowerSensors": "S_P_STRING", + "EnergyIPMITimeout": "S_P_UINT32", + "EnergyIPMIVariable": "S_P_STRING", + "ProfileHDF5Dir": "S_P_STRING", + "ProfileHDF5Default": "S_P_STRING", + "ProfileInfluxDBDatabase": "S_P_STRING", + "ProfileInfluxDBDefault": "S_P_STRING", + "ProfileInfluxDBFrequency": "S_P_UINT32", + "ProfileInfluxDBHost": "S_P_STRING", + "ProfileInfluxDBPass": "S_P_STRING", + "ProfileInfluxDBRTPolicy": "S_P_STRING", + "ProfileInfluxDBTimeout": "S_P_UINT32", + "ProfileInfluxDBUser": "S_P_STRING", + "InterconnectOFEDPort": "S_P_UINT32", + "InfinibandOFEDPort": "S_P_UINT32", + "SysfsInterfaces": "S_P_STRING" + }, + "burst_buffer.conf": { + "AllowUsers": "S_P_STRING", + "CreateBuffer": "S_P_STRING", + "DefaultPool": "S_P_STRING", + "DenyUsers": "S_P_STRING", + "DestroyBuffer": "S_P_STRING", + "Directive": "S_P_STRING", + "Flags": "S_P_STRING", + "GetSysState": "S_P_STRING", + "GetSysStatus": "S_P_STRING", + "Granularity": "S_P_STRING", + "OtherTimeout": "S_P_UINT32", + "PollInterval": "S_P_UINT32", + "Pools": "S_P_STRING", + "StageInTimeout": "S_P_UINT32", + "StageOutTimeout": "S_P_UINT32", + "StartStageIn": "S_P_STRING", + "StartStageOut": "S_P_STRING", + "StopStageIn": "S_P_STRING", + "StopStageOut": "S_P_STRING", + "ValidateTimeout": "S_P_UINT32" + }, + "helpers.conf": { + "AllowUserBoot": "S_P_STRING", + "BootTime": "S_P_UINT32", + "ExecTime": "S_P_UINT32", + "Feature": "S_P_ARRAY", 
+ "MutuallyExclusive": "S_P_LIST", + "NodeName": "S_P_ARRAY" + }, + "job_container.conf": { + "AutoBasePath": "S_P_BOOLEAN", + "BasePath": "S_P_ARRAY", + "EntireStepInNS": "S_P_BOOLEAN", + "InitScript": "S_P_STRING", + "Shared": "S_P_BOOLEAN", + "CloneNSScript": "S_P_STRING", + "CloneNSEpilog": "S_P_STRING", + "CloneNSScript_Wait": "S_P_UINT32", + "CloneNSEpilog_Wait": "S_P_UINT32" + }, + "mpi.conf": { + "PMIxCliTmpDirBase": "S_P_STRING", + "PMIxCollFence": "S_P_STRING", + "PMIxDebug": "S_P_UINT32", + "PMIxDirectConn": "S_P_BOOLEAN", + "PMIxDirectConnEarly": "S_P_BOOLEAN", + "PMIxDirectConnUCX": "S_P_BOOLEAN", + "PMIxDirectSameArch": "S_P_BOOLEAN", + "PMIxEnv": "S_P_STRING", + "PMIxFenceBarrier": "S_P_BOOLEAN", + "PMIxNetDevicesUCX": "S_P_STRING", + "PMIxShareServerTopology": "S_P_BOOLEAN", + "PMIxTimeout": "S_P_UINT32", + "PMIxTlsUCX": "S_P_CSV" + }, + "topology.conf": { + "SwitchName": "S_P_ARRAY", + "LinkSpeed": "S_P_UINT32", + "Nodes": "S_P_STRING", + "Switches": "S_P_STRING", + "BlockName": "S_P_ARRAY", + "BlockSizes": "S_P_STRING" + }, + "type_definitions": { + "S_P_IGNORE": "Any instance of specified key and associated value in a file will be allowed, but the value will not be stored", + "S_P_STRING": "String value", + "S_P_PLAIN_STRING": "Plain string value (not expanded in S_P_EXPLINE contexts)", + "S_P_LONG": "Long integer value", + "S_P_UINT16": "Unsigned 16-bit integer", + "S_P_UINT32": "Unsigned 32-bit integer", + "S_P_UINT64": "Unsigned 64-bit integer", + "S_P_POINTER": "Pointer type (custom handler)", + "S_P_ARRAY": "Array of values (allows multiple occurrences)", + "S_P_LIST": "List of values (allows multiple occurrences)", + "S_P_CSV": "Comma-separated values", + "S_P_BOOLEAN": "Boolean value (true/false, yes/no)", + "S_P_LINE": "Nested configuration line with sub-options", + "S_P_EXPLINE": "Expanded line with hostlist expansion support", + "S_P_FLOAT": "Floating point value", + "S_P_DOUBLE": "Double precision floating point", + "S_P_LONG_DOUBLE": 
"Long double precision floating point" + } +} diff --git a/common/library/module_utils/input_validation/schema/storage_config.json b/common/library/module_utils/input_validation/schema/storage_config.json index 41746905f1..e300410346 100644 --- a/common/library/module_utils/input_validation/schema/storage_config.json +++ b/common/library/module_utils/input_validation/schema/storage_config.json @@ -51,7 +51,8 @@ "minItems": 1 }, "powervault_config": { - "required": ["ip", "isci_initiators", "volume_id"], + "type": "object", + "required": ["ip", "iscsi_initiator", "volume_id"], "properties": { "ip": { "description": "List of target controller IP addresses", @@ -69,14 +70,16 @@ "type": "integer" }, - "isci_initiators": { + "iscsi_initiator": { "description": "iSCSI initiator IQN", - "type": "string" + "type": "string", + "pattern": "^iqn\\.[a-zA-Z0-9.-]+(?::[a-zA-Z0-9._:-]+)?$" }, "volume_id": { "description": "Volume identifier (hex string)", - "type": "string" + "type": "string", + "pattern": "^[a-fA-F0-9]+$" } } } diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 06f33be0e4..36f55130d4 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -36,11 +36,14 @@ from ansible.module_utils.local_repo.software_utils import ( load_json, - load_yaml, get_subgroup_dict, get_software_names, get_json_file_path ) +from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import ( + parse_slurm_conf, + validate_config_types +) file_names = config.files create_error_msg = validation_utils.create_error_msg @@ -230,6 +233,19 @@ def validate_software_config( ) ) + # Check for required subgroups when specific software names are present + software_requiring_subgroups = ["additional_packages", 
"slurm_custom", "service_k8s"] + for software_name in software_requiring_subgroups: + if software_name in software_names: + if software_name not in data or not data[software_name]: + errors.append( + create_error_msg( + "Validation Error: ", + software_name, + f"is present in softwares but corresponding subgroup '{software_name}' is missing or empty in software_config.json. Please refer examples directory for the correct format." + ) + ) + for software_pkg in data['softwares']: software = software_pkg['name'] arch_list = software_pkg.get('arch') @@ -1058,17 +1074,37 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - # config_paths_list = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] - # for cfg_path_dict in config_paths_list: - # for k,v in cfg_path_dict.items(): - # if isinstance(v, str) and not os.path.exists(v): - # errors.append( - # create_error_msg( - # input_file_path, - # "slurm config_paths", - # f"config_path for {k} - {v} does not exist" - # )) + skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") + cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + skip_merge_list = [clst.get('skip_merge', False) for clst in data.get('slurm_cluster')] + for idx, cfg_path_dict in enumerate(cnfg_src): + skip_merge = skip_merge_list[idx] + for k,v in cfg_path_dict.items(): + conf_dict = None + if isinstance(v, str): + if not os.path.exists(v): + errors.append( + create_error_msg('omnia_config.yml', "slurm_cluster config_sources", + f"provided conf path for {k} - {v} does not exist")) + continue + else: # path exists + if not skip_merge and not skip_conf_validation: + conf_dict, duplicate_keys = parse_slurm_conf(v, k, False) + if duplicate_keys: + errors.append( + create_error_msg('omnia_config.yml', "slurm_cluster->config_sources", + f"duplicate keys found in {k}.conf - 
{','.join(duplicate_keys)}")) + else: + conf_dict = v + if conf_dict and not skip_conf_validation: + validation_result = validate_config_types(conf_dict, k, module) + if validation_result.get('type_errors'): + errors.extend(validation_result['type_errors']) + if validation_result.get('invalid_keys'): + errors.append( + create_error_msg('omnia_config.yml', "slurm_cluster->config_sources", + f"{k}.conf invalid keys found - {','.join(validation_result['invalid_keys'])}")) return errors def check_is_service_cluster_functional_groups_defined( diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index ee2dd12a29..343a4f3de1 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,43 +29,77 @@ def check_subscription_status(logger=None): """ - Check if the system has an active Red Hat subscription. - Subscription status is considered True if either entitlement - certificates exist or the required Red Hat repository URLs are present. - - Checks mounted host paths (/etc/pki/entitlement, /etc/yum.repos.d/redhat.repo). + Check if the system has an active Red Hat subscription enabled. + If system entitlement certificates are found in /etc/pki/entitlement, + only system paths are checked. Otherwise, Omnia paths are checked. + Subscription is enabled only if entitlement certificates and required + Red Hat repository URLs are found in the same source (system or Omnia). 
Returns: - bool: True if the system is subscribed (either entitlement certs - exist or required repos are present), False otherwise. - """ - # 1. Check entitlement certs - entitlement_certs = glob.glob(config.ENTITLEMENT_PEM) - has_entitlement = len(entitlement_certs) > 0 - if logger: - logger.info(f"Entitlement certs in {config.ENTITLEMENT_PEM}: {len(entitlement_certs)} found") - - # 2. Check redhat repos in redhat.repo + bool: True if subscription is enabled (both entitlement certs + and repos are found in the same source), False otherwise. + """ + # 1. Check system entitlement certs first + system_entitlement_certs = glob.glob(config.SYSTEM_ENTITLEMENT_PATH) + has_system_entitlement = len(system_entitlement_certs) > 0 + + if has_system_entitlement: + # System entitlement found - use system paths only + entitlement_certs = system_entitlement_certs + has_entitlement = True + repo_file_to_check = config.SYSTEM_REDHAT_REPO + + if logger: + logger.info(f"Found {len(system_entitlement_certs)} system entitlement certs - using system paths only") + else: + # No system entitlement - check Omnia paths + omnia_entitlement_certs = glob.glob(config.OMNIA_ENTITLEMENT_PATH) + entitlement_certs = omnia_entitlement_certs + has_entitlement = len(omnia_entitlement_certs) > 0 + repo_file_to_check = config.OMNIA_REDHAT_REPO + + if logger: + logger.info(f"No system entitlement found - checking Omnia paths: {len(omnia_entitlement_certs)} certs found") + + # 2. Check repos based on which entitlement path was used + has_repos = False repo_urls = [] - redhat_repo = config.REDHAT_REPO_FILE - if os.path.exists(redhat_repo): - with open(redhat_repo, "r") as f: - for line in f: - if line.startswith("baseurl ="): - url = line.split("=", 1)[1].strip() - if re.search(r"(codeready-builder|baseos|appstream)", url, re.IGNORECASE): - repo_urls.append(url) - - has_repos = len(repo_urls) > 0 - if logger: - logger.info(f"Repo URLs in {redhat_repo}: {len(repo_urls)} found") - - # 3. 
Subscription status logic - subscription_status = has_entitlement or has_repos + redhat_repo_used = None + + if os.path.exists(repo_file_to_check): + try: + with open(repo_file_to_check, "r") as f: + for line in f: + if line.startswith("baseurl ="): + url = line.split("=", 1)[1].strip() + if re.search(r"(codeready-builder|baseos|appstream)", url, re.IGNORECASE): + repo_urls.append(url) + + if repo_urls: + has_repos = True + redhat_repo_used = repo_file_to_check + if logger: + logger.info(f"Found {len(repo_urls)} repo URLs in {repo_file_to_check}") + elif logger: + logger.info(f"No required repo URLs found in {repo_file_to_check}") + except (IOError, OSError) as e: + if logger: + logger.warning(f"Error reading {repo_file_to_check}: {e}") + elif logger: + logger.info(f"Repo file {repo_file_to_check} does not exist") + + # 3. Subscription enabled if entitlement and repos are found in the same source + subscription_enabled = has_entitlement and has_repos + if logger: - logger.info(f"Subscription status: {subscription_status} (entitlement={has_entitlement}, repos={has_repos})") + logger.info( + f"Subscription enabled: {subscription_enabled} " + f"(entitlement={has_entitlement}, repos={has_repos}, " + f"entitlement_source={entitlement_certs[0] if entitlement_certs else 'None'}, " + f"repo_source={redhat_repo_used})" + ) - return subscription_status + return subscription_enabled # Below is a validation function for each file in the input folder def validate_local_repo_config(input_file_path, data, @@ -78,6 +112,22 @@ def validate_local_repo_config(input_file_path, data, errors = [] base_repo_names = [] local_repo_yml = create_file_path(input_file_path, file_names["local_repo_config"]) + + user_registry = data.get("user_registry") + if user_registry: + for registry in user_registry: + host = registry.get("host") + cert_path = registry.get("cert_path") + key_path = registry.get("key_path") + + # Validate user_registry certificate and key paths + if cert_path and not 
os.path.exists(cert_path): + errors.append(create_error_msg(local_repo_yml, "user_registry", + f"Certificate file not found: {cert_path}")) + + if key_path and not os.path.exists(key_path): + errors.append(create_error_msg(local_repo_yml, "user_registry", + f"Key file not found: {key_path}")) repo_names = {} sub_result = check_subscription_status(logger) logger.info(f"validate_local_repo_config: Subscription status: {sub_result}") diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 7eef7bef20..cc6b4d8e76 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -91,6 +91,65 @@ def validate_functional_groups_separation(pxe_mapping_file_path): if errors: raise ValueError("PXE mapping file group separation validation errors: " + "; ".join([str(e) for e in errors])) +def validate_slurm_login_compiler_prefix(pxe_mapping_file_path): + """Validate that slurm_node and login_compiler entries align on architecture suffix when both are present. + + - Functional group suffix must be either _x86_64 or _aarch64 (case-sensitive). + - When both slurm_node* and login_compiler_node* are present, their suffixes must match. + + Raises ValueError with details if suffixes differ. Prefix differences are allowed. 
+ """ + + if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): + raise ValueError(f"PXE mapping file not found: {pxe_mapping_file_path}") + + with open(pxe_mapping_file_path, "r", encoding="utf-8") as fh: + raw_lines = fh.readlines() + + non_comment_lines = [ln for ln in raw_lines if ln.strip()] + reader = csv.DictReader(non_comment_lines) + + fieldname_map = {fn.strip().upper(): fn for fn in reader.fieldnames} + fg_col = fieldname_map.get("FUNCTIONAL_GROUP_NAME") + hostname_col = fieldname_map.get("HOSTNAME") + + if not fg_col or not hostname_col: + raise ValueError("FUNCTIONAL_GROUP_NAME or HOSTNAME column not found in PXE mapping file") + + arch_map = {"slurm_node": [], "login_compiler_node": []} + + for row_idx, row in enumerate(reader, start=2): + fg_name = row.get(fg_col, "").strip() if row.get(fg_col) else "" + hostname = row.get(hostname_col, "").strip() if row.get(hostname_col) else "" + if not fg_name or not hostname: + continue + + fg_arch = None + fg_base = fg_name + for suffix in ("_x86_64", "_aarch64"): + if fg_name.endswith(suffix): + fg_arch = suffix.lstrip("_") + fg_base = fg_name[: -len(suffix)] + break + + if fg_base in arch_map and fg_arch: + arch_map[fg_base].append((fg_arch, row_idx)) + + if not arch_map["slurm_node"] or not arch_map["login_compiler_node"]: + return + + slurm_arch, _ = arch_map["slurm_node"][0] + login_arch, _ = arch_map["login_compiler_node"][0] + if slurm_arch != login_arch: + slurm_rows = [str(r[1]) for r in arch_map["slurm_node"]] + login_rows = [str(r[1]) for r in arch_map["login_compiler_node"]] + raise ValueError( + "Architecture suffix mismatch between slurm_node and login_compiler_node. " + f"slurm_node suffix '{slurm_arch}' vs " + f"login_compiler_node suffix '{login_arch}' " + "Ensure both use the same suffix (_x86_64 or _aarch64)." + ) + def validate_duplicate_hostnames_in_mapping_file(pxe_mapping_file_path): """ Validates that HOSTNAME values in the mapping file are unique. 
@@ -684,6 +743,7 @@ def validate_provision_config( validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path) validate_functional_groups_separation(pxe_mapping_file_path) validate_parent_service_tag_hierarchy(pxe_mapping_file_path) + validate_slurm_login_compiler_prefix(pxe_mapping_file_path) # Validate ADMIN_IPs against network_spec.yml ranges network_spec_path = create_file_path(input_file_path, file_names["network_spec"]) diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 9c9af639fb..7bfea4b301 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -33,10 +33,10 @@ DEFAULT_REPO_STORE_PATH = "/tmp/offline_repo" USER_JSON_FILE_DEFAULT = "" DEFAULT_STATUS_FILENAME = "status.csv" -STATUS_CSV_HEADER = 'name,type,status\n' +STATUS_CSV_HEADER = 'name,type,repo_name,status\n' SOFTWARE_CSV_HEADER = "name,status" -USER_REG_CRED_INPUT = "/opt/omnia/input/project_default/user_registry_credential.yml" -USER_REG_KEY_PATH = "/opt/omnia/input/project_default/.local_repo_credentials_key" +# USER_REG_CRED_INPUT = "/opt/omnia/input/project_default/user_registry_credential.yml" +# USER_REG_KEY_PATH = "/opt/omnia/input/project_default/.local_repo_credentials_key" # ---------------------------- # Software tasklist Defaults # Used by prepare_tasklist.py @@ -51,7 +51,7 @@ # Used by software_utils.py # ---------------------------- PACKAGE_TYPES = ['rpm', 'deb', 'tarball', 'image', 'manifest', 'git', - 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list'] + 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file', 'rpm_repo'] CSV_COLUMNS = {"column1": "name", "column2": "status"} SOFTWARE_CONFIG_SUBDIR = "config" RPM_LABEL_TEMPLATE = "RPMs for {key}" @@ -64,6 +64,10 @@ "x86_64": ["dnf", "download", "--resolve", "--alldeps", "--arch=x86_64,noarch"], "aarch64": ["dnf", "download", "--forcearch", "aarch64", "--resolve", "--alldeps", "--exclude=*.x86_64"] } +DNF_INFO_COMMANDS = { + "x86_64": ["dnf", "info", "--quiet"], + "aarch64": ["dnf", "info", "--quiet", "--forcearch=aarch64"] +} # ---------------------------- # Used by download_common.py @@ -78,12 +82,34 @@ "show_distribution": "pulp file distribution show --name %s", "distribution_create": "pulp file distribution create --name %s --base-path %s --repository %s", "distribution_update": "pulp file distribution update --name %s --base-path %s --repository %s", + + # Cleanup commands + "delete_repository": "pulp file repository destroy --name %s", + "delete_distribution": "pulp file distribution destroy --name %s", + 
"delete_publication": "pulp file publication destroy --href %s", + "list_publications": "pulp file publication list --repository %s", + "list_repositories": "pulp file repository list", + "list_distributions": "pulp file distribution list", + "list_content": "pulp file content list --repository-version %s", + "show_repository_version": "pulp file repository version show --repository %s", + "orphan_cleanup": "pulp orphan cleanup --protection-time 0" +} + +# Pulp Python repository commands (for pip modules) +pulp_python_commands = { + "list_repositories": "pulp python repository list", + "show_repository": "pulp python repository show --name %s", + "delete_repository": "pulp python repository destroy --name %s", + "list_distributions": "pulp python distribution list", + "delete_distribution": "pulp python distribution destroy --name %s", + "orphan_cleanup": "pulp orphan cleanup --protection-time 0" } + CLI_FILE_PATH = "/root/.config/pulp/cli.toml" -POST_TIMEOUT = 3600 -TAR_POLL_VAL = 3 -FILE_POLL_VAL = 1 -ISO_POLL_VAL = 15 +POST_TIMEOUT = 3600 # seconds +TAR_POLL_VAL = 25 # minutes +FILE_POLL_VAL = 1 # minutes +ISO_POLL_VAL = 15 # minutes FILE_URI = "/pulp/api/v3/content/file/files/" PULP_SSL_CA_CERT = "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" # ---------------------------- @@ -107,11 +133,23 @@ "distribute_container_repository": "pulp container distribution create --name %s --repository %s --base-path %s", "update_container_distribution": "pulp container distribution update --name %s --repository %s --base-path %s", "list_container_remote_tags": "pulp container remote list --name %s --field include_tags", - "create_container_remote_auth": "pulp container remote create --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'", - - "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'" - + 
"update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'", + # Cleanup commands + "delete_repository": "pulp container repository destroy --name %s", + "delete_remote": "pulp container remote destroy --name %s", + "delete_distribution": "pulp container distribution destroy --name %s", + "list_repositories": "pulp container repository list", + "list_remotes": "pulp container remote list", + "list_distributions": "pulp container distribution list", + # Tag-specific cleanup commands + "get_repo_version": "pulp container repository show --href %s", + "list_tags_by_version": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s", + "rename_repository": "pulp container repository update --name %s --new-name %s", + "orphan_cleanup": "pulp orphan cleanup", + "container_distribution_show": "pulp container distribution show --name %s | jq .repository", + "show_repository_version": "pulp container repository show --href %s | jq .latest_version_href", + "list_image_tags": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s" } OMNIA_CREDENTIALS_YAML_PATH = "/opt/omnia/input/project_default/omnia_config_credentials.yml" OMNIA_CREDENTIALS_VAULT_PATH = "/opt/omnia/input/project_default/.omnia_config_credentials_key" @@ -145,16 +183,50 @@ "check_distribution": "pulp rpm distribution show --name %s", "check_publication": "pulp rpm publication list --repository %s", "delete_publication": "pulp rpm publication destroy --href %s", - "get_repo_version": "pulp rpm repository show --name %s" + "get_repo_version": "pulp rpm repository show --name %s", + "list_repositories": "pulp rpm repository list", + "list_remotes": "pulp rpm remote list", + "list_distributions": "pulp rpm distribution list", + "orphan_cleanup": "pulp orphan cleanup --protection-time 0", + "list_all_publications": "pulp rpm publication list", + "upload_content": "pulp 
rpm content upload --repository %s --file %s", + "update_distribution_repo_config": "pulp rpm distribution update --name %s --generate-repo-config" } +# ---------------------------- +# Pulp Cleanup Configuration +# Used by pulp_cleanup.py and Ansible modules +# ---------------------------- + +# Default paths +CLEANUP_BASE_PATH_DEFAULT = "/opt/omnia/log/local_repo" +CLEANUP_STATUS_FILE_PATH_DEFAULT = "/opt/omnia/log/local_repo/cleanup_status.csv" +CLEANUP_LOG_PATH_DEFAULT = "/opt/omnia/log/local_repo/cleanup.log" + +# Default cleanup behavior +CLEANUP_DELETE_REMOTE_DEFAULT = True +CLEANUP_DELETE_DISTRIBUTION_DEFAULT = True +CLEANUP_CLEANUP_ORPHANS_AFTER_DEFAULT = True +CLEANUP_LIST_ONLY_DEFAULT = False +CLEANUP_FORCE_DEFAULT = False + +# Cleanup status values +CLEANUP_STATUS_SUCCESS = "Success" +CLEANUP_STATUS_FAILED = "Failed" +CLEANUP_STATUS_IN_PROGRESS = "In Progress" + +# Cleanup status file settings +CLEANUP_STATUS_FILENAME = "cleanup_status.csv" +CLEANUP_STATUS_CSV_HEADER = "artifact_name,artifact_type,status,message,timestamp\n" +CLEANUP_LOG_FILE_PATH = "/opt/omnia/log/local_repo/cleanup.log" + # ---------------------------- # Additional Repos Aggregation Settings # Used by process_rpm_config.py for aggregated repos feature # Naming convention: _omnia-additional to match existing filter patterns # ---------------------------- ADDITIONAL_REPOS_KEY = "additional_repos" -AGGREGATED_REPO_NAME_TEMPLATE = "{arch}_omnia-additional-repo" +AGGREGATED_REPO_NAME_TEMPLATE = "{arch}_omnia-additional" AGGREGATED_REMOTE_NAME_TEMPLATE = "{arch}_omnia-additional-{name}" AGGREGATED_DISTRIBUTION_NAME_TEMPLATE = "{arch}_omnia-additional" AGGREGATED_BASE_PATH_TEMPLATE = "opt/omnia/offline_repo/cluster/{arch}/rhel/10.0/rpms/omnia-additional" diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py index d8d97465d8..e3f47869af 100644 --- 
a/common/library/module_utils/local_repo/container_repo_utils.py +++ b/common/library/module_utils/local_repo/container_repo_utils.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ # limitations under the License. #pylint: disable=import-error,no-name-in-module +""" +Container repository utilities for Pulp operations. + +This module provides functions for creating, syncing, and managing +container repositories and distributions in Pulp. +""" + import multiprocessing from ansible.module_utils.local_repo.parse_and_download import execute_command from ansible.module_utils.local_repo.config import ( @@ -98,24 +105,136 @@ def create_container_distribution(repo_name,package_content,logger): logger.error(f"Error creating distribution {repo_name}: {e}") return False -def sync_container_repository(repo_name, remote_name, package_content, logger): +def sync_container_repository(repo_name, remote_name, package_content, logger, tag=None): """ Synchronizes and distribute container repository with a remote. Args: repo_name (str): The name of the repository. remote_name (str): The name of the remote. package_content (str): Upstream name. + logger: Logger instance. + tag (str, optional): The tag to validate in repository content. Returns: bool: True if the synchronization is successful, False otherwise. 
""" try: + logger.info(f"Getting repository version before sync for {repo_name}") + verify_command = pulp_container_commands["show_container_repo"] % repo_name + verify_result_before = execute_command(verify_command, logger, type_json=True) + + version_before = None + if (verify_result_before and isinstance(verify_result_before, dict) and + "stdout" in verify_result_before): + repo_data_before = verify_result_before["stdout"] + if isinstance(repo_data_before, dict): + version_before = repo_data_before.get("latest_version_href") + logger.info(f"Repository version before sync: {version_before}") + command = pulp_container_commands["sync_container_repository"] % (repo_name, remote_name) result = execute_command(command,logger) if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): + logger.error(f"Sync command failed for repository {repo_name}") return False - else: - result = create_container_distribution(repo_name,package_content,logger) - return result + + logger.info(f"Validating sync result for repository {repo_name}") + verify_result_after = execute_command(verify_command, logger, type_json=True) + + if (verify_result_after and isinstance(verify_result_after, dict) and + "stdout" in verify_result_after): + repo_data_after = verify_result_after["stdout"] + if isinstance(repo_data_after, dict): + version_after = repo_data_after.get("latest_version_href") + logger.info(f"Repository version after sync: {version_after}") + + if not version_after or version_after.endswith("/versions/0/"): + logger.error(f"Sync completed but no content was downloaded for {repo_name}. 
" + f"The specified image tag likely does not exist in the upstream registry.") + return False + + if version_before and version_after and version_before == version_after: + # Check if tag actually exists using precise Pulp commands + try: + # Step 1: Get distribution to find repository href + dist_command = f"pulp container distribution show --name {repo_name}" + dist_result = execute_command(dist_command, logger, type_json=True) + + if not dist_result or not isinstance(dist_result, dict) or "stdout" not in dist_result: + logger.info(f"Distribution {repo_name} does not exist yet - skipping tag validation, will create distribution") + # Skip tag validation but continue to create distribution at line 221 + else: + # Distribution exists, validate the tag + dist_data = dist_result["stdout"] + if not isinstance(dist_data, dict) or "repository" not in dist_data: + logger.error(f"Invalid distribution data for {repo_name}. Assuming tag doesn't exist.") + return False + repo_href = dist_data["repository"] + logger.info(f"Found repository href: {repo_href}") + + # Step 2: Get repository version href + repo_command = f"pulp container repository show --href {repo_href}" + repo_result = execute_command(repo_command, logger, type_json=True) + + if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result: + logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.") + return False + + repo_data = repo_result["stdout"] + if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data: + logger.error(f"Invalid repository data for {repo_href}. 
Assuming tag doesn't exist.") + return False + + repo_ver_href = repo_data["latest_version_href"] + logger.info(f"Found repository version href: {repo_ver_href}") + + # Step 3: Check if tag exists in content + tags_command = ( + f"pulp show --href " + f"'/pulp/api/v3/content/container/tags/" + f"?repository_version={repo_ver_href}'" + ) + tags_result = execute_command(tags_command, logger, type_json=True) + + if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result: + logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.") + return False + + tags_data = tags_result["stdout"] + if not isinstance(tags_data, dict) or "results" not in tags_data: + logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.") + return False + + tags = tags_data["results"] + tag_exists = False + + # Use the tag parameter if provided, otherwise fall back to checking package_content + tag_to_check = tag if tag else package_content + + for tag_item in tags: + if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check: + tag_exists = True + break + + if tag_exists: + logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. No sync needed - image is already available.") + else: + logger.error(f"Sync completed but repository version did not change for {repo_name}. " + f"Version remained at {version_after}. " + f"Tag '{tag_to_check}' does not exist in Pulp repository content. " + f"This indicates the tag likely does not exist in the upstream registry.") + return False + + except Exception as e: + logger.error( + f"Error checking repository tag existence: {e}. Assuming tag doesn't exist." 
+ ) + return False + + logger.info( + f"Sync validation successful: repository {repo_name} version changed " + f"from {version_before} to {version_after}" + ) + result = create_container_distribution(repo_name, package_content, logger) + return result except Exception as e: logger.error(f"Failed to synchronize repository {repo_name} with remote {remote_name}. Error: {e}") return False diff --git a/common/library/module_utils/local_repo/download_common.py b/common/library/module_utils/local_repo/download_common.py index c8d8bd1339..892725b207 100644 --- a/common/library/module_utils/local_repo/download_common.py +++ b/common/library/module_utils/local_repo/download_common.py @@ -35,6 +35,7 @@ from ansible.module_utils.local_repo.common_functions import load_pulp_config from ansible.module_utils.local_repo.config import ( pulp_file_commands, + pulp_rpm_commands, CLI_FILE_PATH, POST_TIMEOUT, ISO_POLL_VAL, @@ -477,7 +478,7 @@ def process_manifest(file,repo_store_path, status_file_path, cluster_os_type, cl manifest_directory = os.path.join(repo_store_path, "offline_repo", "cluster",arc.lower(), cluster_os_type, cluster_os_version, "manifest", package_name) # # Determine the manifest file path file_path = os.path.join(manifest_directory, f"{package_name}.yaml") - repository_name = "manifest" + package_name + repository_name = arc.lower() + "_manifest" + package_name output_file = package_name + ".yml" relative_path = output_file base_path = manifest_directory.strip("/") @@ -531,7 +532,7 @@ def process_git(file,repo_store_path, status_file_path, cluster_os_type, cluster clone_directory = os.path.join(git_modules_directory, package_name) clone_directory = shlex.quote(clone_directory).strip("'\"") tarball_path = os.path.join(git_modules_directory, f'{package_name}.tar.gz') - repository_name = "git" + package_name + repository_name = arc.lower() + "_git" + package_name output_file = package_name + ".tar.gz" relative_path = output_file base_path = 
git_modules_directory.strip("/") @@ -600,7 +601,7 @@ def process_shell(file,repo_store_path, status_file_path, cluster_os_type, clus os.makedirs(sh_directory, exist_ok=True) # Ensure the directory exists sh_path = os.path.join(sh_directory, f"{package_name}.sh") - repository_name = "shell" + package_name + repository_name = arc.lower() + "_shell" + package_name output_file = package_name + ".sh" relative_path = output_file base_path = sh_directory.strip("/") @@ -651,7 +652,7 @@ def process_ansible_galaxy_collection(file, repo_store_path, status_file_path, c galaxy_collections_directory = shlex.quote(galaxy_collections_directory).strip("'\"") os.makedirs(galaxy_collections_directory, exist_ok=True) # Ensure the directory exists collections_tarball_path = os.path.join(galaxy_collections_directory, f'{package_name.replace(".", "-")}-{version}.tar.gz') - repository_name = "ansible_galaxy_collection" + package_name + repository_name = arc.lower() + "_ansible_galaxy_collection" + package_name output_file = f"{file['package'].replace('.', '-')}-{file['version']}.tar.gz" relative_path = output_file base_path = galaxy_collections_directory.strip("/") @@ -758,7 +759,7 @@ def process_tarball(package, repo_store_path, status_file_path, version_variable tarball_path = os.path.join(tarball_directory, f"{package_name}.tar.gz") tarball_path = shlex.quote(tarball_path).strip("'\"") - repository_name = "tarball" + package_name + repository_name = arc.lower() + "_tarball" + package_name output_file = package_name + ".tar.gz" relative_path = output_file base_path = tarball_directory.strip("/") @@ -844,7 +845,7 @@ def process_iso(package, repo_store_path, status_file_path, url_support = True package_name = package['package'] package_type = package['type'] - repository_name = "iso" + package_name + arc + repository_name = arc.lower() + "_iso" + package_name distribution_name = repository_name if 'url' in package: @@ -941,7 +942,7 @@ def process_pip(package, repo_store_path, 
status_file_path, cluster_os_type, cl package_name = shlex.quote(package['package']).strip("'\"") package_type = package['type'] version = package.get('version', None) - pip_repo = "pip_module" + package_name + pip_repo = arc.lower() + "_pip_module" + package_name distribution_name = pip_repo logger.info(f"Processing Pip Package: {package_name}, Version: {version}") @@ -1023,3 +1024,156 @@ def process_pip(package, repo_store_path, status_file_path, cluster_os_type, cl logger.info("#" * 30 + f" {process_pip.__name__} end " + "#" * 30) return status + +def process_rpm_file(package, repo_store_path, status_file_path, cluster_os_type, cluster_os_version, arc, logger): + """ + Process an RPM file package by downloading it and setting up a Pulp RPM repository. + + Args: + package (dict): A dictionary containing the package information. + repo_store_path (str): The path to the repository store. + status_file_path (str): The path to the status file. + cluster_os_type (str): The type of the cluster operating system. + cluster_os_version (str): The version of the cluster operating system. + arc (str): The architecture (x86_64 or aarch64). + logger (logging.Logger): The logger instance. + + Returns: + str: The status of the RPM file package processing. 
+ """ + logger.info("#" * 30 + f" {process_rpm_file.__name__} start " + "#" * 30) + + try: + package_name = package['package'] + url = package.get('url', None) + package_type = package['type'] + repo_name = arc.lower() + "_" + package_name + + if not url: + logger.error(f"No URL provided for RPM file package: {package_name}") + status = "Failed" + write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) + return status + + url = shlex.quote(url).strip("'\"") + logger.info(f"Processing RPM File Package: {package_name}, URL: {url}") + + # Create rpm_file directory structure + rpm_file_directory = os.path.join( + repo_store_path, "offline_repo", "cluster", arc.lower(), + cluster_os_type, cluster_os_version, "rpm_file", package_name + ) + os.makedirs(rpm_file_directory, exist_ok=True) + + # Extract filename from URL + download_file_name = url.split('/')[-1] + rpm_file_path = os.path.join(rpm_file_directory, download_file_name) + + # Step 1: Download the RPM file + logger.info("Step 1: Downloading RPM file...") + if os.path.exists(rpm_file_path): + logger.info(f"RPM file already exists: {rpm_file_path}") + else: + # Verify URL exists + subprocess.run(['wget', '-q', '--spider', '--tries=1', url], check=True) + + # Download the file + download_command = f"wget -O {shlex.quote(rpm_file_path)} {url}" + if not execute_command(download_command, logger): + logger.error(f"Failed to download RPM file from: {url}") + status = "Failed" + write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) + return status + + # Step 2: CREATE A NEW RPM REPOSITORY IN PULP (if it doesn't exist) + logger.info("Step 2: Creating RPM repository in Pulp...") + # Check if repository already exists + if execute_command(pulp_rpm_commands["show_repository"] % repo_name, logger): + logger.info(f"RPM repository {repo_name} already exists. 
Skipping creation.") + else: + logger.info(f"Creating RPM repository: {repo_name}") + if not execute_command(pulp_rpm_commands["create_repository"] % repo_name, logger): + logger.error(f"Failed to create RPM repository: {repo_name}") + status = "Failed" + write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) + return status + + # Step 3: UPLOAD THE RPM INTO THE REPO + logger.info("Step 3: Uploading RPM to repository...") + upload_command = pulp_rpm_commands["upload_content"] % (repo_name, shlex.quote(rpm_file_path)) + if not execute_command(upload_command, logger): + logger.error(f"Failed to upload RPM to repository: {repo_name}") + status = "Failed" + write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) + return status + + # Step 4: PUBLISH THE REPOSITORY + logger.info("Step 4: Publishing repository...") + if not execute_command(pulp_rpm_commands["publish_repository"] % repo_name, logger): + logger.error(f"Failed to publish repository: {repo_name}") + status = "Failed" + write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) + return status + + # Step 5: CREATE A DISTRIBUTION FOR THE REPO (if it doesn't exist) + logger.info("Step 5: Creating distribution...") + + # Check if distribution already exists + if execute_command(pulp_rpm_commands["check_distribution"] % repo_name, logger): + logger.info(f"Distribution {repo_name} already exists. 
Skipping creation.") + else: + logger.info(f"Creating distribution: {repo_name}") + # Get the publication href + pub_result = execute_command(pulp_rpm_commands["list_all_publications"], logger, type_json=True) + if not pub_result or not pub_result.get("stdout"): + logger.error("Failed to get publication list") + status = "Failed" + write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) + return status + + publications = pub_result["stdout"] + if not publications: + logger.error("No publications found") + status = "Failed" + write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) + return status + + latest_publication = publications[0] + publication_href = latest_publication.get("pulp_href") + + if not publication_href: + logger.error("No publication href found") + status = "Failed" + write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) + return status + + base_path = f" opt/omnia/offline_repo/cluster/{arc}/rhel/10.0/rpms/{repo_name}" + dist_create_command = pulp_rpm_commands["distribute_repository"] % (repo_name, base_path, repo_name) + if not execute_command(dist_create_command, logger): + logger.error(f"Failed to create distribution: {repo_name}") + status = "Failed" + write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) + return status + + # Step 6: ENABLE AUTO-GENERATION OF .repo FILES + logger.info("Step 6: Enabling auto-generation of .repo files...") + update_command = pulp_rpm_commands["update_distribution_repo_config"] % repo_name + if not execute_command(update_command, logger): + logger.warning(f"Failed to enable repo config generation for: {repo_name}") + # Not a critical failure, continue + + logger.info(f"RPM file package {package_name} processed successfully!") + status = "Success" + + except subprocess.CalledProcessError as e: + logger.error(f"Error 
executing RPM file commands: {e}") + status = "Failed" + except Exception as e: + logger.error(f"Error processing RPM file package: {e}") + status = "Failed" + + finally: + # Write the status to the file + write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name) + logger.info("#" * 30 + f" {process_rpm_file.__name__} end " + "#" * 30) + return status \ No newline at end of file diff --git a/common/library/module_utils/local_repo/download_image.py b/common/library/module_utils/local_repo/download_image.py index c9b3020a7b..98a1cb5b66 100644 --- a/common/library/module_utils/local_repo/download_image.py +++ b/common/library/module_utils/local_repo/download_image.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -206,22 +206,61 @@ def get_repo_url_and_content(package): ValueError: If the package prefix is not supported. 
""" patterns = { - r"^(ghcr\.io)(/.+)": "https://ghcr.io", - r"^(docker\.io)(/.+)": "https://registry-1.docker.io", - r"^(quay\.io)(/.+)": "https://quay.io", - r"^(registry\.k8s\.io)(/.+)": "https://registry.k8s.io", - r"^(nvcr\.io)(/.+)": "https://nvcr.io", - r"^(public\.ecr\.aws)(/.+)": "https://public.ecr.aws", - r"^(gcr\.io)(/.+)": "https://gcr.io" + r"^(ghcr\.io)(:\d+)?(/.+)": "https://ghcr.io", + r"^(docker\.io)(:\d+)?(/.+)": "https://registry-1.docker.io", + r"^(quay\.io)(:\d+)?(/.+)": "https://quay.io", + r"^(registry\.k8s\.io)(:\d+)?(/.+)": "https://registry.k8s.io", + r"^(nvcr\.io)(:\d+)?(/.+)": "https://nvcr.io", + r"^(public\.ecr\.aws)(:\d+)?(/.+)": "https://public.ecr.aws", + r"^(gcr\.io)(:\d+)?(/.+)": "https://gcr.io", } for pattern, repo_url in patterns.items(): match = re.match(pattern, package) if match: base_url = repo_url - package_content = match.group(2).lstrip("/") # Remove leading slash + + # If user provided a port, preserve it + if match.group(2): + base_url = f"{repo_url}{match.group(2)}" + + package_content = match.group(3).lstrip("/") return base_url, package_content - raise ValueError(f"Unsupported package prefix for package: {package}") + # fallback for private / IP-based registries + match = re.match(r"^(?P<registry>[^/]+)(?P<path>/.*)$", package) + if match: + return f"https://{match.group('registry')}", match.group("path").lstrip("/") + + raise ValueError(f"Invalid package format: {package}") + + +# def get_repo_url_and_content(package): +# """ +# Get the repository URL and content from a given package. +# Parameters: +# package (str): The package to extract the URL and content from. +# Returns: +# tuple: A tuple containing the repository URL and content. +# Raises: +# ValueError: If the package prefix is not supported. 
+# """ +# patterns = { +# r"^(ghcr\.io)(/.+)": "https://ghcr.io", +# r"^(docker\.io)(/.+)": "https://registry-1.docker.io", +# r"^(quay\.io)(/.+)": "https://quay.io", +# r"^(registry\.k8s\.io)(/.+)": "https://registry.k8s.io", +# r"^(nvcr\.io)(/.+)": "https://nvcr.io", +# r"^(public\.ecr\.aws)(/.+)": "https://public.ecr.aws", +# r"^(gcr\.io)(/.+)": "https://gcr.io" +# } +# for pattern, repo_url in patterns.items(): +# match = re.match(pattern, package) +# if match: +# base_url = repo_url +# package_content = match.group(2).lstrip("/") # Remove leading slash +# return base_url, package_content + +# raise ValueError(f"Unsupported package prefix for package: {package}") def process_image(package, status_file_path, version_variables, user_registries,docker_username, docker_password, logger): @@ -245,66 +284,81 @@ def process_image(package, status_file_path, version_variables, base_url, package_content = get_repo_url_and_content(package['package']) package_identifier = None + # Only check user registries for additional_packages + if user_registries and "additional_packages" in status_file_path: + result, package_identifier = handle_user_image_registry( + package, + package_content, + version_variables, + user_registries, + logger + ) - if user_registries: - result, package_identifier = handle_user_image_registry(package, package_content, - version_variables, user_registries, logger) - # If user registry not found or no user registry given, proceed with public registry - if not result: - try: - repo_name_prefix = "container_repo_" - repository_name = f"{repo_name_prefix}{package['package'].replace('/', '_').replace(':', '_')}" - remote_name = f"remote_{package['package'].replace('/', '_')}" - package_identifier = package['package'] - # Create container repository - with repository_creation_lock: - result = create_container_repository(repository_name, logger) + if not result: + logger.info(f"Image {package['package']} will not be synced to Pulp.") + status = "Failed" + 
return status + + else: + logger.info(f"Image {package['package']} synced to Pulp.") + status = "Success" + return status + + try: + repo_name_prefix = "container_repo_" + repository_name = f"{repo_name_prefix}{package['package'].replace('/', '_').replace(':', '_')}" + remote_name = f"remote_{package['package'].replace('/', '_').replace(':', '_')}" + package_identifier = package['package'] + + # Create container repository + with repository_creation_lock: + result = create_container_repository(repository_name, logger) + if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): + raise Exception(f"Failed to create repository: {repository_name}") + + # Process digest or tag + if "digest" in package: + package_identifier += f":{package['digest']}" + result = create_container_remote_digest( + remote_name, base_url, package_content, policy_type, logger + ) if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): - raise Exception(f"Failed to create repository: {repository_name}") - # Process digest or tag - if "digest" in package: - package_identifier += f":{package['digest']}" - result = create_container_remote_digest(remote_name, base_url, - package_content, policy_type, logger) - if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): - raise Exception(f"Failed to create remote digest: {remote_name}") - - elif "tag" in package: - tag_template = Template(package['tag']) - tag_val = tag_template.render(**version_variables) - package_identifier += f":{package['tag']}" - - # Only use auth for docker.io images - if package['package'].startswith('docker.io/'): - - with remote_creation_lock: - if docker_username and docker_password: - result = create_container_remote_with_auth( - remote_name, base_url, package_content, policy_type, - tag_val, logger, docker_username, docker_password - ) - else: - result = create_container_remote( - remote_name, base_url, package_content, policy_type, 
tag_val, logger - ) + raise Exception(f"Failed to create remote digest: {remote_name}") + + elif "tag" in package: + tag_template = Template(package['tag']) + tag_val = tag_template.render(**version_variables) + package_identifier += f":{package['tag']}" + + with remote_creation_lock: + if package['package'].startswith('docker.io/') and docker_username and docker_password: + result = create_container_remote_with_auth( + remote_name, base_url, package_content, policy_type, + tag_val, logger, docker_username, docker_password + ) else: - # For non-docker.io registries, use unauthenticated access - with remote_creation_lock: - result = create_container_remote( - remote_name, base_url, package_content, policy_type, tag_val, logger - ) - - if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): - raise Exception(f"Failed to create remote: {remote_name}") - # Sync and distribute container repository - result = sync_container_repository(repository_name, remote_name, package_content,logger) + result = create_container_remote( + remote_name, base_url, package_content, policy_type, tag_val, logger + ) + if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): - raise Exception(f"Failed to sync repository: {repository_name}") + raise Exception(f"Failed to create remote: {remote_name}") - except Exception as e: - status = "Failed" - logger.error(f"Failed to process image: {package_identifier}. 
Error: {e}") + # Sync and distribute + # Pass tag_val if it exists (for tag-based images), otherwise None (for digest-based images) + tag_to_pass = tag_val if "tag" in package else None + result = sync_container_repository( + repository_name, remote_name, package_content, logger, tag=tag_to_pass + ) + if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): + raise Exception(f"Failed to sync repository: {repository_name}") + + except Exception as e: + status = "Failed" + logger.error(f"Failed to process image: {package_identifier}. Error: {e}") - write_status_to_file(status_file_path, package_identifier, package['type'], status, logger, file_lock) + write_status_to_file( + status_file_path, package_identifier, package['type'], status, logger, file_lock + ) logger.info("#" * 30 + f" {process_image.__name__} end " + "#" * 30) return status diff --git a/common/library/module_utils/local_repo/download_rpm.py b/common/library/module_utils/local_repo/download_rpm.py index 0b7bc2a0e6..44b56c1799 100644 --- a/common/library/module_utils/local_repo/download_rpm.py +++ b/common/library/module_utils/local_repo/download_rpm.py @@ -20,7 +20,8 @@ import shutil from pathlib import Path from ansible.module_utils.local_repo.config import ( - DNF_COMMANDS + DNF_COMMANDS, + DNF_INFO_COMMANDS ) from multiprocessing import Lock from ansible.module_utils.local_repo.parse_and_download import write_status_to_file @@ -49,6 +50,9 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, logger.info("#" * 30 + f" {process_rpm.__name__} start " + "#" * 30) try: + # Get repo_mapping for individual RPM repo names + repo_mapping = package.get("repo_mapping", {}) + if repo_config_value == "always": rpm_list = list(set(package["rpm_list"])) logger.info(f"{package['package']} - List of rpms is {rpm_list}") @@ -90,11 +94,32 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, # Detect successes/failures from combined run 
for pkg in rpm_list: - if any(pkg in line and ".rpm" in line for line in stdout_lines + stderr_lines): + # Get repo_name for this specific RPM from mapping + pkg_repo_name = repo_mapping.get(pkg, "") + # Check if package was downloaded successfully + # Look for "Already downloaded" or actual .rpm file in output + pkg_downloaded = False + for line in stdout_lines + stderr_lines: + if pkg in line and (".rpm" in line or "Already downloaded" in line): + pkg_downloaded = True + break + + # Also check for "No match for argument" or "No package" errors + pkg_not_found = False + for line in stderr_lines: + if pkg in line and ("No match for argument" in line or + "No package" in line or + "not found" in line.lower()): + pkg_not_found = True + break + + if pkg_downloaded and not pkg_not_found: downloaded.append(pkg) - write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock) + write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) else: failed.append(pkg) + if pkg_not_found: + logger.warning(f"Package '{pkg}' not found in configured repositories") # Retry failed ones individually if failed: @@ -102,15 +127,29 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, for pkg in failed[:]: cmd = DNF_COMMANDS[arch_key] + [f'--destdir={rpm_directory}', pkg] retry_res = subprocess.run(cmd, check=False, capture_output=True, text=True) + # Get repo_name for this specific RPM from mapping + pkg_repo_name = repo_mapping.get(pkg, "") + + # Check for package not found errors + retry_stderr = retry_res.stderr.lower() + pkg_invalid = any(err in retry_stderr for err in [ + "no match for argument", + "no package", + "not found", + "unable to find a match" + ]) if retry_res.returncode == 0 and ".rpm" in retry_res.stdout + retry_res.stderr: downloaded.append(pkg) failed.remove(pkg) - write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock) + write_status_to_file(status_file_path, 
pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) logger.info(f"Package '{pkg}' downloaded successfully on retry.") else: - write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock) - logger.error(f"Package '{pkg}' still failed after retry.") + write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock, pkg_repo_name) + if pkg_invalid: + logger.error(f"Package '{pkg}' does not exist in configured repositories.") + else: + logger.error(f"Package '{pkg}' still failed after retry.") # Determine final status if not failed: @@ -121,16 +160,67 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, status = "Failed" else: - status = "Success" logger.info("RPM won't be downloaded when repo_config is partial or never") + logger.info("Validating package availability using dnf info...") + + arch_key = "x86_64" if arc.lower() in ("x86_64") else "aarch64" + valid_packages = [] + invalid_packages = [] + for pkg in package["rpm_list"]: - write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock) + # Validate package using dnf info + dnf_info_command = DNF_INFO_COMMANDS[arch_key] + [ + "--repo=*", # Search all enabled repositories + pkg + ] + result = subprocess.run( + dnf_info_command, + check=False, + capture_output=True, + text=True + ) + # Get repo_name for this specific RPM from mapping + pkg_repo_name = repo_mapping.get(pkg, "") + if result.returncode == 0: + # Package exists and is available + valid_packages.append(pkg) + write_status_to_file( + status_file_path, pkg, "rpm", "Success", + logger, file_lock, pkg_repo_name + ) + logger.info(f"Package '{pkg}' validated successfully") + else: + # Package not found or invalid + invalid_packages.append(pkg) + write_status_to_file( + status_file_path, pkg, "rpm", "Failed", + logger, file_lock, pkg_repo_name + ) + logger.error( + f"Package '{pkg}' validation failed. " + f"Package may not exist in configured repositories." 
+ ) + + # Determine final status based on validation results + if not invalid_packages: + status = "Success" + elif valid_packages: + status = "Partial" + else: + status = "Failed" + + logger.info( + f"Validation complete - Valid: {len(valid_packages)}, " + f"Invalid: {len(invalid_packages)}" + ) except Exception as e: logger.error(f"Exception occurred: {e}") status = "Failed" for pkg in package.get("rpm_list", []): - write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock) + # Get repo_name for this specific RPM from mapping + pkg_repo_name = repo_mapping.get(pkg, "") + write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock, pkg_repo_name) finally: logger.info(f"Overall status for {package['package']}: {status}") diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 8874621f0c..d5192e2bbe 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. # pylint: disable=import-error,no-name-in-module +""" +Utility functions for parsing and downloading artifacts. + +This module provides common functions for command execution, status file management, +and repository operations used across the local repo management system. 
+""" + import os import subprocess import json import re from multiprocessing import Lock -from ansible.module_utils.local_repo.standard_logger import setup_standard_logger +from ansible.module_utils.local_repo.config import ARCH_SUFFIXES, STATUS_CSV_HEADER def mask_sensitive_data(cmd_string): @@ -57,7 +64,6 @@ def execute_command(cmd_string, logger, type_json=False): stderr=subprocess.PIPE, shell=True, ) - status["returncode"] = cmd.returncode status["stdout"] = cmd.stdout.strip() if cmd.stdout else None status["stderr"] = cmd.stderr.strip() if cmd.stderr else None @@ -67,53 +73,159 @@ def execute_command(cmd_string, logger, type_json=False): logger.error(f"Error: {status['stderr']}") return False - if type_json and status["stdout"]: + if type_json: + if not status["stdout"]: + logger.error("Command succeeded but returned empty output when JSON was expected") + return False try: status["stdout"] = json.loads(status["stdout"]) except json.JSONDecodeError as error: logger.error(f"Failed to parse JSON output: {error}") + logger.error(f"Raw output was: {status['stdout']}") return False + logger.info(f"Command succeeded: {safe_cmd_string}") return status - - except Exception as error: - logger.error(f"Error executing command: {error}") + except subprocess.CalledProcessError as e: + logger.error(f"Command failed: {safe_cmd_string} - {e}") + return False + except subprocess.TimeoutExpired as e: + logger.error(f"Command timed out: {safe_cmd_string} - {e}") + return False + except OSError as e: + logger.error(f"OS error during command: {safe_cmd_string} - {e}") return False finally: logger.info("#" * 30 + f" {execute_command.__name__} end " + "#" * 30) -def write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock: Lock): +def get_arch_from_status_path(status_file_path): + """Extract architecture from status file path. 
+ + Args: + status_file_path: Path like '/opt/omnia/log/local_repo/x86_64/software_name/status.csv' + + Returns: + str: Architecture ('x86_64' or 'aarch64') or None if not found + """ + for arch in ARCH_SUFFIXES: + if f"/{arch}/" in status_file_path: + return arch + return None + +def _prefix_repo_name_with_arch(repo_name: str, status_file_path: str, logger) -> str: + """Add architecture prefix to repo_name if not already present. + + Args: + repo_name: Repository name to prefix + status_file_path: Path to extract architecture from + logger: Logger instance + + Returns: + str: Repository name with architecture prefix """ - Writes or updates the status of a package in the status file, using a lock to ensure safe access across processes. + if not repo_name: + return repo_name + + arch = get_arch_from_status_path(status_file_path) + if arch and not any(repo_name.startswith(f"{prefix}_") for prefix in ARCH_SUFFIXES): + prefixed_name = f"{arch}_{repo_name}" + logger.info(f"Auto-prefixed repo_name with architecture: {prefixed_name}") + return prefixed_name + return repo_name + + +def _update_existing_line(line: str, package_name: str, package_type: str, status: str, repo_name: str, status_file_path: str) -> str: + """Update an existing line in status file. 
+ + Args: + line: Existing line content + package_name: Package name to match + package_type: Package type + status: New status + repo_name: Repository name + status_file_path: Path for architecture extraction + + Returns: + str: Updated line content + """ + parts = line.strip().split(',') + if len(parts) >= 4: + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + parts[2] = final_repo_name if final_repo_name else '' + parts[3] = status + return ','.join(parts) + '\n' + + # Handle short lines + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + return f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n" + + +def write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock: Lock, repo_name=None): + """ + Writes or updates the status of a package in the status file. + + Args: + status_file_path: Path to the status file + package_name: Name of the package + package_type: Type of the package (rpm, image, etc.) + status: Status (Success, Failed, etc.) 
+ logger: Logger instance + file_lock: Lock for thread safety + repo_name: Optional repository name (for RPMs) """ logger.info("#" * 30 + f" {write_status_to_file.__name__} start " + "#" * 30) + # Auto-prefix repo_name with architecture if needed + repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, logger) + try: with file_lock: # Ensure only one process can write at a time if os.path.exists(status_file_path): - with open(status_file_path, "r") as f: - lines = f.readlines() - - updated = False - with open(status_file_path, "w") as f: - for line in lines: - if line.startswith(f"{package_name},"): - f.write(f"{package_name},{package_type},{status}\n") - updated = True - else: - f.write(line) - - if not updated: - f.write(f"{package_name},{package_type},{status}\n") + _update_existing_file(status_file_path, package_name, package_type, status, repo_name) else: - with open(status_file_path, "w") as f: - f.write("name,type,status\n") - f.write(f"{package_name},{package_type},{status}\n") + _create_new_file(status_file_path, package_name, package_type, status, repo_name) logger.info(f"Status written to {status_file_path} for {package_name}.") - except Exception as e: + except OSError as e: logger.error(f"Failed to write to status file: {status_file_path}. Error: {str(e)}") - raise RuntimeError(f"Failed to write to status file: {status_file_path}. Error: {str(e)}") + raise RuntimeError( + f"Failed to write to status file: {status_file_path}. 
Error: {str(e)}" + ) from e finally: logger.info("#" * 30 + f" {write_status_to_file.__name__} end " + "#" * 30) + + +def _update_existing_file(status_file_path, package_name, package_type, status, repo_name): + """Update existing status file with new package status.""" + with open(status_file_path, "r", encoding='utf-8') as f: + lines = f.readlines() + + updated = False + with open(status_file_path, "w", encoding='utf-8') as f: + # Write header + if lines: + f.write(lines[0]) + + # Write data lines + for line in lines[1:]: # Skip header + if line.startswith(f"{package_name},"): + updated_line = _update_existing_line( + line, package_name, package_type, status, repo_name, status_file_path + ) + f.write(updated_line) + updated = True + else: + f.write(line) + + if not updated: + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + f.write(f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n") + + +def _create_new_file(status_file_path, package_name, package_type, status, repo_name): + """Create new status file with package status.""" + with open(status_file_path, "w", encoding='utf-8') as f: + f.write(STATUS_CSV_HEADER) + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + f.write(f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n") diff --git a/common/library/module_utils/local_repo/process_parallel.py b/common/library/module_utils/local_repo/process_parallel.py index b1c0f0b91b..2c55098c98 100644 --- a/common/library/module_utils/local_repo/process_parallel.py +++ b/common/library/module_utils/local_repo/process_parallel.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -34,8 +34,8 @@ from ansible.module_utils.local_repo.config import ( OMNIA_CREDENTIALS_YAML_PATH, OMNIA_CREDENTIALS_VAULT_PATH, - USER_REG_CRED_INPUT, - USER_REG_KEY_PATH + # USER_REG_CRED_INPUT, + # USER_REG_KEY_PATH ) # Global lock for logging synchronization log_lock = multiprocessing.Lock() @@ -96,7 +96,7 @@ def load_docker_credentials(vault_yml_path, vault_password_file): if response.status_code == 200: return docker_username, docker_password - + if response.status_code == 429: raise RuntimeError("Docker Hub rate limit exceeded. Please try again later.") @@ -201,6 +201,13 @@ def execute_task(task, determine_function, user_data, version_variables, arc, with log_lock: logger.info(f"### {execute_task.__name__} start ###") # Log task start + # Build package display name with tag for images + package_display = task.get("package", "") + if task.get("type") == "image" and "tag" in task: + package_display = f"{package_display}:{task['tag']}" + elif task.get("type") == "image" and "digest" in task: + package_display = f"{package_display}:{task['digest']}" + # Determine the function and its arguments using the provided `determine_function` function, args = determine_function(task, repo_store_path, csv_file_path, user_data, version_variables, arc, user_registries, docker_username, docker_password) @@ -217,7 +224,7 @@ def execute_task(task, determine_function, user_data, version_variables, arc, ) return { "task": task, - "package": task.get("package", ""), # Extract package name if available + "package": package_display, "status": "TIMEOUT", "output": "", "error": f"Timeout reached after {elapsed_time:.2f}s" @@ -240,7 +247,7 @@ def execute_task(task, determine_function, user_data, version_variables, arc, return { "task": task, - "package": task.get("package", ""), + "package": package_display, "status": result.upper(), "output": result, "error": "" @@ -251,12 +258,11 @@ def execute_task(task, determine_function, user_data, version_variables, arc, logger.error(f"Task 
failed: {str(e)}") return { "task": task, - "package": task.get("package", ""), + "package": package_display, "status": "FAILED", "output": "", "error": str(e) # Include the error message } - def worker_process(task, determine_function, user_data, version_variables, arc, repo_store_path, csv_file_path, log_dir, result_queue, user_registries, docker_username, docker_password, timeout): @@ -321,8 +327,8 @@ def execute_parallel( arc, standard_logger, local_repo_config_path, - user_reg_cred_input, - user_reg_key_path, + # user_reg_cred_input, + # user_reg_key_path, omnia_credentials_yaml_path, omnia_credentials_vault_path, timeout @@ -355,22 +361,22 @@ def execute_parallel( config = load_yaml_file(local_repo_config_path) user_registries = config.get("user_registry", []) - if user_registries: - if is_encrypted(user_reg_cred_input): - process_file(user_reg_cred_input, user_reg_key_path, 'decrypt') - - file2_data = load_yaml_file(user_reg_cred_input) - cred_lookup = { - entry['name']: entry - for entry in file2_data.get('user_registry_credential', []) - } - # Update user_registry entries with credentials if required - for registry in user_registries: - if registry.get("requires_auth"): - creds = cred_lookup.get(registry.get("name")) - if creds: - registry["username"] = creds.get("username") - registry["password"] = creds.get("password") + # if user_registries: + # if is_encrypted(user_reg_cred_input): + # process_file(user_reg_cred_input, user_reg_key_path, 'decrypt') + + # file2_data = load_yaml_file(user_reg_cred_input) + # cred_lookup = { + # entry['name']: entry + # for entry in file2_data.get('user_registry_credential', []) + # } + # # Update user_registry entries with credentials if required + # for registry in user_registries: + # if registry.get("requires_auth"): + # creds = cred_lookup.get(registry.get("name")) + # if creds: + # registry["username"] = creds.get("username") + # registry["password"] = creds.get("password") try: diff --git 
a/common/library/module_utils/local_repo/registry_utils.py b/common/library/module_utils/local_repo/registry_utils.py index 6974d963cb..2e7da2f659 100644 --- a/common/library/module_utils/local_repo/registry_utils.py +++ b/common/library/module_utils/local_repo/registry_utils.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,9 +13,29 @@ # limitations under the License. # pylint: disable=import-error,no-name-in-module import requests +import socket +import ssl from requests.auth import HTTPBasicAuth from ansible.module_utils.local_repo.common_functions import is_file_exists +def is_https(host, timeout=1): + ip, port = host.rsplit(":", 1) + port = int(port) + + # Don't verify server cert; just see if TLS works + context = ssl.create_default_context() + context.check_hostname = False + context.verify_mode = ssl.CERT_NONE + + try: + with socket.create_connection((ip, port), timeout=timeout) as sock: + with context.wrap_socket(sock, server_hostname=ip): + return True + except ssl.SSLError: + return False + except Exception: + return False + def validate_user_registry(user_registry): """ Validates a list of user registry entries with connectivity and credential check. 
@@ -34,64 +54,92 @@ def validate_user_registry(user_registry): host = item.get('host') if not host: return False, f"Missing or empty 'host' in entry at index {idx}: {item}" + https = is_https(host) - requires_auth = item.get('requires_auth', False) - - # Check basic username/password presence - if requires_auth: - if not item.get('username') or not item.get('password'): - return False, ( - f"'requires_auth' is true but 'username' or 'password' is missing or empty " - f"in entry for (host: {host})" - ) - - cert_path = item.get('cert_path') - key_path = item.get('key_path') - - if bool(cert_path) != bool(key_path): - return False, ( - f"If authentication is enabled, both 'cert_path' and 'key_path' must be present " - f"or both omitted in entry for (host: {host})" - ) - try: - url = f"https://{host}/api/v2.0/users/current" - response = requests.get( - url, - auth=HTTPBasicAuth(item['username'], item['password']), - verify=True # Set to True if using valid SSL certs - ) - - if response.status_code == 401: - return False, f"Invalid credentials for host: {host}" - elif response.status_code != 200: - return False, f"Unexpected status {response.status_code} while validating host: {host}" - - except requests.exceptions.RequestException as e: - return False, f"Failed to connect to {host}: {str(e)}" + cert_path = (item.get("cert_path") or "").strip() + key_path = (item.get("key_path") or "").strip() + + if https and (not cert_path or not key_path): + return False, f"{host} is an HTTPS registry and requires cert_path and key_path. 
Please provide cert_path and key_path in local_repo_config.yml under user_registry section" return True, "" -def check_reachability(user_registry, timeout): + # requires_auth = item.get('requires_auth', False) + + # # Check basic username/password presence + # if requires_auth: + # if not item.get('username') or not item.get('password'): + # return False, ( + # f"'requires_auth' is true but 'username' or 'password' is missing or empty " + # f"in entry for (host: {host})" + # ) + + # cert_path = item.get('cert_path') + # key_path = item.get('key_path') + + # if bool(cert_path) != bool(key_path): + # return False, ( + # f"If authentication is enabled, both 'cert_path' and 'key_path' must be present " + # f"or both omitted in entry for (host: {host})" + # ) + # try: + # url = f"https://{host}/api/v2.0/users/current" + # response = requests.get( + # url, + # auth=HTTPBasicAuth(item['username'], item['password']), + # verify=True # Set to True if using valid SSL certs + # ) + + # if response.status_code == 401: + # return False, f"Invalid credentials for host: {host}" + # elif response.status_code != 200: + # return False, f"Unexpected status {response.status_code} while validating host: {host}" + + # except requests.exceptions.RequestException as e: + # return False, f"Failed to connect to {host}: {str(e)}" + + # return True, "" + +def tcp_ping(host, timeout=1): """ - Checks the reachability of hosts in the user registry. - + Check if a host:port is reachable via TCP. + Args: - user_registry (list): A list of dictionaries representing user registry entries. - timeout (int): The maximum number of seconds to wait for a response. 
- + host (str): User registry host with port + timeout (int): Timeout in seconds + Returns: + bool: True if reachable, False otherwise + """ + try: + if ":" in host: + hostname, port = host.split(":") + port = int(port) + else: + hostname = host + port = 443 + + with socket.create_connection((hostname, port), timeout=timeout): + return True + except Exception: + return False + +def check_reachability(user_registry, timeout=1): + """ + Check reachability of hosts in a user registry. + + Args: + user_registry (list): List of dicts, each with a 'host' key + timeout (int): TCP connection timeout in seconds Returns: - tuple: A tuple containing two lists: reachable hosts and unreachable hosts. + tuple: (reachable_hosts, unreachable_hosts) """ reachable, unreachable = [], [] for item in user_registry: - try: - resp = requests.get(f"https://{item['host']}", timeout=timeout, verify=True) - if resp.status_code == 200: - reachable.append(item['host']) - else: - unreachable.append(item['host']) - except Exception: - unreachable.append(item['host']) + host = item['host'] + if tcp_ping(host, timeout): + reachable.append(host) + else: + unreachable.append(host) return reachable, unreachable def find_invalid_cert_paths(user_registry): diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index e64479209b..3e06ddc7cd 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -26,6 +26,7 @@ import requests from ansible.module_utils.local_repo.standard_logger import setup_standard_logger from ansible.module_utils.local_repo.common_functions import is_encrypted, process_file, get_arch_from_sw_config +from ansible.module_utils.local_repo.parse_and_download import execute_command # Import default variables from config.py from ansible.module_utils.local_repo.config import ( PACKAGE_TYPES, @@ -37,7 +38,8 @@ SOFTWARES_KEY, REPO_CONFIG, ARCH_SUFFIXES, - ADDITIONAL_REPOS_KEY + ADDITIONAL_REPOS_KEY, + pulp_container_commands ) @@ -174,21 +176,32 @@ def transform_package_dict(data, arch_val,logger): for sw_name, items in data.items(): transformed_items = [] rpm_packages = [] + repo_mapping = {} for item in items: - if item.get("type") == "rpm": + if item.get("type") in ("rpm", "rpm_repo"): rpm_packages.append(item["package"]) + # Preserve repo_name if available + if "repo_name" in item: + repo_mapping[item["package"]] = item["repo_name"] elif item.get("type") == "rpm_list": rpm_packages.extend(item["package_list"]) + # Preserve repo_mapping if available + if "repo_mapping" in item: + repo_mapping.update(item["repo_mapping"]) else: transformed_items.append(item) if rpm_packages: - transformed_items.append({ + rpm_task = { "package": RPM_LABEL_TEMPLATE.format(key=sw_name), "rpm_list": rpm_packages, "type": "rpm" - }) + } + # Add repo_mapping if we have any + if repo_mapping: + rpm_task["repo_mapping"] = repo_mapping + transformed_items.append(rpm_task) result[arch_val][sw_name] = transformed_items logger.info(f"Finished processing %s. 
Result: %s", sw_name, transformed_items) @@ -228,7 +241,7 @@ def parse_repo_urls(repo_config, local_repo_config_path, logger.info(f"Processing repository URLs for architectures: {archs_to_process}") for arch in archs_to_process: - + # Always ensure these are lists rhel_repo_entry[arch] = list(local_yaml.get(f"rhel_os_url_{arch}") or []) repo_entries[arch] = list(local_yaml.get(f"omnia_repo_url_rhel_{arch}") or []) @@ -336,8 +349,8 @@ def parse_repo_urls(repo_config, local_repo_config_path, seen_urls = set() for arch, entries in repo_entries.items(): if not entries: - logger.info(f"No OMNIA repository entries found for {arch}") - continue + logger.info(f"No OMNIA repository entries found for {arch}") + continue for repo in entries: name = repo.get("name", "unknown") @@ -453,7 +466,7 @@ def get_subgroup_dict(user_data,logger): for item in user_data.get(software_name, [])] subgroup_dict[software_name] = subgroups if isinstance( user_data.get(software_name), list) else [sw['name']] - + logger.info("Completed get_subgroup_dict(). Found %d software entries.", len(software_names)) logger.info("Final subgroup_dict: %s", subgroup_dict) @@ -477,17 +490,17 @@ def get_csv_software(file_name): """ csv_software = [] - + if not os.path.isfile(file_name): return csv_software - + with open(file_name, mode='r') as csv_file: reader = csv.DictReader(csv_file) csv_software = [row.get(CSV_COLUMNS["column1"], "").strip() for row in reader] return csv_software - + def get_failed_software(file_path): """ @@ -513,6 +526,81 @@ def get_failed_software(file_path): ] return failed_software +def check_additional_image_in_pulp(image_entry, logger): + """ + Checks if image present in additional_packages.json is configured in Pulp. 
+ """ + image_name = image_entry.get("package") + image_tag = image_entry.get("tag", None) + image_digest = image_entry.get("digest", None) + + logger.info("Checking if %s is present in Pulp", image_name) + + dist_name_prefix = "container_repo_" + transformed_dist_name = (f"{dist_name_prefix}{image_name.replace('/', '_').replace(':', '_')}") + + repo_href_result = None + latest_version_href_result = None + tags_output_result = None + + show_dist_cmd = (pulp_container_commands["container_distribution_show"] % transformed_dist_name) + repo_href_result = execute_command(show_dist_cmd, logger) + logger.info("repo_href_result: %s", repo_href_result) + + if repo_href_result.get("stderr") and "Error:" in repo_href_result.get("stderr", ""): + logger.info("Distribution %s not found in Pulp", transformed_dist_name) + return { + "type": "image", + "package": image_name, + "tag": image_tag, + } + else: + logger.info("Distribution %s found in Pulp", transformed_dist_name) + repo_href = repo_href_result["stdout"] + show_repo_cmd = (pulp_container_commands["show_repository_version"] % repo_href) + latest_version_href_result = execute_command(show_repo_cmd, logger) + logger.info("latest_version_href_result: %s", latest_version_href_result) + if latest_version_href_result.get("stderr") and "Error:" in latest_version_href_result.get("stderr", ""): + logger.info("No repository version found. 
Empty repository") + return { + "type": "image", + "package": image_name, + "tag": image_tag, + } + else: + logger.info("Repository version found in Pulp") + latest_version_href = latest_version_href_result["stdout"] + show_tags_cmd = (pulp_container_commands["list_image_tags"] % latest_version_href) + tags_output_result = execute_command(show_tags_cmd, logger, type_json=True) + logger.info("tags_output_result: %s", tags_output_result) + if tags_output_result.get("stderr") and "Error:" in tags_output_result.get("stderr", ""): + logger.info("No tags found for %s", image_name) + return { + "type": "image", + "package": image_name, + "tag": image_tag, + } + else: + logger.info("Tags found for %s", image_name) + tag_names = [tag["name"] for tag in tags_output_result.get("stdout", {}).get("results", [])] + logger.info("tag_names: %s", tag_names) + if image_tag and image_tag not in tag_names: + logger.info("Tag %s not found for image %s in Pulp", image_tag, image_name) + return { + "type": "image", + "package": image_name, + "tag": image_tag, + } + elif image_digest and image_digest not in tag_names: + logger.info("Digest %s not found for image %s in Pulp", image_digest, image_name) + return { + "type": "image", + "package": image_name, + "tag": image_digest, + } + else: + logger.info("No download required as image is already present in Pulp") + return {} def parse_json_data(file_path, package_types,logger, failed_list=None, subgroup_list=None): """ @@ -538,10 +626,25 @@ def parse_json_data(file_path, package_types,logger, failed_list=None, subgroup_ filtered_list = [] + # Check if file name is additional_packages.json + is_additional_packages = file_path.endswith("additional_packages.json") + logger.info("additional_packages present: %s", is_additional_packages) + for key, package in data.items(): if subgroup_list is None or key in subgroup_list: for value in package.values(): for item in value: + # For every image, check if it is present in Pulp + if 
is_additional_packages and item.get("type") == "image": + logger.info("Calling function to check %s existence in Pulp", item) + tag_missing_entry = check_additional_image_in_pulp(item, logger) + logger.info("tag_missing_entry: %s", tag_missing_entry) + if tag_missing_entry == {}: + continue + if tag_missing_entry: + filtered_list.append(tag_missing_entry) + continue + # Get package name pkg_name = item.get("package") @@ -610,7 +713,6 @@ def get_new_packages_not_in_status(json_path, csv_path, subgroup_list,logger): raise names = [row['name'] for row in status_csv_content] - # Read all packages from JSON try: all_packages = parse_json_data(json_path, PACKAGE_TYPES, logger,None, subgroup_list) @@ -618,18 +720,23 @@ def get_new_packages_not_in_status(json_path, csv_path, subgroup_list,logger): except Exception as e: logger.error("Failed to parse JSON file '%s': %s", json_path, e) raise - - for pkg in all_packages: + for pkg in all_packages: if pkg["type"] == "image": - pkg_prefix = pkg.get("package", "").strip() - prefix_found = any(name.startswith(f"{pkg_prefix}:") for name in names) - if not prefix_found: - new_packages.append(pkg) + # Check exact package:tag or package:digest combination + pkg_base = pkg.get("package", "").strip() + pkg_identifier = pkg_base + + if "tag" in pkg: + pkg_identifier += f":{pkg['tag']}" + elif "digest" in pkg: + pkg_identifier += f":{pkg['digest']}" + + if pkg_identifier not in names: + new_packages.append(pkg) else: if pkg.get("package") not in names: new_packages.append(pkg) - logger.info("New packages list: %s", new_packages) logger.info("Finished get_new_packages_not_in_status()") @@ -656,7 +763,7 @@ def process_software(software, fresh_installation, json_path, csv_path, subgroup failed_packages = None logger.info("Fresh installation detected — skipping failed package check.") else: - try: + try: failed_packages = None if fresh_installation else get_failed_software(csv_path) logger.info("Failed packages: %s", failed_packages) except 
Exception as e: @@ -674,7 +781,7 @@ def process_software(software, fresh_installation, json_path, csv_path, subgroup raise else: logger.info("No failed RPM packages found for: %s", software) - + # Parse main JSON data try: combined = parse_json_data( @@ -706,7 +813,7 @@ def get_software_names_and_arch(json_data, arch): sw_arch = sw_arch_dict[sw["name"]] if arch in sw_arch: result.append(sw["name"]) - + return result def remove_duplicates_from_trans(trans): @@ -725,7 +832,7 @@ def remove_duplicates_from_trans(trans): if group == "default_packages": # Handle nested rpm_list case for pkg in items: - if pkg.get("type") == "rpm" and "rpm_list" in pkg: + if pkg.get("type") in ("rpm", "rpm_repo") and "rpm_list" in pkg: pkg["rpm_list"] = list(dict.fromkeys(pkg["rpm_list"])) continue @@ -736,7 +843,9 @@ def remove_duplicates_from_trans(trans): type_ = item.get("type") if type_ == "image": - key = (item.get("package"), item.get("tag")) + # Use digest if present, otherwise use tag + identifier = item.get("digest") or item.get("tag") + key = (item.get("package"), identifier) elif type_ == "pip_module": key = item.get("package") @@ -747,7 +856,7 @@ def remove_duplicates_from_trans(trans): elif type_ == "git": key = (item.get("url"), item.get("version")) - elif type_ == "rpm" and "rpm_list" in item: + elif type_ in ("rpm", "rpm_repo") and "rpm_list" in item: item["rpm_list"] = list(dict.fromkeys(item["rpm_list"])) key = item.get("package") diff --git a/common/library/module_utils/local_repo/user_image_utility.py b/common/library/module_utils/local_repo/user_image_utility.py index 5c818c2f75..e97e9411dd 100644 --- a/common/library/module_utils/local_repo/user_image_utility.py +++ b/common/library/module_utils/local_repo/user_image_utility.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -44,30 +44,39 @@ def check_image_in_registry( Check if a container image exists in a user registry using Docker Registry HTTP API v2. Args: - host (str): Registry hostname. - image (str): Image name (e.g., library/nginx). - tag (str): Image tag (e.g., 1.25.2-alpine). - cacert (str, optional): Path to the CA certificate file. - key (str, optional): Path to the client key file. - username (str, optional): Registry username. - password (str, optional): Registry password. - logger (logging.Logger): Logger instance. + host (str): Registry hostname (without protocol) + image (str): Image name + tag (str): Image tag + cacert (str, optional): Path to the CA certificate file for TLS authentication + key (str, optional): Path to the client key file for TLS authentication + username (str, optional): Registry username for basic authentication + password (str, optional): Registry password for basic authentication + logger (logging.Logger, optional): Logger instance for logging messages Returns: - bool: True if image exists, False otherwise. 
+ bool: True if image exists, False otherwise """ - image_url = f"https://{host}/v2/{image}/manifests/{tag}" + + if not host.startswith(("http://", "https://")): + protocol = "https" if (cacert and key) else "http" + host = f"{protocol}://{host}" + image_url = f"{host}/v2/{image}/manifests/{tag}" logger.info(f"Checking image existence at: {image_url}") try: request_args = { - "verify": False, # Consider using 'verify=cacert' if using trusted certs "timeout": 10, + "verify": False, + "headers": { + "Accept": ( + "application/vnd.oci.image.manifest.v1+json," + "application/vnd.oci.image.index.v1+json," + "application/vnd.docker.distribution.manifest.v2+json," + "application/vnd.docker.distribution.manifest.list.v2+json" + ) + }, } - if username and password: - request_args["auth"] = HTTPBasicAuth(username, password) - if cacert and key: request_args["cert"] = (cacert, key) @@ -77,10 +86,21 @@ def check_image_in_registry( logger.info(f"Image '{image}:{tag}' exists in registry '{host}'") return True - logger.warning( - f"Image not found (HTTP {response.status_code}) in registry '{host}'" + if response.status_code == 404: + logger.info( + f"Image '{image}:{tag}' does not exist in registry '{host}'" + ) + return False + + logger.error( + f"Unexpected HTTP {response.status_code} while checking image " + f"'{image}:{tag}' in registry '{host}'" ) + except requests.exceptions.SSLError as e: + logger.error( + f"TLS error while connecting to registry '{host}': {e}" + ) except requests.RequestException as e: logger.exception(f"Network error while checking image: {e}") except Exception as e: @@ -115,15 +135,38 @@ def create_user_remote_container( bool or dict: True on success, False on failure, or a dict with command result. 
""" try: - ca_cert = f"@{cacert}" - client_key = f"@{key}" - if tag_val is None: remote_exists = execute_command( pulp_container_commands["show_container_remote"] % remote_name, logger ) if not remote_exists: - command = pulp_container_commands["create_user_remote_digest"] % ( + if cacert and key: + ca_cert = f"@{cacert}" + client_key = f"@{key}" + command = pulp_container_commands["create_user_remote_digest"] % ( + remote_name, + base_url, + package_content, + policy_type, + ca_cert, + client_key, + ) + else: + command = pulp_container_commands["create_container_remote_for_digest"] % ( + remote_name, + base_url, + package_content, + policy_type, + ) + result = execute_command(command, logger) + logger.info(f"Remote created successfully: {remote_name}") + return result + + logger.info(f"Remote {remote_name} already exists.") + if cacert and key: + ca_cert = f"@{cacert}" + client_key = f"@{key}" + command = pulp_container_commands["update_user_remote_digest"] % ( remote_name, base_url, package_content, @@ -131,19 +174,13 @@ def create_user_remote_container( ca_cert, client_key, ) - result = execute_command(command, logger) - logger.info(f"Remote created successfully: {remote_name}") - return result - - logger.info(f"Remote {remote_name} already exists.") - command = pulp_container_commands["update_user_remote_digest"] % ( - remote_name, - base_url, - package_content, - policy_type, - ca_cert, - client_key, - ) + else: + command = pulp_container_commands["update_remote_for_digest"] % ( + remote_name, + base_url, + package_content, + policy_type, + ) result = execute_command(command, logger) logger.info(f"Remote updated successfully: {remote_name}") return result @@ -154,15 +191,26 @@ def create_user_remote_container( ) if not remote_exists: - command = pulp_container_commands["create_user_remote_tag"] % ( - remote_name, - base_url, - package_content, - policy_type, - tag_val, - ca_cert, - client_key, - ) + if cacert and key: + ca_cert = f"@{cacert}" + client_key = 
f"@{key}" + command = pulp_container_commands["create_user_remote_tag"] % ( + remote_name, + base_url, + package_content, + policy_type, + tag_val, + ca_cert, + client_key, + ) + else: + command = pulp_container_commands["create_container_remote"] % ( + remote_name, + base_url, + package_content, + policy_type, + tag_val, + ) result = execute_command(command, logger) if result: logger.info(f"Remote '{remote_name}' created successfully.") @@ -183,15 +231,26 @@ def create_user_remote_container( new_tags = existing_tags + [tag_val] tags_json = json.dumps(new_tags) - update_command = pulp_container_commands["update_user_remote_tag"] % ( - remote_name, - base_url, - package_content, - policy_type, - tags_json, - ca_cert, - client_key, - ) + if cacert and key: + ca_cert = f"@{cacert}" + client_key = f"@{key}" + update_command = pulp_container_commands["update_user_remote_tag"] % ( + remote_name, + base_url, + package_content, + policy_type, + tags_json, + ca_cert, + client_key, + ) + else: + update_command = pulp_container_commands["update_container_remote"] % ( + remote_name, + base_url, + package_content, + policy_type, + tags_json, + ) result = execute_command(update_command, logger) if result: @@ -234,10 +293,13 @@ def process_user_registry( repository_name = ( f"{user_reg_prefix}{package['package'].replace('/', '_').replace(':', '_')}" ) - remote_name = f"user_remote_{package['package'].replace('/', '_')}" + remote_name = f"user_remote_{package['package'].replace('/', '_').replace(':', '_')}" package_identifier = package["package"] policy_type = "immediate" - base_url = f"https://{host}/" + if not host.startswith(("http://", "https://")): + protocol = "https" if (cacert and key) else "http" + host = f"{protocol}://{host}" + base_url = f"{host}/" logger.info("Creating user container repository") with repository_creation_lock: @@ -314,8 +376,8 @@ def handle_user_image_registry(package, package_content, version_variables, user host = registry.get("host") cacert = 
registry.get("cert_path") key = registry.get("key_path") - username = registry.get("username") - password = registry.get("password") + # username = registry.get("username") + # password = registry.get("password") logger.info(f"Checking image {image_name}:{tag_val} in registry {host}") image_found = check_image_in_registry( @@ -324,8 +386,8 @@ def handle_user_image_registry(package, package_content, version_variables, user tag=tag_val, cacert=cacert, key=key, - username=username, - password=password, + username=None, + password=None, logger=logger ) @@ -333,6 +395,11 @@ def handle_user_image_registry(package, package_content, version_variables, user logger.info(f"Image '{image_name}:{tag_val}' found in registry '{host}'") result, package_info = process_user_registry(package, host, package_content, version_variables, cacert, key, logger) break + else: + logger.info(f"Image '{image_name}:{tag_val}' not found in registry '{host}', checking next registry...") + else: + logger.info(f"Image '{image_name}:{tag_val}' not found in any user registry") + result = False except Exception as e: logger.error(f"Exception in {handle_user_image_registry.__name__}: {e}") @@ -340,3 +407,4 @@ def handle_user_image_registry(package, package_content, version_variables, user logger.info("#" * 30 + f" {handle_user_image_registry.__name__} end " + "#" * 30) return result, package_info + diff --git a/common/library/modules/check_user_registry.py b/common/library/modules/check_user_registry.py index 8f59c93f68..c2995f17fb 100644 --- a/common/library/modules/check_user_registry.py +++ b/common/library/modules/check_user_registry.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -27,60 +27,60 @@ check_reachability, find_invalid_cert_paths ) -from ansible.module_utils.local_repo.config import ( - USER_REG_CRED_INPUT, - USER_REG_KEY_PATH -) +# from ansible.module_utils.local_repo.config import ( +# USER_REG_CRED_INPUT, +# USER_REG_KEY_PATH +# ) def main(): """ Ansible module to validate user registry entries. - - This module loads a YAML configuration file, validates the user registry entries, - checks their reachability, and verifies the cert paths. - - :return: A dictionary with the results of the validation and reachability checks. """ module = AnsibleModule( + # argument_spec=dict( + # timeout=dict(type='int', default=5), + # config_file=dict(type='str', required=True), + # user_reg_cred_input=dict(type='str', required=False, default=USER_REG_CRED_INPUT), + # user_reg_key_path=dict(type='str', required=False, default=USER_REG_KEY_PATH) + # ), argument_spec=dict( timeout=dict(type='int', default=5), - config_file=dict(type='str', required=True), - user_reg_cred_input=dict(type='str', required=False, default=USER_REG_CRED_INPUT), - user_reg_key_path=dict(type='str', required=False, default=USER_REG_KEY_PATH) + config_file=dict(type='str', required=True) ), supports_check_mode=True ) + # config_path = module.params['config_file'] + # timeout = module.params['timeout'] + # user_reg_cred_input = module.params["user_reg_cred_input"] + # user_reg_key_path = module.params["user_reg_key_path"] + config_path = module.params['config_file'] timeout = module.params['timeout'] - user_reg_cred_input = module.params["user_reg_cred_input"] - user_reg_key_path = module.params["user_reg_key_path"] - try: config_data = load_yaml_file(config_path) except FileNotFoundError as e: module.fail_json(msg=str(e)) user_registry = get_repo_list(config_data, "user_registry") - - if user_registry: - # Load credentials - if is_encrypted(user_reg_cred_input): - process_file(user_reg_cred_input, user_reg_key_path, 'decrypt') - - file2_data = 
load_yaml_file(user_reg_cred_input) - cred_lookup = { - entry['name']: entry - for entry in file2_data.get('user_registry_credential', []) - } - - # Update user_registry entries with credentials if required - for registry in user_registry: - if registry.get("requires_auth"): - creds = cred_lookup.get(registry.get("name")) - if creds: - registry["username"] = creds.get("username") - registry["password"] = creds.get("password") + # if user_registry: + # # Load credentials + # if is_encrypted(user_reg_cred_input): + # process_file(user_reg_cred_input, user_reg_key_path, 'decrypt') + + # file2_data = load_yaml_file(user_reg_cred_input) + # cred_lookup = { + # entry['name']: entry + # for entry in file2_data.get('user_registry_credential', []) + # } + + # # Update user_registry entries with credentials if required + # for registry in user_registry: + # if registry.get("requires_auth"): + # creds = cred_lookup.get(registry.get("name")) + # if creds: + # registry["username"] = creds.get("username") + # registry["password"] = creds.get("password") # Exit early if user_registry is empty if not user_registry: diff --git a/common/library/modules/delete_idracips_from_mysqldb.py b/common/library/modules/delete_idracips_from_mysqldb.py new file mode 100644 index 0000000000..cd81b943e2 --- /dev/null +++ b/common/library/modules/delete_idracips_from_mysqldb.py @@ -0,0 +1,251 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +"""Module to delete iDRAC IPs from MySQL database. +This module connects to a Kubernetes pod running MySQL and deletes iDRAC IPs +that are not present in bmc_data.csv. It handles retries and delays for robustness.""" + +import time +from ansible.module_utils.basic import AnsibleModule +from kubernetes import client, config +from kubernetes.stream import stream +from kubernetes.config.config_exception import ConfigException + + +def load_kube_context(): + """Load Kubernetes configuration for accessing the cluster.""" + try: + config.load_kube_config() + except ConfigException: + config.load_incluster_config() + + +def run_mysql_query_in_pod(namespace, pod, container, mysql_user, mysql_password, query): + """Run a MySQL query in the specified pod. + + Args: + namespace: Kubernetes namespace + pod: Pod name + container: Container name + mysql_user: MySQL username + mysql_password: MySQL password + query: MySQL query to execute + + Returns: + dict: Result containing return code and output + """ + core_v1 = client.CoreV1Api() + mysql_command = [ + "mysql", + "-u", mysql_user, + "-N", "-B", + f"-p{mysql_password}", + "-e", query + ] + + try: + ws = stream( + core_v1.connect_get_namespaced_pod_exec, + name=pod, + namespace=namespace, + container=container, + command=mysql_command, + stderr=True, + stdin=False, + stdout=True, + tty=False, + _preload_content=False + ) + + stdout = "" + stderr = "" + + while ws.is_open(): + ws.update(timeout=1) + if ws.peek_stdout(): + stdout += ws.read_stdout() + if ws.peek_stderr(): + stderr += ws.read_stderr() + ws.close() + + rc = ws.returncode + + if rc != 0: + return { + "rc": rc, + "result": stderr.strip() if stderr else "Unknown error" + } + + query_result = [ + line.strip() for line in stdout.strip().splitlines() + if line.strip() and not line.strip().startswith("mysql:") + ] + + return { + "rc": rc, + 
"result": query_result + } + + except (ConfigException, OSError) as e: + return { + "rc": 1, + "result": str(e) + } + + +def delete_idrac_from_mysql( + namespace, + pod, + container, + mysqldb_name, + mysql_user, + mysql_password, + ip_to_delete, + retries=3, + delay=3 +): + """Delete a single iDRAC IP from MySQL database. + + Args: + namespace: Kubernetes namespace + pod: Pod name + container: Container name + mysqldb_name: MySQL database name + mysql_user: MySQL username + mysql_password: MySQL password + ip_to_delete: IP address to delete + retries: Number of retry attempts + delay: Delay between retries in seconds + + Returns: + dict: Result containing success status and message + """ + query = ( + f"DELETE FROM {mysqldb_name}.services " + f"WHERE ip = '{ip_to_delete}';" + ) + + for attempt in range(retries): + result = run_mysql_query_in_pod( + namespace=namespace, + pod=pod, + container=container, + mysql_user=mysql_user, + mysql_password=mysql_password, + query=query + ) + + if result.get("rc") == 0: + return { + "success": True, + "ip": ip_to_delete, + "msg": f"Successfully deleted iDRAC IP {ip_to_delete} from MySQL." 
+ } + + if attempt < retries - 1: + time.sleep(delay) + + return { + "success": False, + "ip": ip_to_delete, + "msg": f"Failed to delete iDRAC IP {ip_to_delete} after {retries} attempts: {result.get('result')}" + } + + +def main(): + """Main function to execute the module logic.""" + module_args = { + "telemetry_namespace": {"type": "str", "required": True}, + "idrac_podnames": {"type": "list", "required": True}, + "mysqldb_k8s_name": {"type": "str", "required": True}, + "mysqldb_name": {"type": "str", "required": True}, + "mysqldb_user": {"type": "str", "required": True, "no_log": True}, + "mysqldb_password": {"type": "str", "required": True, "no_log": True}, + "ips_to_delete": {"type": "list", "required": True}, + "pod_to_db_idrac_ips": {"type": "dict", "required": True}, + "db_retries": {"type": "int", "default": 3}, + "db_delay": {"type": "int", "default": 3}, + } + + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) + + telemetry_namespace = module.params["telemetry_namespace"] + idrac_podnames = module.params["idrac_podnames"] + mysqldb_k8s_name = module.params["mysqldb_k8s_name"] + mysqldb_name = module.params["mysqldb_name"] + mysqldb_user = module.params["mysqldb_user"] + mysqldb_password = module.params["mysqldb_password"] + ips_to_delete = module.params["ips_to_delete"] + pod_to_db_idrac_ips = module.params["pod_to_db_idrac_ips"] + db_retries = module.params["db_retries"] + db_delay = module.params["db_delay"] + + load_kube_context() + + deleted_ips = [] + failed_ips = [] + changed = False + + try: + for pod in idrac_podnames: + pod_ips = pod_to_db_idrac_ips.get(pod, []) + ips_to_delete_from_pod = list(set(pod_ips) & set(ips_to_delete)) + + if not ips_to_delete_from_pod: + module.warn(f"No IPs to delete from pod {pod}. 
Skipping.") + continue + + module.warn(f"Deleting IPs from pod {pod}: {ips_to_delete_from_pod}") + + for ip in ips_to_delete_from_pod: + result = delete_idrac_from_mysql( + namespace=telemetry_namespace, + pod=pod, + container=mysqldb_k8s_name, + mysqldb_name=mysqldb_name, + mysql_user=mysqldb_user, + mysql_password=mysqldb_password, + ip_to_delete=ip, + retries=db_retries, + delay=db_delay + ) + + if result.get("success"): + deleted_ips.append(ip) + changed = True + else: + failed_ips.append({ + "pod": pod, + "ip": ip, + "msg": result.get("msg", "Unknown error") + }) + + module.exit_json( + changed=changed, + deleted_ips=deleted_ips, + failed_ips=failed_ips, + msg=f"Deleted {len(deleted_ips)} iDRAC IPs from MySQL database." + ) + + except (OSError, ValueError) as e: + module.fail_json( + msg=f"An error occurred while deleting iDRAC IPs from MySQL: {str(e)}", + deleted_ips=deleted_ips, + failed_ips=failed_ips + ) + + +if __name__ == "__main__": + main() diff --git a/common/library/modules/disable_idrac_telemetry.py b/common/library/modules/disable_idrac_telemetry.py new file mode 100644 index 0000000000..cb7b885e1e --- /dev/null +++ b/common/library/modules/disable_idrac_telemetry.py @@ -0,0 +1,184 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +"""Module to disable telemetry on iDRAC nodes via Redfish API. 
+This module connects to iDRAC nodes and disables telemetry collection +by sending PATCH requests to the Redfish API endpoint.""" + +import requests +import urllib3 +from ansible.module_utils.basic import AnsibleModule + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + +def disable_telemetry_on_idrac(idrac_ip, username, password, timeout=30): + """ + Disable telemetry on a single iDRAC node using Redfish API. + + Args: + idrac_ip: IP address of the iDRAC + username: iDRAC username + password: iDRAC password + timeout: Request timeout in seconds + + Returns: + dict: Result containing success status and message + """ + url = ( + f"https://{idrac_ip}/redfish/v1/Managers/" + f"iDRAC.Embedded.1/Attributes" + ) + + # Try different telemetry property names in order of preference + telemetry_properties = [ + "Telemetry.1.EnableTelemetry", + "TelemetryService.1.EnableTelemetry", + "Telemetry.2.EnableTelemetry", + "Redfish.1.TelemetryServiceEnabled" + ] + + headers = { + "Content-Type": "application/json" + } + + for property_name in telemetry_properties: + payload = { + "Attributes": { + property_name: "Disabled" + } + } + + try: + response = requests.patch( + url, + json=payload, + headers=headers, + auth=(username, password), + verify=False, + timeout=timeout + ) + + if response.status_code in [200, 202, 204]: + return { + "success": True, + "ip": idrac_ip, + "status_code": response.status_code, + "msg": f"Successfully disabled telemetry on iDRAC {idrac_ip} using {property_name}" + } + elif response.status_code == 400: + # Property not supported, try next one + continue + else: + return { + "success": False, + "ip": idrac_ip, + "status_code": response.status_code, + "msg": ( + f"Failed to disable telemetry on iDRAC {idrac_ip}. 
" + f"Status: {response.status_code}, Response: {response.text}" + ) + } + + except requests.exceptions.Timeout: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Timeout while connecting to iDRAC {idrac_ip}" + } + + except requests.exceptions.ConnectionError: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Connection error while connecting to iDRAC {idrac_ip}" + } + + except (requests.exceptions.RequestException, OSError) as e: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Error disabling telemetry on iDRAC {idrac_ip}: {str(e)}" + } + + # All properties failed + return { + "success": False, + "ip": idrac_ip, + "msg": ( + f"Failed to disable telemetry on iDRAC {idrac_ip}. " + f"None of the supported telemetry properties were found: {', '.join(telemetry_properties)}" + ) + } + + +def main(): + """Main function to execute the module logic.""" + module_args = { + "idrac_ips": {"type": "list", "required": True, "elements": "str"}, + "username": {"type": "str", "required": True, "no_log": True}, + "password": {"type": "str", "required": True, "no_log": True}, + "timeout": {"type": "int", "default": 30}, + } + + module = AnsibleModule( + argument_spec=module_args, + supports_check_mode=True + ) + + idrac_ips = module.params["idrac_ips"] + username = module.params["username"] + password = module.params["password"] + timeout = module.params["timeout"] + + disabled_ips = [] + failed_ips = [] + changed = False + + try: + for idrac_ip in idrac_ips: + result = disable_telemetry_on_idrac( + idrac_ip=idrac_ip, + username=username, + password=password, + timeout=timeout + ) + + if result.get("success"): + disabled_ips.append(idrac_ip) + changed = True + else: + failed_ips.append({ + "ip": idrac_ip, + "msg": result.get("msg", "Unknown error") + }) + + module.exit_json( + changed=changed, + disabled_ips=disabled_ips, + failed_ips=failed_ips, + msg=f"Disabled telemetry on {len(disabled_ips)} iDRAC nodes." 
+ ) + + except (requests.exceptions.RequestException, OSError) as e: + module.fail_json( + msg=f"An error occurred while disabling telemetry: {str(e)}", + disabled_ips=disabled_ips, + failed_ips=failed_ips + ) + + +if __name__ == "__main__": + main() diff --git a/common/library/modules/group_package_map.py b/common/library/modules/group_package_map.py index e5d29289e1..6076970f6d 100644 --- a/common/library/modules/group_package_map.py +++ b/common/library/modules/group_package_map.py @@ -145,6 +145,10 @@ def get_type_dict(clust_list): # Add package to rpm key type_dict[pkgtype] = type_dict.get( pkgtype, []) + [pkg_dict.get('package')] + # Also track repo_name mapping for RPMs + if 'repo_mapping' not in type_dict: + type_dict['repo_mapping'] = {} + type_dict['repo_mapping'][pkg_dict.get('package')] = pkg_dict.get('repo_name', '') # Update reboot required values reboot_val = pkg_dict.get(REBOOT_KEY, False) diff --git a/common/library/modules/parallel_file_copy.py b/common/library/modules/parallel_file_copy.py new file mode 100644 index 0000000000..a697764683 --- /dev/null +++ b/common/library/modules/parallel_file_copy.py @@ -0,0 +1,175 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +# pylint: disable=import-error,no-name-in-module,line-too-long + +""" +Ansible module for parallel copying of files. 
# ============================================================
# Default Values
# ============================================================

DEFAULT_MAX_WORKERS = 4
DEFAULT_RETRY_COUNT = 2
DEFAULT_DELETE_EXISTING = True
# NOTE(review): this default ends with '/', which reads like a directory even
# though the name ends in ".log" - confirm what setup_standard_logger expects.
PARALLEL_FILE_COPY_LOG = '/opt/omnia/log/core/playbooks/parallel_file_copy.log/'

# ============================================================
# Copy Worker Function
# ============================================================

def copy_single_file(src_file, dest_dir, retry_count, delete_existing, slogger, summary):
    """
    Copy one file into ``dest_dir`` with retry support.

    Args:
        src_file: Path of the file to copy.
        dest_dir: Destination directory; created if it does not exist.
        retry_count: Maximum number of copy attempts. Values below 1 are
            treated as 1 so every file ends up in exactly one summary list.
        delete_existing: When true, an existing destination file is removed
            before copying.
        slogger: Logger used for progress and error messages.
        summary: Dict with "copied", "skipped" and "failed" lists; the source
            path is appended to exactly one of them.
    """
    thread_name = threading.current_thread().name
    start_time = datetime.now()

    if not os.path.isfile(src_file):
        slogger.info(f"NOT COPIED - Source file missing: {src_file}")
        summary["skipped"].append(src_file)
        return

    os.makedirs(dest_dir, exist_ok=True)
    dest_file = os.path.join(dest_dir, os.path.basename(src_file))

    # Always attempt at least once: with retry_count <= 0 the original loop
    # never ran and the file was silently missing from the summary.
    max_attempts = max(1, retry_count)

    for attempt in range(1, max_attempts + 1):
        try:
            slogger.info(f"[{thread_name}] START {start_time} Copying {src_file} (Attempt {attempt})")

            if delete_existing and os.path.exists(dest_file):
                os.remove(dest_file)
                slogger.info(f"Deleted existing file: {dest_file}")

            shutil.copy2(src_file, dest_file)

            end_time = datetime.now()
            # Duration is measured from function entry, so it includes any
            # earlier failed attempts.
            duration = (end_time - start_time).total_seconds()
            slogger.info(f"[{thread_name}] SUCCESS {end_time} Copied {src_file} -> {dest_file} (Duration={duration:.2f}s)")

            summary["copied"].append(src_file)
            return

        except Exception as err:
            slogger.error(f"[{thread_name}] ERROR copying {src_file} (Attempt {attempt}) Reason: {err}")
            if attempt == max_attempts:
                summary["failed"].append(src_file)

# ============================================================
# Main Parallel Copy Logic
# ============================================================

def execute_parallel_copy(module, copy_pairs, max_workers, retry_count, delete_existing, slogger):
    """
    Executes parallel copy for all pairs.

    Each entry of ``copy_pairs`` is unpacked as (source directory,
    destination directory). Every regular file directly inside the source
    directory is submitted to a thread pool as one ``copy_single_file`` task;
    sub-directories are not descended into.

    Returns summary dict with "copied", "skipped" and "failed" lists.
    """
    summary = {"copied": [], "skipped": [], "failed": []}
    futures = []

    slogger.info("===== PARALLEL FILE COPY STARTED =====")
    slogger.info(f"Copy pairs received: {copy_pairs}")
    slogger.info(f"Max workers: {max_workers}")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for src_dir, dest_dir in copy_pairs:

            if not os.path.isdir(src_dir):
                slogger.info(f"NOT COPIED - Source directory missing: {src_dir}")
                summary["skipped"].append(src_dir)
                continue

            files = [
                os.path.join(src_dir, entry)
                for entry in os.listdir(src_dir)
                if os.path.isfile(os.path.join(src_dir, entry))
            ]
            if not files:
                slogger.info(f"NOT COPIED - No files found in directory: {src_dir}")
                summary["skipped"].append(src_dir)
                continue

            # ⚡ Show Ansible warning for in-progress copy
            module.warn(f"Copy in progress for {src_dir} -> {dest_dir}. Please wait ...")

            slogger.info(f"Copying {len(files)} files from {src_dir} -> {dest_dir} ...")

            for file_path in files:
                futures.append(
                    executor.submit(
                        copy_single_file, file_path, dest_dir,
                        retry_count, delete_existing, slogger, summary
                    )
                )

        # Wait for all copies to finish; result() re-raises anything a worker
        # did not handle itself (e.g. a failure creating the destination dir).
        for future in as_completed(futures):
            future.result()

    slogger.info("===== PARALLEL FILE COPY FINISHED =====")
    return summary

# ============================================================
# Ansible Module Entry Point
# ============================================================

def main():
    """
    Main Ansible module execution entrypoint.

    Runs ``execute_parallel_copy`` with the module parameters and exits with
    the copied/skipped/failed lists plus an ``overall_status`` of "SUCCESS",
    "PARTIAL" or "FAILURE".
    """
    module_args = dict(
        copy_pairs=dict(type="list", required=True),
        max_workers=dict(type="int", required=False, default=DEFAULT_MAX_WORKERS),
        retry_count=dict(type="int", required=False, default=DEFAULT_RETRY_COUNT),
        delete_existing=dict(type="bool", required=False, default=DEFAULT_DELETE_EXISTING),
        slog_file=dict(type="str", required=False, default=PARALLEL_FILE_COPY_LOG),
    )

    # NOTE(review): check mode is advertised, but the copy below runs
    # regardless of module.check_mode - confirm whether that is intended.
    module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)

    copy_pairs = module.params["copy_pairs"]
    max_workers = module.params["max_workers"]
    retry_count = module.params["retry_count"]
    delete_existing = module.params["delete_existing"]
    slog_file = module.params["slog_file"]

    slogger = setup_standard_logger(slog_file)

    result = dict(changed=False, copied=[], skipped=[], failed=[])

    try:
        summary = execute_parallel_copy(module, copy_pairs, max_workers, retry_count, delete_existing, slogger)

        result["copied"] = summary["copied"]
        result["skipped"] = summary["skipped"]
        result["failed"] = summary["failed"]
        if summary["copied"]:
            result["changed"] = True

        overall_status = "SUCCESS"
        if summary["failed"] and summary["copied"]:
            overall_status = "PARTIAL"
        elif summary["failed"] and not summary["copied"]:
            overall_status = "FAILURE"

        result["overall_status"] = overall_status

        # Even a "FAILURE" status exits through exit_json; only an exception
        # escaping execute_parallel_copy fails the task.
        module.exit_json(**result)

    except Exception as err:
        slogger.error(f"Parallel copy execution failed: {err}")
        module.fail_json(msg=str(err), **result)
module.exit_json(**result) + + except Exception as err: + slogger.error(f"Parallel copy execution failed: {err}") + module.fail_json(msg=str(err), **result) + +if __name__ == "__main__": + main() diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py index 11b7aa1867..17c14cf51f 100644 --- a/common/library/modules/parallel_tasks.py +++ b/common/library/modules/parallel_tasks.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,12 +28,15 @@ process_shell, process_ansible_galaxy_collection, process_iso, - process_pip + process_pip, + process_rpm_file ) from ansible.module_utils.local_repo.download_image import process_image from ansible.module_utils.local_repo.download_rpm import process_rpm from ansible.module_utils.local_repo.standard_logger import setup_standard_logger -from ansible.module_utils.local_repo.common_functions import generate_vault_key, process_file, is_encrypted +from ansible.module_utils.local_repo.common_functions import ( + generate_vault_key, process_file, is_encrypted +) from ansible.module_utils.local_repo.software_utils import ( load_json, set_version_variables, @@ -53,8 +56,6 @@ SOFTWARE_CSV_HEADER, STATUS_CSV_HEADER, LOCAL_REPO_CONFIG_PATH_DEFAULT, - USER_REG_CRED_INPUT, - USER_REG_KEY_PATH, OMNIA_CREDENTIALS_YAML_PATH, OMNIA_CREDENTIALS_VAULT_PATH ) @@ -126,7 +127,10 @@ def update_status_csv(csv_dir, software, overall_status,slogger): slogger.info(f"Successfully updated status CSV at {status_file}") -def determine_function(task, repo_store_path, csv_file_path, user_data, version_variables, arc, user_registries, docker_username, docker_password): +def determine_function( + task, repo_store_path, csv_file_path, user_data, version_variables, arc, + 
user_registries, docker_username, docker_password +): """ Determines the appropriate function and its arguments to process a given task. @@ -161,25 +165,55 @@ def determine_function(task, repo_store_path, csv_file_path, user_data, version_ task_type = task.get("type") if task_type == "manifest": - return process_manifest, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_manifest, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "git": - return process_git, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_git, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "tarball": - return process_tarball, [task, repo_store_path, status_file, version_variables, cluster_os_type, cluster_os_version, arc] + return process_tarball, [ + task, repo_store_path, status_file, version_variables, + cluster_os_type, cluster_os_version, arc + ] if task_type == "shell": - return process_shell, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_shell, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "ansible_galaxy_collection": - return process_ansible_galaxy_collection, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_ansible_galaxy_collection, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "iso": - return process_iso, [task, repo_store_path, status_file, - cluster_os_type, cluster_os_version, version_variables, arc] + return process_iso, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, version_variables, arc + ] if task_type == "pip_module": - return process_pip, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_pip, [ 
+ task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "image": - return process_image, [task, status_file, version_variables, user_registries, docker_username, docker_password] - if task_type == "rpm": - return process_rpm, [task, repo_store_path, status_file, - cluster_os_type, cluster_os_version, repo_config_value, arc] + return process_image, [ + task, status_file, version_variables, user_registries, + docker_username, docker_password + ] + if task_type == "rpm_file": + return process_rpm_file, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] + if task_type in ("rpm", "rpm_repo"): + return process_rpm, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, repo_config_value, arc + ] raise ValueError(f"Unknown task type: {task_type}") except Exception as e: @@ -253,13 +287,13 @@ def generate_software_status_table(status_dict,slogger): table.field_names = ["Name", "Status"] for name, status in items: table.add_row([name, status.lower()]) - + tables.append(table.get_string()) slogger.info(f"Completed table for {arch}") - + slogger.info("Software status table generation completed successfully") return "\n\n".join(tables) - + except Exception as e: slogger.error(f"Error occurred while generating software status table: {e}") return f"Error: {e}" @@ -271,37 +305,44 @@ def main(): Args: tasks (list): A list of tasks (dictionaries) that need to be processed in parallel. nthreads (int): The number of worker processes to run in parallel. - timeout (int): The maximum time allowed for all tasks to execute. If `None`, no timeout is enforced. + timeout (int): The maximum time allowed for all tasks to execute. + If `None`, no timeout is enforced. log_dir (str): The directory where log files for the worker processes will be saved. log_file (str): The path to the log file for the overall task execution. slog_file (str): The path to the log file for the standard logger. 
csv_file_path (str): The path to a CSV file that may be needed for processing some tasks. repo_store_path (str): The path to the repository where task-related files are stored. software (list): A list of software names. - user_json_file (str): The path to the JSON file containing use - show_softwares_status (bool): Whether to display the software status; optional, defaults to False. - overall_status_dict (dict): A list containing overall software status information; optional, defaults to an empty dict. - Dictionary containing software status information grouped by software names. - Each key (e.g., 'service_k8s') maps to a list of dictionaries, - where each dictionary contains: - - 'arch' (str): Architecture name, e.g., 'x86_64' or 'aarch64'. - - 'overall_status' (str): Status of the software on that architecture, e.g., 'SUCCESS'. - Example: - { - "service_k8s": [ - {"arch": "x86_64", "overall_status": "SUCCESS"}, - {"arch": "aarch64", "overall_status": "SUCCESS"} - ] - } - Defaults to an empty dict if not provided. + user_json_file (str): The path to the JSON file containing user data. + show_softwares_status (bool): Whether to display the software status; + optional, defaults to False. + overall_status_dict (dict): A dictionary containing overall software status + information; optional, defaults to an empty dict. + Dictionary containing software status information grouped by software names. + Each key (e.g., 'service_k8s') maps to a list of dictionaries, + where each dictionary contains: + - 'arch' (str): Architecture name, e.g., 'x86_64' or 'aarch64'. + - 'overall_status' (str): Status of the software on that architecture, + e.g., 'SUCCESS'. + Example: + { + "service_k8s": [ + {"arch": "x86_64", "overall_status": "SUCCESS"}, + {"arch": "aarch64", "overall_status": "SUCCESS"} + ] + } + Defaults to an empty dict if not provided. 
Returns: tuple: A tuple containing: - - overall_status (str): The overall status of task execution ("SUCCESS", "FAILED", "PARTIAL", "TIMEOUT"). - - task_results_data (list): A list of dictionaries, each containing the result of an individual task. + - overall_status (str): The overall status of task execution + ("SUCCESS", "FAILED", "PARTIAL", "TIMEOUT"). + - task_results_data (list): A list of dictionaries, each containing + the result of an individual task. Raises: Exception: If an error occurs during execution. """ + module_args = { "tasks": {"type": "list", "required": True}, "nthreads": {"type": "int", "required": False, "default": DEFAULT_NTHREADS}, @@ -315,12 +356,19 @@ def main(): "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT}, "show_softwares_status": {"type": "bool", "required": False, "default": False}, "overall_status_dict": {"type": "dict","required": True}, - "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT}, + "local_repo_config_path": { + "type": "str", "required": False, + "default": LOCAL_REPO_CONFIG_PATH_DEFAULT + }, "arch": {"type": "str", "required": False}, - "user_reg_cred_input": {"type": "str", "required": False, "default": USER_REG_CRED_INPUT}, - "user_reg_key_path": {"type": "str", "required": False, "default": USER_REG_KEY_PATH}, - "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH}, - "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH} + "omnia_credentials_yaml_path": { + "type": "str", "required": False, + "default": OMNIA_CREDENTIALS_YAML_PATH + }, + "omnia_credentials_vault_path": { + "type": "str", "required": False, + "default": OMNIA_CREDENTIALS_VAULT_PATH + } } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) tasks = module.params["tasks"] @@ -337,8 +385,8 @@ def main(): overall_status_dict = 
module.params["overall_status_dict"] local_repo_config_path = module.params["local_repo_config_path"] arc = module.params["arch"] - user_reg_cred_input = module.params["user_reg_cred_input"] - user_reg_key_path = module.params["user_reg_key_path"] + # user_reg_cred_input = module.params["user_reg_cred_input"] + # user_reg_key_path = module.params["user_reg_key_path"] omnia_credentials_yaml_path = module.params["omnia_credentials_yaml_path"] omnia_credentials_vault_path = module.params["omnia_credentials_vault_path"] @@ -366,24 +414,29 @@ def main(): cluster_os_type = user_data['cluster_os_type'] cluster_os_version = user_data['cluster_os_version'] - subgroup_dict, software_names = get_subgroup_dict(user_data,slogger) - version_variables = set_version_variables(user_data, software_names, cluster_os_version,slogger) + subgroup_dict, software_names = get_subgroup_dict(user_data, slogger) + version_variables = set_version_variables( + user_data, software_names, cluster_os_version, slogger + ) slogger.info(f"Cluster OS: {cluster_os_type}") slogger.info(f"Version Variables: {version_variables}") - gen_result = {} - if not os.path.isfile(user_reg_key_path): - gen_result = generate_vault_key(user_reg_key_path) - if gen_result is None: - module.fail_json(msg=f"Unable to generate local_repo key at path: {user_reg_key_path}") + # gen_result = {} + # if not os.path.isfile(user_reg_key_path): + # gen_result = generate_vault_key(user_reg_key_path) + # if gen_result is None: + # module.fail_json( + # msg=f"Unable to generate local_repo key at path: {user_reg_key_path}" + # ) overall_status, task_results = execute_parallel( tasks, determine_function, nthreads, repo_store_path, csv_file_path, - log_dir, user_data, version_variables, arc, slogger, local_repo_config_path, user_reg_cred_input, user_reg_key_path, - omnia_credentials_yaml_path, omnia_credentials_vault_path, timeout + log_dir, user_data, version_variables, arc, slogger, + local_repo_config_path, 
omnia_credentials_yaml_path, + omnia_credentials_vault_path, timeout ) - if not is_encrypted(user_reg_cred_input): - process_file(user_reg_cred_input,user_reg_key_path,'encrypt') + # if not is_encrypted(user_reg_cred_input): + # process_file(user_reg_cred_input, user_reg_key_path, 'encrypt') end_time = datetime.now() formatted_end_time = end_time.strftime("%I:%M:%S %p") @@ -422,7 +475,9 @@ def main(): except Exception as e: - result["table_output"] = table_output if "table_output" in locals() else "No table generated." + result["table_output"] = ( + table_output if "table_output" in locals() else "No table generated." + ) slogger.error(f"Execution failed: {str(e)}") module.fail_json(msg=f"Error during execution: {str(e)}", **result) diff --git a/common/library/modules/process_rpm_config.py b/common/library/modules/process_rpm_config.py index 002923d50c..550d0c078f 100644 --- a/common/library/modules/process_rpm_config.py +++ b/common/library/modules/process_rpm_config.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -467,6 +467,27 @@ def check_publication_exists(repo_name, log): log.error("Error checking publication for '%s': %s", repo_name, str(e)) return False +def check_distribution_exists(repo_name, log): + """ + Check if a distribution exists for the repository. + + Args: + repo_name (str): The name of the repository. + log (logging.Logger): Logger instance for logging. + + Returns: + bool: True if distribution exists, False otherwise. 
+ """ + try: + command = pulp_rpm_commands["check_distribution"] % repo_name + log.info("Checking if distribution exists for repository '%s'", repo_name) + result = execute_command(command, log) + return bool(result) + except Exception as e: + log.error("Error checking distribution for '%s': %s", repo_name, str(e)) + return False + + def delete_old_publications(repo_name, log): """ Delete all existing publications for a repository. @@ -792,9 +813,43 @@ def process_sync_results(sync_results, rpm_config, resync_repos, log): version_changed_repos = [name for success, name, actually_synced, version_changed in sync_results if success and actually_synced and version_changed] log.info(f"Repos with version change: {len(version_changed_repos)} - {version_changed_repos}") - # If no versions changed, skip publication and distribution entirely + # If no versions changed, check for missing publication/distribution + # This handles the crash recovery case: process failed after sync but before pub/dist if not version_changed_repos: - log.info("No version changes detected. Skipping publication and distribution.") + log.info("No version changes detected. 
Checking for missing publication/distribution.") + + # Check all synced repos (including previously synced) for missing pub/dist + repos_missing_pub_dist = [] + all_repo_names = [] + for repo in rpm_config: + repo_name = repo["package"] + version = repo.get("version") + if version and version != "null": + repo_name = f"{repo_name}_{version}" + all_repo_names.append(repo_name) + + # If resync_repos is a specific list, only check those repos + if resync_repos and resync_repos != "all": + resync_list = resync_repos if isinstance(resync_repos, list) else [r.strip() for r in resync_repos.split(",")] + if repo_name not in resync_list: + continue + + pub_exists = check_publication_exists(repo_name, log) + dist_exists = check_distribution_exists(repo_name, log) + + if not pub_exists or not dist_exists: + log.info(f"{repo_name} missing publication={not pub_exists}, distribution={not dist_exists}. Including for pub/dist creation.") + repo_copy = repo.copy() + repo_copy["_version_changed"] = False + repos_missing_pub_dist.append(repo_copy) + + if repos_missing_pub_dist: + missing_names = [r["package"] for r in repos_missing_pub_dist] + log.info(f"Found {len(repos_missing_pub_dist)} repo(s) missing publication/distribution: {missing_names}") + return repos_missing_pub_dist, False, "" + + # All repos have publication and distribution - safe to skip + log.info("All repos have existing publication and distribution. Skipping.") if actually_synced_repos: # Repos were synced but no metadata change synced_list = ", ".join(actually_synced_repos) @@ -820,9 +875,37 @@ def process_sync_results(sync_results, rpm_config, resync_repos, log): repos_for_pub_dist.append(repo_copy) return repos_for_pub_dist, False, "" else: - # If no repos were actually synced, skip publication and distribution + # If no repos were actually synced, check for missing pub/dist (crash recovery) if not actually_synced_repos: - log.info("No repos were actually synced. 
Skipping publication and distribution.") + log.info("No repos were actually synced. Checking for missing publication/distribution.") + repos_missing_pub_dist = [] + for repo in rpm_config: + repo_name = repo["package"] + version = repo.get("version") + if version and version != "null": + repo_name = f"{repo_name}_{version}" + + # If resync_repos is a specific list, only check those repos + if resync_repos and resync_repos != "all": + resync_list = resync_repos if isinstance(resync_repos, list) else [r.strip() for r in resync_repos.split(",")] + if repo_name not in resync_list: + continue + + pub_exists = check_publication_exists(repo_name, log) + dist_exists = check_distribution_exists(repo_name, log) + + if not pub_exists or not dist_exists: + log.info(f"{repo_name} missing publication={not pub_exists}, distribution={not dist_exists}. Including for pub/dist creation.") + repo_copy = repo.copy() + repo_copy["_version_changed"] = False + repos_missing_pub_dist.append(repo_copy) + + if repos_missing_pub_dist: + missing_names = [r["package"] for r in repos_missing_pub_dist] + log.info(f"Found {len(repos_missing_pub_dist)} repo(s) missing publication/distribution: {missing_names}") + return repos_missing_pub_dist, False, "" + + log.info("All repos have existing publication and distribution. No updates required.") return [], True, "All repositories already synced - no updates required" # Filter rpm_config to only include repos with version change diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py new file mode 100644 index 0000000000..a3c155ebdb --- /dev/null +++ b/common/library/modules/pulp_cleanup.py @@ -0,0 +1,1082 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# =============================================================================
# PRETTY TABLE FORMATTING
# =============================================================================

# ANSI color codes
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
RESET = '\033[0m'


def format_pretty_table(results: List[Dict[str, Any]], max_message_width: int = 40) -> str:
    """Format cleanup results into a pretty table.

    Args:
        results: One dict per cleaned-up item; the keys 'name', 'type',
            'status' and 'message' are read, missing keys render as empty.
        max_message_width: Maximum number of characters of 'message' shown
            per row (longer messages are cut). Defaults to 40, the width
            that used to be hard-coded.

    Returns:
        The table as a newline-joined string, or a fixed placeholder text
        when ``results`` is empty.
    """
    if not results:
        return "No cleanup results to display"

    headers = ["Name", "Type", "Status", "Message"]

    # Calculate column widths: never narrower than the header itself
    widths = [len(h) for h in headers]
    for r in results:
        widths[0] = max(widths[0], len(str(r.get('name', ''))))
        widths[1] = max(widths[1], len(str(r.get('type', ''))))
        widths[2] = max(widths[2], len(str(r.get('status', ''))))
        widths[3] = max(widths[3], min(len(str(r.get('message', ''))), max_message_width))

    # Build table
    border = "+" + "+".join("-" * (w + 2) for w in widths) + "+"
    header_row = "|" + "|".join(f" {h.ljust(w)} " for h, w in zip(headers, widths)) + "|"

    lines = [border, header_row, border]

    for r in results:
        msg = str(r.get('message', ''))[:max_message_width]
        cells = [
            str(r.get('name', '')),
            str(r.get('type', '')),
            str(r.get('status', '')),
            msg,
        ]
        row = "|" + "|".join(f" {cell.ljust(w)} " for cell, w in zip(cells, widths)) + "|"
        lines.append(row)

    lines.append(border)
    return "\n".join(lines)


# =============================================================================
# COMMAND EXECUTION
# =============================================================================

def run_cmd(cmd: str, logger) -> Dict[str, Any]:
    """Execute shell command and return result.

    The command string is run through the shell with a 300 second timeout.

    Returns:
        Dict with 'rc', 'stdout' and 'stderr'. When the command cannot be
        run (including a timeout) the error is logged and rc is 1 with the
        error text in 'stderr'.
    """
    try:
        # NOTE(review): shell=True executes `cmd` verbatim; callers build it
        # with %-formatting, so names interpolated into it should be trusted
        # or quoted - verify against the command templates.
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300)
        return {"rc": result.returncode, "stdout": result.stdout, "stderr": result.stderr}
    except (subprocess.SubprocessError, OSError) as e:
        logger.error(f"Command failed: {cmd} - {e}")
        return {"rc": 1, "stdout": "", "stderr": str(e)}


def safe_json_parse(data: str, default: Any = None) -> Any:
    """Safely parse JSON string using JSONDecoder with validation.

    Uses json.JSONDecoder instead of json.loads to avoid Checkmarx vulnerabilities.

    Only the first JSON value in ``data`` is decoded (``raw_decode``), so any
    text after it is ignored.

    Returns:
        The parsed value; ``default`` (or an empty list when ``default`` is
        None) if ``data`` is empty, not a string, or not valid JSON.
    """
    fallback = default if default is not None else []

    if not data or not isinstance(data, str):
        return fallback

    try:
        decoder = json.JSONDecoder()
        parsed, _ = decoder.raw_decode(data.strip())
        return parsed
    except (ValueError, TypeError):
        return fallback
# =============================================================================
# CONTAINER IMAGE VALIDATION & CONVERSION
# =============================================================================

def validate_container_format(image_name: str) -> Tuple[bool, str]:
    """Validate container image format.

    User must provide format: registry/image (e.g., registry.k8s.io/pause)

    The registry (text before the first '/') is accepted when it contains a
    '.', contains a ':' or is exactly 'localhost'. Every path component
    after the registry must be non-empty.

    Returns:
        Tuple of (is_valid, error_message); error_message is "" when valid.
    """
    if not image_name:
        return False, "Container image name cannot be empty"

    # Must contain at least one '/' to indicate registry/image format
    if '/' not in image_name:
        return False, (
            f"Invalid format '{image_name}'. Must include registry "
            "(e.g., registry.k8s.io/pause, docker.io/library/busybox)"
        )

    registry, *image_path = image_name.split('/')

    # Check if registry looks valid (domain, host:port, or localhost)
    looks_like_registry = '.' in registry or ':' in registry or registry == 'localhost'
    if not looks_like_registry:
        return False, (
            f"Invalid registry '{registry}' in '{image_name}'. "
            "Registry must be a domain (e.g., docker.io, registry.k8s.io)"
        )

    # Reject inputs such as "docker.io/" or "docker.io//busybox": they name
    # no image, yet previously passed validation.
    if any(not component for component in image_path):
        return False, (
            f"Invalid format '{image_name}'. Image name after the registry "
            "cannot be empty (e.g., registry.k8s.io/pause)"
        )

    return True, ""


def convert_to_pulp_container_name(image_name: str) -> str:
    """Convert user-provided image name to Pulp repository name.

    Examples:
        registry.k8s.io/pause -> container_repo_registry.k8s.io_pause
        docker.io/library/busybox -> container_repo_docker.io_library_busybox
        ghcr.io/kube-vip/kube-vip -> container_repo_ghcr.io_kube-vip_kube-vip
    """
    # Replace '/' with '_' and prepend 'container_repo_'
    return "container_repo_" + image_name.replace('/', '_')
# =============================================================================
# TYPE DETECTION
# =============================================================================

def detect_file_type(name: str) -> str:
    """Detect artifact type from name.

    The checks are keyword heuristics applied in a fixed order; the first
    match wins:

        1. contains '=='                         -> "pip_module"
        2. dotted name without '/' mentioning
           ansible/community/galaxy, or a name
           starting with 'ansible_galaxy_collection'
                                                 -> "ansible_galaxy_collection"
        3. chart/tar/tgz/helm/bundle             -> "tarball"
        4. git/repo/source/scm                   -> "git"
        5. manifest/calico/yml/yaml              -> "manifest"
        6. anything else                         -> "file"
    """
    # Pip module: contains == (e.g., cffi==1.17.1)
    if '==' in name:
        return "pip_module"

    lowered = name.lower()

    def mentions(*keywords: str) -> bool:
        """True when any keyword occurs as a substring of the lowered name."""
        return any(keyword in lowered for keyword in keywords)

    # Ansible Galaxy collection: contains . but no /
    # (e.g., community.general, ansible.posix). A '==' cannot occur here,
    # the pip check above has already returned for such names.
    if '.' in name and '/' not in name and mentions('ansible', 'community', 'galaxy'):
        return "ansible_galaxy_collection"
    if name.startswith('ansible_galaxy_collection'):
        return "ansible_galaxy_collection"
    if mentions('chart', 'tar', 'tgz', 'helm', 'bundle'):
        return "tarball"
    if mentions('git', 'repo', 'source', 'scm'):
        return "git"
    if mentions('manifest', 'calico', 'yml', 'yaml'):
        return "manifest"
    return "file"


# =============================================================================
# EXISTENCE CHECKS
# =============================================================================

def repo_exists(name: str, logger) -> bool:
    """Check if RPM repository exists in Pulp.

    Runs the "show_repository" command for ``name``; a zero return code
    counts as "exists".
    """
    outcome = run_cmd(pulp_rpm_commands["show_repository"] % name, logger)
    return outcome["rc"] == 0


def container_exists(name: str, logger) -> bool:
    """Check if container repository exists in Pulp.

    Runs the "show_container_repo" command for ``name``; a zero return code
    counts as "exists".
    """
    outcome = run_cmd(pulp_container_commands["show_container_repo"] % name, logger)
    return outcome["rc"] == 0
0 + + +def file_exists_in_status(name: str, base_path: str, logger) -> bool: + """Check if file artifact exists in status files.""" + try: + for status_file in glob.glob(f"{base_path}/x86_64/*/status.csv"): + with open(status_file, 'r', encoding='utf-8') as f: + if name in f.read(): + return True + return False + except Exception: + return False + +def get_all_repositories(logger) -> List[str]: + """Get all RPM repository names from Pulp.""" + cmd = pulp_rpm_commands["list_repositories"] + result = run_cmd(cmd, logger) + if result["rc"] != 0: + logger.error(f"Failed to list repositories: {result['stderr']}") + return [] + repos = safe_json_parse(result["stdout"]) + return [r.get('name', '') for r in repos if r.get('name')] + + +# ============================================================================= +# CLEANUP FUNCTIONS +# ============================================================================= + +def cleanup_repository(name: str, base_path: str, logger) -> Dict[str, Any]: + """Cleanup a single RPM repository.""" + result = {"name": name, "type": "repository", "status": "Failed", "message": ""} + + # Check existence + if not repo_exists(name, logger): + result["message"] = "Repository not found" + return result + + try: + # Delete distributions + dist_list = run_cmd(pulp_rpm_commands["list_distributions"], logger) + if dist_list["rc"] == 0: + dists = safe_json_parse(dist_list["stdout"]) + for d in dists: + if d.get('name', '') == name or name in d.get('name', ''): + run_cmd(pulp_rpm_commands["delete_distribution"] % d.get('name', ''), logger) + + # Delete publications + pub_list = run_cmd(pulp_rpm_commands["list_publications"] % name, logger) + if pub_list["rc"] == 0: + pubs = safe_json_parse(pub_list["stdout"]) + for p in pubs: + run_cmd(pulp_rpm_commands["delete_publication"] % p.get('pulp_href', ''), logger) + + # Delete remote + run_cmd(pulp_rpm_commands["delete_remote"] % name, logger) + + # Delete repository + del_result = 
run_cmd(pulp_rpm_commands["delete_repository"] % name, logger) + + if del_result["rc"] == 0: + result["status"] = "Success" + result["message"] = "Repository deleted" + # Update status files - remove RPM entries from this repo and mark software as partial + affected = remove_rpms_from_repository(name, base_path, logger) + logger.info(f" mark affected softwares as partial {affected}") + mark_software_partial(affected, base_path, logger, 'repository') + else: + result["message"] = f"Delete failed: {del_result['stderr']}" + + except Exception as e: + result["message"] = f"Error: {str(e)}" + + return result + + +def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any]: + """Cleanup a single container repository. + + Args: + user_input: User-provided image name (e.g., registry.k8s.io/pause) + """ + result = {"name": user_input, "type": "container", "status": "Failed", "message": ""} + + # Validate format + is_valid, error_msg = validate_container_format(user_input) + if not is_valid: + result["message"] = error_msg + return result + + # Convert to Pulp naming convention + pulp_name = convert_to_pulp_container_name(user_input) + + # Check existence + if not container_exists(pulp_name, logger): + result["message"] = ( + f"Container not found in Pulp (looked for: {pulp_name})" + ) + return result + + try: + # Delete distributions + dist_list = run_cmd(pulp_container_commands["list_distributions"], logger) + if dist_list["rc"] == 0: + dists = safe_json_parse(dist_list["stdout"]) + for d in dists: + if d.get('name', '') == pulp_name: + run_cmd(pulp_container_commands["delete_distribution"] % d.get('name', ''), logger) + + # Delete repository + del_result = run_cmd(pulp_container_commands["delete_repository"] % pulp_name, logger) + + if del_result["rc"] == 0: + result["status"] = "Success" + result["message"] = "Container deleted" + # Update status files - remove image entries and mark software as partial + affected = 
remove_from_status_files(user_input, 'image', base_path, logger) + mark_software_partial(affected, base_path, logger, 'image') + else: + result["message"] = f"Delete failed: {del_result['stderr']}" + + except Exception as e: + result["message"] = f"Error: {str(e)}" + + return result + + +def file_exists_in_pulp(name: str, logger) -> Tuple[bool, str, str]: + """Check if file content exists in Pulp file repository. + + Returns: + Tuple of (exists, repo_name, content_href) + """ + try: + # List file repositories and search for the content + repo_list = run_cmd(pulp_file_commands["list_repositories"], logger) + if repo_list["rc"] != 0: + return False, "", "" + + repos = safe_json_parse(repo_list["stdout"]) + for repo in repos: + repo_name = repo.get('name', '') + # Check if this repo contains our file + content_list = run_cmd( + f"pulp file content list --repository {repo_name} --relative-path '{name}'", + logger + ) + if content_list["rc"] == 0: + contents = safe_json_parse(content_list["stdout"]) + if contents: + return True, repo_name, contents[0].get('pulp_href', '') + + return False, "", "" + except (OSError, ValueError): + return False, "", "" + + +def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) -> Tuple[bool, str]: + """Delete file content from Pulp. + + Returns: + Tuple of (success, message) + """ + try: + messages = [] + + # 1. Remove content from repository + if content_href: + remove_result = run_cmd( + f"pulp file repository content remove --repository {repo_name} " + f"--href {content_href}", + logger + ) + if remove_result["rc"] == 0: + messages.append("Content removed from repository") + else: + # Try alternative: modify repository to remove content + run_cmd( + f"pulp file repository content modify --repository {repo_name} " + f"--remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'", + logger + ) + + # 2. 
Delete distribution if exists + dist_result = run_cmd(pulp_file_commands["list_distributions"], logger) + if dist_result["rc"] == 0: + dists = safe_json_parse(dist_result["stdout"]) + for d in dists: + if d.get('name', '') == name or name in d.get('name', ''): + run_cmd(pulp_file_commands["delete_distribution"] % d.get('name', ''), logger) + messages.append("Distribution deleted") + + # 3. Try to delete the file repository if it's named after the artifact + repo_del = run_cmd(pulp_file_commands["delete_repository"] % name, logger) + if repo_del["rc"] == 0: + messages.append("Repository deleted") + + return True, "; ".join(messages) if messages else "Removed from Pulp" + + except Exception as e: + return False, f"Pulp deletion error: {str(e)}" + + +def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: + """Cleanup a pip module from Pulp Python repository. + + Pip modules are stored as: pip_module== + e.g., pip_modulecffi==1.17.1 + """ + result = {"name": name, "type": "pip_module", "status": "Failed", "message": ""} + messages = [] + pulp_deleted = False + content_removed = False + + try: + # Pulp Python repo name format: pip_module + # User input could be "cffi==1.17.1" or "pip_modulecffi==1.17.1" + if name.startswith("pip_module"): + pulp_repo_name = name + else: + pulp_repo_name = f"pip_module{name}" + + logger.info(f"Looking for Python repository: {pulp_repo_name}") + + # Check if repository exists + repo_check = run_cmd(pulp_python_commands["show_repository"] % pulp_repo_name, logger) + + if repo_check["rc"] == 0: + # Delete distribution first + dist_del = run_cmd(pulp_python_commands["delete_distribution"] % pulp_repo_name, logger) + if dist_del["rc"] == 0: + messages.append("Distribution deleted") + + # Delete repository + repo_del = run_cmd(pulp_python_commands["delete_repository"] % pulp_repo_name, logger) + if repo_del["rc"] == 0: + pulp_deleted = True + messages.append("Repository deleted") + + # Run orphan 
cleanup + if pulp_deleted: + logger.info("Running orphan cleanup...") + orphan_result = run_cmd(pulp_python_commands["orphan_cleanup"], logger) + if orphan_result["rc"] == 0: + messages.append("Orphan cleanup completed") + else: + # Try listing repos to find partial match + repo_list = run_cmd( + pulp_python_commands["list_repositories"], logger + ) + if repo_list["rc"] == 0: + repos = safe_json_parse(repo_list["stdout"]) + for repo in repos: + repo_name = repo.get('name', '') + if name in repo_name or repo_name == pulp_repo_name: + logger.info(f"Found matching Python repository: {repo_name}") + + dist_del = run_cmd(pulp_python_commands["delete_distribution"] % repo_name, logger) + if dist_del["rc"] == 0: + messages.append("Distribution deleted") + + repo_del = run_cmd(pulp_python_commands["delete_repository"] % repo_name, logger) + if repo_del["rc"] == 0: + pulp_deleted = True + messages.append("Repository deleted") + break + + # Update status files + if file_exists_in_status(name, base_path, logger): + affected = remove_from_status_files(name, 'pip_module', base_path, logger) + if affected: + messages.append("Status files updated") + mark_software_partial(affected, base_path, logger, 'pip_module') + + # Clean up uploaded content from filesystem + fs_result = cleanup_content_directory(name, 'pip_module', repo_store_path, logger) + if fs_result["status"] == "Success": + content_removed = True + messages.append(fs_result["message"]) + + if pulp_deleted or content_removed: + result["status"] = "Success" + result["message"] = "; ".join(messages) if messages else "Cleaned up" + else: + result["message"] = f"pip_module '{name}' not found in Pulp or filesystem" + + except Exception as e: + result["message"] = f"Error: {str(e)}" + + return result + + +def get_pulp_file_repo_name(name: str, file_type: str) -> str: + """Get the Pulp File repository name based on artifact type. 
+ + Naming conventions: + - ansible_galaxy_collection: ansible_galaxy_collection + - tarball, git, manifest, file: (as-is) + """ + if file_type == "ansible_galaxy_collection": + if name.startswith("ansible_galaxy_collection"): + return name + return f"ansible_galaxy_collection{name}" + return name + + +def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: + """Cleanup artifact from Pulp File repository. + + Handles: tarball, git, manifest, ansible_galaxy_collection + All use 'pulp file' repository type with type-specific naming conventions. + """ + result = {"name": name, "type": file_type, "status": "Failed", "message": ""} + messages = [] + pulp_deleted = False + status_removed = False + content_removed = False + + try: + # Get the expected Pulp repository name + pulp_repo_name = get_pulp_file_repo_name(name, file_type) + logger.info(f"Looking for {file_type} repository: {pulp_repo_name}") + + # Check if repository exists directly + repo_check = run_cmd(pulp_file_commands["show_repository"] % pulp_repo_name, logger) + + if repo_check["rc"] == 0: + # Found exact match - delete distribution and repository + dist_del = run_cmd(pulp_file_commands["delete_distribution"] % pulp_repo_name, logger) + if dist_del["rc"] == 0: + messages.append("Distribution deleted") + + repo_del = run_cmd(pulp_file_commands["delete_repository"] % pulp_repo_name, logger) + if repo_del["rc"] == 0: + pulp_deleted = True + messages.append("Repository deleted") + else: + # Try listing repos to find partial match + repo_list = run_cmd( + pulp_file_commands["list_repositories"], logger + ) + if repo_list["rc"] == 0: + repos = safe_json_parse(repo_list["stdout"]) + for repo in repos: + repo_name = repo.get('name', '') + if name in repo_name or repo_name == pulp_repo_name: + logger.info(f"Found matching repository: {repo_name}") + + dist_del = run_cmd(pulp_file_commands["delete_distribution"] % repo_name, logger) + if dist_del["rc"] 
== 0: + messages.append("Distribution deleted") + + repo_del = run_cmd(pulp_file_commands["delete_repository"] % repo_name, logger) + if repo_del["rc"] == 0: + pulp_deleted = True + messages.append("Repository deleted") + break + + # Run orphan cleanup to remove actual content files + if pulp_deleted: + logger.info("Running orphan cleanup to remove content files...") + orphan_result = run_cmd(pulp_file_commands["orphan_cleanup"], logger) + if orphan_result["rc"] == 0: + messages.append("Orphan cleanup completed") + else: + logger.warning(f"Orphan cleanup warning: {orphan_result['stderr']}") + + # Update status files + if file_exists_in_status(name, base_path, logger): + affected = remove_from_status_files(name, file_type, base_path, logger) + if affected: + status_removed = True + messages.append("Status files updated") + mark_software_partial(affected, base_path, logger, file_type) + + # Clean up uploaded content from filesystem + fs_result = cleanup_content_directory( + name, file_type, repo_store_path, logger + ) + if fs_result["status"] == "Success": + content_removed = True + messages.append(fs_result["message"]) + + # Determine overall result + if pulp_deleted or status_removed or content_removed: + result["status"] = "Success" + result["message"] = "; ".join(messages) if messages else "Cleaned up" + else: + result["message"] = f"{file_type} '{name}' not found in Pulp, status files, or filesystem" + + except Exception as e: + result["message"] = f"Error: {str(e)}" + + return result + + +def cleanup_file(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: + """Cleanup a file artifact. 
+ + Routes to appropriate handler: + - pip_module: Pulp Python repository + - tarball, git, manifest, ansible_galaxy_collection: Pulp File repository + """ + file_type = detect_file_type(name) + + # Handle pip modules separately - they use Python repositories + if file_type == "pip_module": + return cleanup_pip_module(name, base_path, repo_store_path, logger) + + # All other file types use Pulp File repository + return cleanup_file_repository(name, file_type, base_path, repo_store_path, logger) + + +# ============================================================================= +# FILESYSTEM CONTENT CLEANUP +# ============================================================================= + +def cleanup_content_directory(content_name: str, content_type: str, repo_store_path: str, logger) -> Dict[str, Any]: + """Remove uploaded content directory from the filesystem. + + Builds the content path the same way as download_common.py: + /offline_repo/cluster//rhel/// + + This mirrors how remove_from_status_files iterates over ARCH_SUFFIXES to + clean status.csv entries. 
+ + Args: + content_name: Name of the content item (e.g., 'helm-v3.19.0-amd64') + content_type: Directory category (tarball, git, pip_module, manifest, + ansible_galaxy_collection, rpm_file) + repo_store_path: Root store path (e.g., '/opt/omnia') + logger: Logger instance + + Returns: + Dict with name, type, status, and message keys + """ + result = {"name": content_name, "type": f"filesystem_{content_type}", + "status": "Failed", "message": ""} + removed_dirs = [] + + cluster_path = os.path.join(repo_store_path, "offline_repo", "cluster") + if not os.path.exists(cluster_path): + result["message"] = f"Content store path not found: {cluster_path}" + logger.warning(result["message"]) + return result + + try: + for arch in ARCH_SUFFIXES: + # Walk version directories (e.g., rhel/10.0) + arch_path = os.path.join(cluster_path, arch) + if not os.path.isdir(arch_path): + continue + + for version_dir in glob.glob(f"{arch_path}/rhel/*/"): + content_dir = os.path.join(version_dir, content_type, content_name) + if os.path.exists(content_dir): + logger.info(f"Removing content directory: {content_dir}") + if os.path.isdir(content_dir): + shutil.rmtree(content_dir) + else: + os.remove(content_dir) + removed_dirs.append(content_dir) + + if removed_dirs: + result["status"] = "Success" + result["message"] = f"Removed content: {', '.join(removed_dirs)}" + else: + result["message"] = (f"No filesystem content found for " + f"'{content_name}' under {content_type}") + logger.info(result["message"]) + + except Exception as e: + result["message"] = f"Filesystem cleanup error: {str(e)}" + logger.error(f"Failed to cleanup content {content_name}: {e}") + + return result + + +# ============================================================================= +# STATUS FILE UPDATES +# ============================================================================= + +def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> Dict[str, List[str]]: + """Remove RPMs that belong to a 
specific repository from status files. + + Uses the repo_name column in status.csv to accurately identify RPMs from the repository. + Now that all repo_names include architecture prefixes, the logic is simplified. + + Args: + repo_name: Repository name (e.g., 'x86_64_appstream', 'aarch64_epel') + base_path: Base path for status files + logger: Logger instance + + Returns: + Dict mapping architecture to list of affected software names + """ + affected_software = {} + logger.info(f"Removing RPMs from status.csv for repository: {repo_name}") + + # Extract architecture from repo_name (all repo_names should now have arch prefixes) + target_arch = None + for arch in ARCH_SUFFIXES: + if repo_name.startswith(f"{arch}_"): + target_arch = arch + break + + if not target_arch: + logger.error(f"Repository name {repo_name} does not have architecture prefix") + return {} + + logger.info(f"Processing architecture: {target_arch}") + affected_software[target_arch] = [] + + try: + for status_file in glob.glob(f"{base_path}/{target_arch}/*/status.csv"): + rows = [] + removed = False + has_repo_column = False + + # Check if file has repo_name column + with open(status_file, 'r', encoding='utf-8') as f: + header = f.readline().strip().lower() + has_repo_column = "repo_name" in header + + with open(status_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + name = row.get('name', '') + row_type = row.get('type', '') + rpm_repo = row.get('repo_name', '') + + logger.info(f"Processing row: {row}") + # For RPMs, check if they belong to the deleted repository + if row_type in ('rpm', 'rpm_repo', 'rpm_file'): + if has_repo_column and rpm_repo == repo_name: + removed = True + logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)") + else: + rows.append(row) + else: + rows.append(row) + + if removed and fieldnames: + with open(status_file, 'w', newline='', encoding='utf-8') as f: + writer = 
csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + # Track affected software + software_name = os.path.basename(os.path.dirname(status_file)) + if software_name not in affected_software[target_arch]: + affected_software[target_arch].append(software_name) + + return affected_software + except Exception as e: + logger.error(f"Failed to remove RPMs from repository {repo_name}: {e}") + return {} + +def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> Dict[str, List[str]]: + """Remove artifact from status.csv files and return affected software names by architecture. + + Args: + artifact_name: Name of the artifact to remove + artifact_type: Type of artifact (git, tarball, pip_module) + base_path: Base path for status files + logger: Logger instance + + Returns: + Dict mapping architecture to list of affected software names + """ + affected_software = {} + try: + for arch in ARCH_SUFFIXES: + arch_affected = [] + for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"): + rows = [] + removed = False + with open(status_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + name = row.get('name', '') + row_type = row.get('type', '') + # Match logic based on type + should_remove = False + if artifact_type == 'image': + # Container images: match with or without tag + should_remove = (name == artifact_name or name.startswith(f"{artifact_name}:")) + else: + # Other types: exact match + should_remove = (name == artifact_name) + + if should_remove: + removed = True + logger.info(f"Removing '{name}' from {status_file}") + else: + rows.append(row) + + if removed and fieldnames: + with open(status_file, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + # Track affected software + software_name = 
os.path.basename(os.path.dirname(status_file)) + if software_name not in arch_affected: + arch_affected.append(software_name) + + if arch_affected: + affected_software[arch] = arch_affected + + logger.info(f"remove_from_status_files returning: {affected_software}") + return affected_software + except OSError as e: + logger.error(f"Failed to remove from status files: {e}") + return {} + + +def mark_software_partial(affected_software, base_path: str, logger, artifact_type: str = None): + """Mark software entries as partial in software.csv. + + Args: + affected_software: Either a List[str] of software names (legacy support) + or a Dict[str, List[str]] mapping arch to software names + base_path: Base path for software.csv + logger: Logger instance + artifact_type: Type of artifact being removed (for logging purposes) + """ + logger.info(f"mark_software_partial called with affected_software: {affected_software}") + if not affected_software: + logger.info("No affected software to mark as partial") + return + + # Normalize input: convert to arch_software_map if needed + if isinstance(affected_software, list): + # Legacy list input - this should not happen with new remove_rpms_from_repository + # but we keep it for backward compatibility + logger.warning("Received list input to mark_software_partial, applying to all architectures (legacy behavior)") + arch_software_map = {arch: affected_software for arch in ARCH_SUFFIXES} + else: + arch_software_map = affected_software + + try: + for arch, software_names in arch_software_map.items(): + if not software_names: + continue + + software_file = f"{base_path}/{arch}/software.csv" + logger.info(f"Looking for software file: {software_file}") + if not os.path.exists(software_file): + logger.warning(f"Software file not found: {software_file}") + continue + + rows = [] + updated = False + with open(software_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + if 
row.get('name') in software_names: + row['status'] = 'partial' + updated = True + logger.info(f"Marked '{row.get('name')}' as partial in {arch}/software.csv ({artifact_type} cleanup)") + rows.append(row) + + if fieldnames and rows and updated: + with open(software_file, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + logger.info(f"Successfully wrote updated software.csv for {arch}") + except OSError as e: + logger.error(f"Failed to update software.csv: {e}") + +def software_has_rpms(software_name: str, arch: str, base_path: str, logger) -> bool: + """Check if a software has any RPM dependencies in its status.csv. + + Args: + software_name: Name of the software + arch: Architecture (x86_64 or aarch64) + base_path: Base path for status files + logger: Logger instance + + Returns: + True if software has RPM entries, False otherwise + """ + status_file = f"{base_path}/{arch}/{software_name}/status.csv" + if not os.path.exists(status_file): + return False + + try: + with open(status_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + if row.get('type', '').lower() in ('rpm', 'rpm_repo'): + return True + return False + except OSError as e: + logger.error(f"Error checking RPMs for {software_name}: {e}") + return False + + +def mark_all_software_partial(base_path: str, logger): + """Mark software entries as partial in software.csv for all architectures. + + This is called when cleanup_repos=all to mark software as partial + since all RPM repositories are being deleted. + Only marks software that actually has RPM dependencies. 
+ + Args: + base_path: Base path for software.csv files + logger: Logger instance + """ + logger.info("Marking software with RPM dependencies as partial (cleanup_repos=all)") + try: + for arch in ARCH_SUFFIXES: + software_file = f"{base_path}/{arch}/software.csv" + logger.info( + f"Processing software file: {software_file}" + ) + + if not os.path.exists(software_file): + logger.info(f"Software file not found: {software_file}") + continue + + rows = [] + updated = False + with open(software_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + software_name = row.get('name', '') + if row.get('status') == 'success': + # Only mark as partial if software has RPM dependencies + if software_has_rpms(software_name, arch, base_path, logger): + row['status'] = 'partial' + updated = True + logger.info(f"Marked '{software_name}' as partial in {arch}/software.csv (has RPM deps)") + else: + logger.info(f"Skipping '{software_name}' - no RPM dependencies") + rows.append(row) + + if fieldnames and rows and updated: + with open(software_file, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + logger.info(f"Successfully updated {software_file}") + except OSError as e: + logger.error(f"Failed to mark all software as partial: {e}") + +def write_cleanup_status(results: List[Dict], base_path: str): + """Write cleanup results to status file.""" + status_file = f"{base_path}/cleanup_status.csv" + os.makedirs(os.path.dirname(status_file), exist_ok=True) + + with open(status_file, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=['name', 'type', 'status', 'message']) + writer.writeheader() + writer.writerows(results) + + return status_file + + +# ============================================================================= +# MAIN MODULE +# 
============================================================================= + +def run_module(): + """Main module execution.""" + module = AnsibleModule( + argument_spec=dict( + cleanup_repos=dict(type='list', elements='str', default=[]), + cleanup_containers=dict(type='list', elements='str', default=[]), + cleanup_files=dict(type='list', elements='str', default=[]), + base_path=dict( + type='str', default=CLEANUP_BASE_PATH_DEFAULT + ), + repo_store_path=dict( + type='str', default='/opt/omnia' + ) + ), + supports_check_mode=True + ) + + cleanup_repos = module.params['cleanup_repos'] + cleanup_containers = module.params['cleanup_containers'] + cleanup_files = module.params['cleanup_files'] + base_path = module.params['base_path'] + repo_store_path = module.params['repo_store_path'] + + # Setup logger - setup_standard_logger expects a directory, creates standard.log inside + log_dir = os.path.join(base_path, "cleanup") + os.makedirs(base_path, exist_ok=True) + logger = setup_standard_logger(log_dir) + + # Handle 'all' keyword for repositories only + cleanup_all_repos = ( + cleanup_repos and len(cleanup_repos) == 1 and + cleanup_repos[0].lower() == 'all' + ) + #if cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all': + if cleanup_all_repos: + logger.info("cleanup_repos='all' - fetching all repositories from Pulp") + cleanup_repos = get_all_repositories(logger) + if not cleanup_repos: + module.fail_json( + msg="Failed to retrieve repository list from Pulp. " + "Please check if Pulp services are running." 
+ ) + logger.info(f"Found {len(cleanup_repos)} repositories to cleanup: {cleanup_repos}") + + logger.info( + f"Starting cleanup - repos: {cleanup_repos}, " + f"containers: {cleanup_containers}, files: {cleanup_files}" + ) + + all_results = [] + + # Process repositories + for repo in cleanup_repos: + result = cleanup_repository(repo, base_path, logger) + all_results.append(result) + logger.info(f"Repository {repo}: {result['status']} - {result['message']}") + + # If cleanup_repos=all, mark software with RPM dependencies as partial + if cleanup_all_repos and any(r['status'] == 'Success' for r in all_results if r['type'] == 'repository'): + mark_all_software_partial(base_path, logger) + + # Process containers + for container in cleanup_containers: + result = cleanup_container(container, base_path, logger) + all_results.append(result) + logger.info(f"Container {container}: {result['status']} - {result['message']}") + + # Process files + for file in cleanup_files: + result = cleanup_file(file, base_path, repo_store_path, logger) + all_results.append(result) + logger.info(f"File {file}: {result['status']} - {result['message']}") + + # Write status file + status_file = write_cleanup_status(all_results, base_path) + + # Calculate summary + total = len(all_results) + success = len([r for r in all_results if r['status'] == 'Success']) + failed = len([r for r in all_results if r['status'] == 'Failed']) + + # Generate pretty table + pretty_table = format_pretty_table(all_results) + + logger.info(f"Cleanup completed - Total: {total}, Success: {success}, Failed: {failed}") + + module.exit_json( + changed=success > 0, + results=all_results, + total=total, + success_count=success, + failed_count=failed, + summary=f"Total: {total}, Success: {success}, Failed: {failed}", + pretty_table=pretty_table, + pretty_table_lines=pretty_table.split('\n'), + status_file=status_file + ) + + +if __name__ == '__main__': + run_module() diff --git a/common/library/modules/slurm_conf.py 
b/common/library/modules/slurm_conf.py index 9b9441e493..78a4315244 100644 --- a/common/library/modules/slurm_conf.py +++ b/common/library/modules/slurm_conf.py @@ -12,6 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +from collections import OrderedDict +from ansible.module_utils.basic import AnsibleModule +from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import ( + SlurmParserEnum, + all_confs, + parse_slurm_conf +) + DOCUMENTATION = r''' --- module: slurm_conf @@ -134,12 +143,6 @@ # - Hostlist expressions, split and merge computations -from collections import OrderedDict -from ansible.module_utils.basic import AnsibleModule -from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import SlurmParserEnum, all_confs -import os - - def read_dict2ini(conf_dict): """Convert a configuration dictionary to INI-style lines for slurm.conf.""" data = [] @@ -147,7 +150,6 @@ def read_dict2ini(conf_dict): if isinstance(v, list): for dct_item in v: if isinstance(dct_item, dict): - # TODO: Ordered dict, move the key to the top od = OrderedDict(dct_item) od.move_to_end(k, last=False) # Move k to the beginning data.append( @@ -159,46 +161,7 @@ def read_dict2ini(conf_dict): return data -def parse_slurm_conf(file_path, conf_name, validate): - """Parses the slurm.conf file and returns it as a dictionary.""" - current_conf = all_confs.get(conf_name, {}) - slurm_dict = OrderedDict() - - if not os.path.exists(file_path): - raise FileNotFoundError(f"{file_path} not found.") - - with open(file_path, 'r', encoding='utf-8') as f: - for line in f: - # handles any comment after the data - line = line.split('#')[0].strip() - if not line: - continue - # Split the line by one or more spaces - items = line.split() - tmp_dict = OrderedDict() - for item in items: - # Split only on the first '=' to allow '=' inside the value - key, value = item.split('=', 1) - tmp_dict[key.strip()] = 
value.strip() - skey = list(tmp_dict.keys())[0] - if validate and skey not in current_conf: - raise ValueError(f"Invalid key while parsing {file_path}: {skey}") - if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY: - slurm_dict[list(tmp_dict.keys())[0]] = list( - slurm_dict.get(list(tmp_dict.keys())[0], [])) + [tmp_dict] - elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV: - existing_values = [v.strip() for v in slurm_dict.get(skey, "").split(',') if v.strip()] - new_values = [v.strip() for v in tmp_dict[skey].split(',') if v.strip()] - slurm_dict[skey] = ",".join(list(dict.fromkeys(existing_values + new_values))) - elif current_conf.get(skey) == SlurmParserEnum.S_P_LIST: - slurm_dict[skey] = list(slurm_dict.get(skey, [])) + list(tmp_dict.values()) - else: - slurm_dict.update(tmp_dict) - - return slurm_dict - - -def slurm_conf_dict_merge(conf_dict_list, conf_name): +def slurm_conf_dict_merge(conf_dict_list, conf_name, replace): """Merge multiple Slurm configuration dictionaries into a single dictionary.""" merged_dict = OrderedDict() current_conf = all_confs.get(conf_name, {}) @@ -210,10 +173,10 @@ def slurm_conf_dict_merge(conf_dict_list, conf_name): existing_dict = merged_dict.get(ky, {}) inner_dict = existing_dict.get(item.get(ky), {}) # Get the sub-options for this array type (e.g., nodename_options, partition_options) - sub_options = all_confs.get(ky, {}) + sub_options = all_confs.get(f"{conf_name}->{ky}", {}) # Merge item into inner_dict, handling CSV fields specially for k, v in item.items(): - if sub_options.get(k) == SlurmParserEnum.S_P_CSV and k in inner_dict: + if sub_options.get(k) == SlurmParserEnum.S_P_CSV and k in inner_dict and not replace: # Merge CSV values existing_values = [val.strip() for val in inner_dict[k].split(',') if val.strip()] new_values = [val.strip() for val in v.split(',') if val.strip()] @@ -230,7 +193,7 @@ def slurm_conf_dict_merge(conf_dict_list, conf_name): else: new_items = [vl] merged_dict[ky] = 
list(dict.fromkeys(existing_list + new_items)) - elif current_conf.get(ky) == SlurmParserEnum.S_P_CSV: + elif current_conf.get(ky) == SlurmParserEnum.S_P_CSV and not replace: existing_values = [v.strip() for v in merged_dict.get(ky, "").split(',') if v.strip()] new_values = [v.strip() for v in vl.split(',') if v.strip()] merged_dict[ky] = ",".join(list(dict.fromkeys(existing_values + new_values))) @@ -252,7 +215,8 @@ def run_module(): "conf_map": {'type': 'dict', 'default': {}}, "conf_sources": {'type': 'list', 'elements': 'raw', 'default': []}, "conf_name": {'type': 'str', 'default': 'slurm'}, - "validate": {'type': 'bool', 'default': False} + "validate": {'type': 'bool', 'default': False}, + "replace": {'type': 'bool', 'default': False} } result = {"changed": False, "failed": False} @@ -267,9 +231,12 @@ def run_module(): try: conf_name = module.params['conf_name'] validate = module.params['validate'] + replace = module.params['replace'] # Parse the slurm.conf file if module.params['op'] == 'parse': - s_dict = parse_slurm_conf(module.params['path'], conf_name, validate) + s_dict, dup_keys = parse_slurm_conf(module.params['path'], conf_name, validate) + if dup_keys: + module.fail_json(msg=f"Duplicate keys found in {module.params['path']}: {dup_keys}") result['conf_dict'] = s_dict elif module.params['op'] == 'render': s_list = read_dict2ini(module.params['conf_map']) @@ -282,11 +249,13 @@ def run_module(): elif isinstance(conf_source, str): if not os.path.exists(conf_source): raise FileNotFoundError(f"File {conf_source} does not exist") - s_dict = parse_slurm_conf(conf_source, conf_name, validate) + s_dict, dup_keys = parse_slurm_conf(conf_source, conf_name, validate) + if dup_keys: + module.fail_json(msg=f"Duplicate keys found in {conf_source}: {dup_keys}") conf_dict_list.append(OrderedDict(s_dict)) else: raise TypeError(f"Invalid type for conf_source: {type(conf_source)}") - merged_dict = slurm_conf_dict_merge(conf_dict_list, conf_name) + merged_dict = 
slurm_conf_dict_merge(conf_dict_list, conf_name, replace) result['conf_dict'] = merged_dict result['ini_lines'] = read_dict2ini(merged_dict) except (FileNotFoundError, ValueError, TypeError, AttributeError) as e: diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 75efadb47c..40fd00123c 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../utils/include_input_dir.yml diff --git a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml index 2fecb895e8..d4e8425749 100644 --- a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml +++ b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml @@ -42,3 +42,12 @@ ansible.builtin.debug: var: additional_images_dict verbosity: 2 + +- name: Read local_repo_config.yml + ansible.builtin.include_vars: + file: "{{ local_repo_config_path }}" + name: local_repo_config + +- name: Set fact for user_registry + ansible.builtin.set_fact: + user_registry: "{{ local_repo_config.user_registry | default([]) }}" diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index de236ed958..8918f03050 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -70,6 +70,13 @@ done fi + - path: /root/.ssh/config + permissions: '0600' + content: | + 
Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa + IdentitiesOnly yes + - path: /usr/local/bin/install_cuda_toolkit.sh permissions: '0755' content: | @@ -98,7 +105,7 @@ echo "[INFO] Setting up shared CUDA directory..." # Create and mount shared directory for compute nodes mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit if [ $? -ne 0 ]; then echo "[ERROR] Failed to mount NFS cuda share. Exiting." @@ -183,6 +190,18 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} + - path: /usr/local/bin/install_openmpi.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_openmpi.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_ucx.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} + - path: /etc/hosts append: true content: | @@ -190,6 +209,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -200,38 +225,86 @@ permissions: '0644' content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} + + - path: /usr/local/bin/install_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvhpc_sdk.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure_nvhpc_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/configure_nvhpc_env.sh.j2') | indent(12) }} runcmd: - /usr/local/bin/set-ssh.sh 
- /usr/local/bin/install_cuda_toolkit.sh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' 
/etc/dnf/dnf.conf + +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} + # Add NFS entry and mount + - mkdir -p {{ client_mount_path }} + - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab + - mount -a +{% endif %} + +{% if hostvars['localhost']['ucx_support'] %} + - echo "===== UCX Setup =====" + - echo "UCX support is enabled." + - /usr/local/bin/install_ucx.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_ucx.sh" + # - echo "NFS must be mounted at {{ client_mount_path }} before running." +{% endif %} + +{% if hostvars['localhost']['openmpi_support'] %} + - echo "===== OpenMPI Setup =====" + - echo "OpenMPI support is enabled." + - /usr/local/bin/install_openmpi.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_openmpi.sh" + # - echo "Run UCX installation first if UCX support is enabled." + # - echo "NFS must be mounted at {{ client_mount_path }} before running." 
+{% endif %} + +{% if hostvars['localhost']['ldms_support'] %} + - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log + - /root/ldms_sampler.sh +{% endif %} + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - - mkdir -p /var/spool/slurmd - - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd +{% for epath in slurm_epilog_custom_paths %} + - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi' +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + - bash -c 'if [ ! 
-f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi' +{% endfor %} + - mkdir -p {{ slurm_slurmd_spool_dir_effective }} + - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - setenforce 0 - systemctl enable firewalld - systemctl start firewalld @@ -279,79 +352,7 @@ {% endif %} -{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} - # Add NFS entry and mount - - mkdir -p {{ client_mount_path }} - - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab - - mount -a -{% endif %} - -{% if hostvars['localhost']['ucx_support'] %} - # UCX build and install - - | - UCX_BIN={{ client_mount_path }}/benchmarks/ucx - mkdir -p {{ client_mount_path }}/compile/ucx - mkdir -p {{ client_mount_path }}/benchmarks/ucx - cd {{ client_mount_path }}/compile/ucx - wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/aarch64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz -O ucx.tar.gz - tar xzf ucx.tar.gz - cd ucx-* - mkdir -p build - cd build - ../contrib/configure-release --prefix={{ client_mount_path }}/benchmarks/ucx - make -j 8 - make install -{% endif %} - -{% if hostvars['localhost']['openmpi_support'] %} - # OpenMPI build and install with UCX + Slurm detection - - | - OPENMPI_INSTALL_PREFIX="{{ client_mount_path }}/benchmarks/openmpi" - OPENMPI_SRC="{{ client_mount_path }}/compile/openmpi" - mkdir -p $OPENMPI_SRC - mkdir -p $OPENMPI_INSTALL_PREFIX - - cd $OPENMPI_SRC - wget --no-check-certificate 
https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/aarch64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz -O openmpi.tar.gz - - tar xzf openmpi.tar.gz - cd openmpi-* - mkdir -p build - - # Check Slurm - if sinfo >/dev/null 2>&1; then - SLURM_FLAG="--with-slurm=yes --with-munge=/usr" - else - SLURM_FLAG="--with-slurm=no" - fi - - # Check UCX - if [ -x "{{ client_mount_path }}/benchmarks/ucx/bin/ucx_info" ]; then - {{ client_mount_path }}/benchmarks/ucx/bin/ucx_info -v - if [ $? -eq 0 ]; then - UCX_FLAG="--with-ucx={{ client_mount_path }}/benchmarks/ucx" - else - echo "ucx_info failed, disabling UCX" - UCX_FLAG="" - fi - else - echo "ucx_info not found, disabling UCX" - UCX_FLAG="" - fi - - cd build - ../configure --prefix=$OPENMPI_INSTALL_PREFIX \ - --enable-mpi1-compatibility \ - --enable-prte-prefix-by-default \ - $SLURM_FLAG $UCX_FLAG 2>&1 | tee config.out - - make -j 8 - make install -{% endif %} - -{% if hostvars['localhost']['ldms_support'] %} - - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - - - /root/ldms_sampler.sh -{% endif %} + # nvidia sdk install + - /usr/local/bin/install_nvhpc_sdk.sh + - /usr/local/bin/configure_nvhpc_env.sh - echo "Cloud-Init has completed successfully." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 3195fad9e3..51121a2e82 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -105,7 +105,7 @@ echo "[INFO] Setting up shared CUDA directory..." 
# Create and mount shared directory for compute nodes mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit if [ $? -ne 0 ]; then echo "[ERROR] Failed to mount NFS cuda share. Exiting." @@ -190,6 +190,18 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} + - path: /usr/local/bin/install_openmpi.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_openmpi.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_ucx.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} + - path: /etc/hosts append: true content: | @@ -197,6 +209,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -207,6 +225,18 @@ permissions: '0644' content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} + + - path: /usr/local/bin/install_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvhpc_sdk.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure_nvhpc_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/configure_nvhpc_env.sh.j2') | indent(12) }} runcmd: - /usr/local/bin/set-ssh.sh @@ -214,11 +244,10 @@ # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab @@ -230,20 +259,56 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} + # Add NFS entry and mount + - mkdir -p {{ client_mount_path }} + - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs 
defaults,_netdev 0 0" >> /etc/fstab + - mount -a +{% endif %} + +{% if hostvars['localhost']['ucx_support'] %} + - echo "===== UCX Setup =====" + - echo "UCX support is enabled." + - /usr/local/bin/install_ucx.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_ucx.sh" + # - echo "NFS must be mounted at {{ client_mount_path }} before running." +{% endif %} + +{% if hostvars['localhost']['openmpi_support'] %} + - echo "===== OpenMPI Setup =====" + - echo "OpenMPI support is enabled." + - /usr/local/bin/install_openmpi.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_openmpi.sh" + # - echo "Run UCX installation first if UCX support is enabled." + # - echo "NFS must be mounted at {{ client_mount_path }} before running." +{% endif %} + +{% if hostvars['localhost']['ldms_support'] %} + - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log + - /root/ldms_sampler.sh +{% endif %} + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} - chmod {{ file_mode_400 }} 
/etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - - mkdir -p /var/spool/slurmd - - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd +{% for epath in slurm_epilog_custom_paths %} + - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi' +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + - bash -c 'if [ ! -f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi' +{% endfor %} + - mkdir -p {{ slurm_slurmd_spool_dir_effective }} + - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - setenforce 0 - systemctl enable firewalld - systemctl start firewalld @@ -291,79 +356,8 @@ {% endif %} -{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} - # Add NFS entry and mount - - mkdir -p {{ client_mount_path }} - - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab - - mount -a -{% endif %} - -{% if hostvars['localhost']['ucx_support'] %} - # UCX build and install - - | - UCX_BIN={{ client_mount_path }}/benchmarks/ucx - mkdir -p {{ client_mount_path }}/compile/ucx - mkdir -p {{ client_mount_path }}/benchmarks/ucx - cd {{ client_mount_path }}/compile/ucx - wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ 
hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz -O ucx.tar.gz - tar xzf ucx.tar.gz - cd ucx-* - mkdir -p build - cd build - ../contrib/configure-release --prefix={{ client_mount_path }}/benchmarks/ucx - make -j 8 - make install -{% endif %} - -{% if hostvars['localhost']['openmpi_support'] %} - # OpenMPI build and install with UCX + Slurm detection - - | - OPENMPI_INSTALL_PREFIX="{{ client_mount_path }}/benchmarks/openmpi" - OPENMPI_SRC="{{ client_mount_path }}/compile/openmpi" - mkdir -p $OPENMPI_SRC - mkdir -p $OPENMPI_INSTALL_PREFIX - - cd $OPENMPI_SRC - wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz -O openmpi.tar.gz - - tar xzf openmpi.tar.gz - cd openmpi-* - mkdir -p build - - # Check Slurm - if sinfo >/dev/null 2>&1; then - SLURM_FLAG="--with-slurm=yes --with-munge=/usr" - else - SLURM_FLAG="--with-slurm=no" - fi - - # Check UCX - if [ -x "{{ client_mount_path }}/benchmarks/ucx/bin/ucx_info" ]; then - {{ client_mount_path }}/benchmarks/ucx/bin/ucx_info -v - if [ $? 
-eq 0 ]; then - UCX_FLAG="--with-ucx={{ client_mount_path }}/benchmarks/ucx" - else - echo "ucx_info failed, disabling UCX" - UCX_FLAG="" - fi - else - echo "ucx_info not found, disabling UCX" - UCX_FLAG="" - fi - - cd build - ../configure --prefix=$OPENMPI_INSTALL_PREFIX \ - --enable-mpi1-compatibility \ - --enable-prte-prefix-by-default \ - $SLURM_FLAG $UCX_FLAG 2>&1 | tee config.out - - make -j 8 - make install -{% endif %} -{% if hostvars['localhost']['ldms_support'] %} - - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - - - /root/ldms_sampler.sh -{% endif %} + # nvidia sdk install + - /usr/local/bin/install_nvhpc_sdk.sh + - /usr/local/bin/configure_nvhpc_env.sh - echo "Cloud-Init has completed successfully." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index f869d7d8fe..4aacc2222d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -102,6 +102,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -116,11 +122,10 @@ runcmd: - /usr/local/bin/set-ssh.sh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ 
slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab @@ -132,19 +137,23 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - chown -R {{ 
munge_user }}:{{ munge_group }} /etc/munge/munge.key - - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - - mkdir -p /var/spool/slurmd - - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd +{% for epath in slurm_epilog_custom_paths %} + - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi' +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + - bash -c 'if [ ! -f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi' +{% endfor %} + - mkdir -p {{ slurm_slurmd_spool_dir_effective }} + - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 82646da1c6..524553bd55 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -108,6 +108,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - 
path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -123,10 +129,10 @@ - /usr/local/bin/set-ssh.sh # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd /var/spool/slurmd nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab @@ -142,20 +148,24 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' 
/etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh - - mkdir -p /var/spool/slurmd - - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd +{% for epath in slurm_epilog_custom_paths %} + - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi' +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + - bash -c 'if [ ! 
-f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi' +{% endfor %} + - mkdir -p {{ slurm_slurmd_spool_dir_effective }} + - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index b8b71bf099..b98df53d7d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -169,6 +169,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} - path: /tmp/kube-vip.yaml owner: root:root @@ -415,13 +425,12 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - mv /tmp/generate-control-plane-join.sh {{ k8s_client_mount_path }} 
- - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_control_plane_first' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index f3ba7a7330..922f63f852 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -147,6 +147,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} - path: /tmp/kube-vip.yaml owner: root:root permissions: '0644' @@ -323,12 +333,11 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_control_plane' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index b380030ddd..df98035baa 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -146,7 +146,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} runcmd: - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" @@ -226,12 +235,11 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_node' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 2f2721d7eb..d5f9ef9ba6 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -92,7 +92,7 @@ PORTALS=({% for ip in powervault_config.ip %}"{{ ip }}" {% endfor %}) PORT="{{ 
powervault_config.port | default(3260) }}" - INITIATOR_IQN="{{ powervault_config.isci_initiators | default('') }}" + INITIATOR_IQN="{{ powervault_config.iscsi_initiator | default('') }}" VOLUME_ID="{{ powervault_config.volume_id | default('') }}" FS_TYPE="{{ powervault_config.fs_type | default('xfs') }}" MOUNT_OPTS="{{ powervault_config.mount_options | default('defaults,_netdev,noatime') }}" @@ -340,7 +340,9 @@ chown -R {{ mysql_user }}:{{ mysql_group }} /var/lib/mysql chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/mariadb chown -R {{ slurm_user }}:{{ slurm_user }} /etc/my.cnf.d # Required? why slurm user for my.cnf?? - chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }} + chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }} + #firewall systemctl enable firewalld systemctl start firewalld @@ -469,15 +471,18 @@ # slurm user and group created in the users module # Create directories for nfs and mount all - - mkdir -p /var/log/slurm /etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + - mkdir -p {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }} {{ slurm_ctld_pid_dir_effective }} {{ slurmdbd_pid_dir_effective }} {{ slurm_state_save_location_effective }} {% if slurm_sched_log_dir_effective %}{{ slurm_sched_log_dir_effective }} {% endif %}/etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm /etc/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ 
cloud_init_nfs_path }}/$(hostname -s)/etc/my.cnf.d /etc/my.cnf.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/mariadb /var/log/mariadb nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_ctld_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab +{% if slurmdbd_log_dir_effective != slurm_ctld_log_dir_effective %} + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurmdbd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab +{% endif %} {% if powervault_config is not defined %} - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/lib/mysql /var/lib/mysql nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmctld /var/spool/slurmctld nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmctld {{ slurm_state_save_location_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab {% endif %} - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab @@ -553,4 +558,6 @@ - /root/ldms_sampler.sh {% endif %} + - systemctl restart slurmdbd + - systemctl restart slurmctld - echo "Cloud-Init has completed successfully." 
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index cc784bdd10..dacade639b 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -72,6 +72,13 @@ done fi + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa + IdentitiesOnly yes + - path: /usr/local/bin/install_nvidia_driver.sh permissions: '0755' content: | @@ -108,6 +115,7 @@ bash /gpu-runfile/{{ cuda_runfile_aarch64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then echo "[SUCCESS] NVIDIA driver installed successfully." + nvidia-smi -pm 1 else echo "[ERROR] NVIDIA driver installation failed." 
fi @@ -127,7 +135,7 @@ # Create mount point mkdir -p /usr/local/cuda - cuda_nfs_share="{{ cloud_init_nfs_path }}/cuda" + cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda" echo "[INFO] Mounting CUDA toolkit from NFS: $cuda_nfs_share" mount -t nfs "$cuda_nfs_share" /usr/local/cuda @@ -237,16 +245,16 @@ echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge (aarch64) =====" echo "[INFO] Creating base directories for Slurm and Munge" - mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" 
>> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab @@ -269,27 +277,46 @@ echo "[INFO] ===== Starting slurmd setup (service file, directories, epilog) (aarch64) =====" - echo "[INFO] Copying slurmd.service into /usr/lib/systemd/system/" - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ bash /usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] Setting permissions for Slurm directories" - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] Ensuring Slurm epilog directory and logout script permissions" chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh +{% for epath in slurm_epilog_custom_paths %} + + echo "[INFO] Checking custom epilog script: {{ epath }}" + if [ ! 
-f "{{ epath }}" ]; then + echo "[INFO] Creating stub epilog script at {{ epath }}" + mkdir -p "$(dirname '{{ epath }}')" + printf '#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n' > "{{ epath }}" + chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}" + chmod {{ file_mode_755 }} "{{ epath }}" + fi +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + + echo "[INFO] Checking custom prolog script: {{ ppath }}" + if [ ! -f "{{ ppath }}" ]; then + echo "[INFO] Creating stub prolog script at {{ ppath }}" + mkdir -p "$(dirname '{{ ppath }}')" + printf '#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n' > "{{ ppath }}" + chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}" + chmod {{ file_mode_755 }} "{{ ppath }}" + fi +{% endfor %} - echo "[INFO] Creating and configuring /var/spool/slurmd" - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd + echo "[INFO] Creating and configuring slurmd spool directory" + mkdir -p {{ slurm_slurmd_spool_dir_effective }} + chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] ===== Completed slurmd setup (aarch64) =====" @@ -386,6 +413,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -396,6 +429,24 @@ permissions: '0644' content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} + + - path: /usr/local/bin/configure_ucx_openmpi_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/configure_ucx_openmpi_env.sh.j2') | indent(12) }} + + - 
path: /usr/local/bin/setup_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/setup_nvhpc_sdk.sh.j2') | indent(12) }} + + - path: /usr/local/bin/export_nvhpc_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/export_nvhpc_env.sh.j2') | indent(12) }} runcmd: - /usr/local/bin/set-ssh.sh @@ -443,9 +494,20 @@ - mount -a {% endif %} +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] %} + - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." + - /usr/local/bin/configure_ucx_openmpi_env.sh + +{% endif %} + {% if hostvars['localhost']['ldms_support'] %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - /root/ldms_sampler.sh {% endif %} + + - /usr/local/bin/setup_nvhpc_sdk.sh + - /usr/local/bin/export_nvhpc_env.sh + - systemctl restart slurmd + - echo "Cloud-Init has completed successfully." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 5128aee1d1..d21fcf9c5c 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -116,6 +116,7 @@ bash /gpu-runfile/{{ cuda_runfile_x86_64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then echo "[SUCCESS] NVIDIA driver installed successfully." + nvidia-smi -pm 1 else echo "[ERROR] NVIDIA driver installation failed." 
fi @@ -135,7 +136,7 @@ # Create mount point mkdir -p /usr/local/cuda - cuda_nfs_share="{{ cloud_init_nfs_path }}/cuda" + cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda" echo "[INFO] Mounting CUDA toolkit from NFS: $cuda_nfs_share" mount -t nfs "$cuda_nfs_share" /usr/local/cuda @@ -243,6 +244,11 @@ {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' @@ -256,12 +262,12 @@ # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) mkdir -p {{ client_mount_path }}/slurm/ssh echo "[INFO] Creating base directories for Pulp cert, Slurm and Munge" - mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd /var/spool/slurmd nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" 
>> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab @@ -287,27 +293,46 @@ echo "[INFO] ===== Starting slurmd setup (service file, directories, epilog) =====" - echo "[INFO] Copying slurmd.service into /usr/lib/systemd/system/" - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ bash /usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] Setting permissions for Slurm directories" - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] Ensuring Slurm epilog directory and logout script permissions" chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh +{% for epath in slurm_epilog_custom_paths %} + + echo "[INFO] Checking custom epilog script: {{ epath }}" + if [ ! 
-f "{{ epath }}" ]; then + echo "[INFO] Creating stub epilog script at {{ epath }}" + mkdir -p "$(dirname '{{ epath }}')" + printf '#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n' > "{{ epath }}" + chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}" + chmod {{ file_mode_755 }} "{{ epath }}" + fi +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + + echo "[INFO] Checking custom prolog script: {{ ppath }}" + if [ ! -f "{{ ppath }}" ]; then + echo "[INFO] Creating stub prolog script at {{ ppath }}" + mkdir -p "$(dirname '{{ ppath }}')" + printf '#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n' > "{{ ppath }}" + chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}" + chmod {{ file_mode_755 }} "{{ ppath }}" + fi +{% endfor %} - echo "[INFO] Creating and configuring /var/spool/slurmd" - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd + echo "[INFO] Creating and configuring slurmd spool directory" + mkdir -p {{ slurm_slurmd_spool_dir_effective }} + chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] ===== Completed slurmd setup =====" @@ -408,6 +433,24 @@ permissions: '0644' content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} + + - path: /usr/local/bin/configure_ucx_openmpi_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/configure_ucx_openmpi_env.sh.j2') | indent(12) }} + + - path: /usr/local/bin/setup_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/setup_nvhpc_sdk.sh.j2') | indent(12) }} + + - path: /usr/local/bin/export_nvhpc_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ 
lookup('template', 'templates/hpc_tools/export_nvhpc_env.sh.j2') | indent(12) }} runcmd: - /usr/local/bin/set-ssh.sh @@ -455,6 +498,15 @@ - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab - mount -a + # - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." + # - /usr/local/bin/configure_ucx_openmpi_env.sh + +{% endif %} + +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] %} + - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." + - /usr/local/bin/configure_ucx_openmpi_env.sh + {% endif %} {% if hostvars['localhost']['ldms_support'] %} @@ -462,4 +514,8 @@ - /root/ldms_sampler.sh {% endif %} + - /usr/local/bin/setup_nvhpc_sdk.sh + - /usr/local/bin/export_nvhpc_env.sh + - systemctl restart slurmd + - echo "Cloud-Init has completed successfully." diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 index 1cb95d6f9b..249b90b6a5 100644 --- a/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 +++ b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 @@ -1,6 +1,12 @@ #!/bin/bash set -euo pipefail +# Check if Mellanox hardware is present +if ! lspci | grep -i 'mellanox'; then + echo "No Mellanox RDMA hardware detected. Skipping IB network configuration." 
+ exit 0 +fi + ADMIN_NIC_IP="{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}" NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 index 111abcb3a1..db8a7cb9cc 100644 --- a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 +++ b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 @@ -44,9 +44,6 @@ else dnf install -y kernel-headers-$(uname -r) fi -echo "Bootstrap doca-ofed package..." -rpm -i "/var/lib/packages/${arch}/doca-ofed/doca-host-3.2.1-044000_25.10_rhel10.${arch}.rpm" - echo "Installing doca-ofed..." if rpm -q doca-ofed >/dev/null 2>&1; then echo "doca-ofed package is already installed." diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 new file mode 100644 index 0000000000..958ac6e27c --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 @@ -0,0 +1,75 @@ +#!/bin/bash +set -e + +LOGFILE="/var/log/nvhpc_env_config.log" +exec >> "$LOGFILE" 2>&1 + +echo "===== Configuring NVIDIA HPC SDK environment =====" + +# Cloud-init safe defaults +export HOME=/root + +NVCOMPILERS="/opt/nvidia/nvhpc" +NVARCH="$(uname -s)_$(uname -m)" +sys_arch="$(uname -m)" +case "${sys_arch}" in + x86_64|amd64) arch="x86_64" ;; + aarch64|arm64) arch="aarch64" ;; +esac + +# Select package name based on detected architecture (rendered from slurm_config vars) +case "${arch}" in + x86_64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_x86_64 }}" ;; + aarch64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_aarch64 }}" ;; +esac + +# Derive version from package name +NVHPC_VERSION=$(echo "$NVHPC_PKG_NAME" | sed 's/nvhpc_\([0-9]*_[0-9]*\)_Linux_.*/\1/' | cut 
-d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/') + + +NVHPC_BASE="$NVCOMPILERS/$NVARCH/$NVHPC_VERSION" +PROFILE_FILE="/etc/profile.d/nvhpc.sh" + +if [ ! -d "$NVHPC_BASE/compilers/bin" ]; then + echo "[ERROR] NVHPC compilers not found at $NVHPC_BASE" + exit 1 +fi + +echo "[INFO] NVHPC detected at $NVHPC_BASE" +echo "[INFO] Writing persistent environment to $PROFILE_FILE" + +cat << EOF > "$PROFILE_FILE" +# NVIDIA HPC SDK environment +export NVCOMPILERS=$NVCOMPILERS +export NVARCH=$NVARCH +export NVHPC_VERSION=$NVHPC_VERSION + +export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/bin:\$PATH +export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/man + +# MPI (optional but recommended) +export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/bin:\$PATH +export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/man + +# Modules support (optional) +export MODULEPATH=\$NVCOMPILERS/modulefiles:\${MODULEPATH:-} +EOF + +chmod 644 "$PROFILE_FILE" + +# Source profile for current shell and all future non-login shells +if [ -f "$PROFILE_FILE" ]; then + echo "[INFO] Sourcing NVHPC profile for current shell" + source "$PROFILE_FILE" + grep -q "nvhpc.sh" /etc/bashrc || echo "source $PROFILE_FILE" >> /etc/bashrc +fi + + +if ! 
grep -q "{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" /etc/fstab; then + echo "[ERROR] NVHPC NFS path not found in /etc/fstab" + exit 1 +fi + +echo "[INFO] NVHPC NFS entry found in /etc/fstab" + +echo "===== NVHPC environment configuration completed successfully =====" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 new file mode 100644 index 0000000000..0fa20205c5 --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 @@ -0,0 +1,43 @@ +#!/bin/bash +LOGFILE="/var/log/configure_ucx_openmpi_env.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== Configuring UCX / OpenMPI environment (Slurm node) =====" + +CLIENT_MOUNT="{{ client_mount_path }}" +UCX_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx" +OPENMPI_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi" + +PROFILE_DIR="/etc/profile.d" + +# Ensure client mount exists and is mounted +if ! mountpoint -q "$CLIENT_MOUNT"; then + echo "[WARN] $CLIENT_MOUNT is not mounted. Skipping UCX/OpenMPI env setup." + exit 0 +fi + +# ---------------- UCX ---------------- + + cat > "$PROFILE_DIR/ucx.sh" < "$PROFILE_DIR/openmpi.sh" < >(tee -a "$LOGFILE") 2>&1 + +# Check that NFS is mounted +if ! mountpoint -q "$CLIENT_MOUNT"; then + echo "[ERROR] $CLIENT_MOUNT is not mounted." 
+ echo " Please mount the NFS path before running export_nvhpc_env.sh" + exit 1 +fi + +echo "===== NVHPC environment export started =====" + + +echo "[INFO] Writing persistent NVHPC profile at $PROFILE_FILE" + +# Write environment file system-wide +cat > "$PROFILE_FILE" <_cuda_X.Y +NVHPC_VERSION=$(echo "$NVHPC_PKG_NAME" | sed 's/nvhpc_\([0-9]*_[0-9]*\)_Linux_.*/\1/') +NVHPC_SHORT_VERSION=$(echo "$NVHPC_VERSION" | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/') + +NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" +NVHPC_MOUNT="/shared-nvhpc-sdk" +NVHPC_TARBALL="$NVHPC_MOUNT/${NVHPC_PKG_NAME}.tar.gz" +NVHPC_INSTALL_DIR_NFS="$NVHPC_MOUNT/nvhpc" +NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" +NVHPC_EXTRACT_DIR="$NVHPC_MOUNT/${NVHPC_PKG_NAME}" + +# Skip if already mounted +if mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + echo "[INFO] $NVHPC_LOCAL_MOUNT already mounted. Skipping installation." | tee -a "$LOGFILE" + exit 0 +fi + +# Skip if local directory exists +if [ -d "$NVHPC_LOCAL_MOUNT" ]; then + echo "[INFO] $NVHPC_LOCAL_MOUNT exists. Assuming installed. Skipping." | tee -a "$LOGFILE" + exit 0 +fi + +mkdir -p "$NVHPC_MOUNT" +mount -t nfs "$NVHPC_EXPORT" "$NVHPC_MOUNT" >> "$LOGFILE" 2>&1 + +# Check tarball +echo "[INFO] Checking NVIDIA HPC SDK tarball at $NVHPC_TARBALL..." | tee -a "$LOGFILE" +if [ ! -f "$NVHPC_TARBALL" ]; then + echo "[ERROR] NVIDIA HPC SDK tarball not found. Skipping installation." | tee -a "$LOGFILE" + exit 0 +fi + +# Extract if needed +EXTRACT_SIZE_GB=$(du -sBG "$NVHPC_EXTRACT_DIR" 2>/dev/null | cut -f1 | tr -d 'G') +if [ -d "$NVHPC_EXTRACT_DIR" ] && [ "$EXTRACT_SIZE_GB" -ge 13 ] && [ -f "$NVHPC_EXTRACT_DIR/install" ]; then + echo "[INFO] NVHPC already extracted. Skipping." | tee -a "$LOGFILE" +else + echo "[INFO] Extracting NVIDIA HPC SDK tarball..." | tee -a "$LOGFILE" + tar -xzf "$NVHPC_TARBALL" -C "$NVHPC_MOUNT" \ + --checkpoint=2000 \ + --checkpoint-action=echo="[INFO] Extracting NVHPC... 
please wait" >> "$LOGFILE" 2>&1 +fi + +mkdir -p "$NVHPC_INSTALL_DIR_NFS" +INSTALL_BIN_DIR="$NVHPC_INSTALL_DIR_NFS/Linux_${arch}/${NVHPC_SHORT_VERSION}/compilers/bin" + +if [ -x "$INSTALL_BIN_DIR/nvc" ]; then + echo "[INFO] NVHPC already installed. Skipping installer." | tee -a "$LOGFILE" +else + echo "[INFO] Running NVIDIA HPC SDK installer..." | tee -a "$LOGFILE" + cd "$NVHPC_EXTRACT_DIR" + NVHPC_SILENT=true NVHPC_INSTALL_DIR="$NVHPC_INSTALL_DIR_NFS" NVHPC_INSTALL_TYPE=auto ./install >> "$LOGFILE" 2>&1 +fi + +echo "[SUCCESS] NVIDIA HPC SDK installation completed." | tee -a "$LOGFILE" + +# Mount NVHPC locally +mkdir -p "$NVHPC_LOCAL_MOUNT" +NVHPC_INSTALL_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" +FSTAB_ENTRY="$NVHPC_INSTALL_EXPORT $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" + +if ! grep -qE "^[^#].*$NVHPC_INSTALL_EXPORT[[:space:]]+$NVHPC_LOCAL_MOUNT[[:space:]]+nfs" /etc/fstab; then + echo "[INFO] Adding NVHPC mount to /etc/fstab" | tee -a "$LOGFILE" + echo "$FSTAB_ENTRY" >> /etc/fstab +fi + +echo "[INFO] Mounting $NVHPC_LOCAL_MOUNT..." 
| tee -a "$LOGFILE" +mount "$NVHPC_LOCAL_MOUNT" >> "$LOGFILE" 2>&1 +echo "[INFO] NVHPC successfully mounted at $NVHPC_LOCAL_MOUNT" | tee -a "$LOGFILE" +echo "CLOUD-INIT: NVIDIA HPC SDK installation completed successfully" | tee -a "$LOGFILE" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 new file mode 100644 index 0000000000..9cd0d8d1a4 --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 @@ -0,0 +1,133 @@ +#!/bin/bash +set -e + +CLIENT_MOUNT="{{ client_mount_path }}" +OPENMPI_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi" +OPENMPI_BUILD="{{ client_mount_path }}/slurm/hpc_tools/compile/openmpi" + +# Comprehensive logging +LOGFILE="/var/log/openmpi_installation.log" + +# Redirect all output to log file +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== OpenMPI Installation Started =====" +echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')" +echo "Installation Prefix: $OPENMPI_PREFIX" +echo "Build Directory: $OPENMPI_BUILD" +echo "Log File: $LOGFILE" | tee -a "$LOGFILE" + +# Check that NFS is mounted +if ! mountpoint -q "$CLIENT_MOUNT"; then + echo "[ERROR] $CLIENT_MOUNT is not mounted." + echo " Please mount the NFS path before running install_openmpi.sh" + exit 1 +fi + +echo "===== OpenMPI build started =====" + +mkdir -p "$OPENMPI_BUILD" +cd "$OPENMPI_BUILD" + +sys_arch="$(uname -m)" +case "${sys_arch}" in + x86_64|amd64) arch="x86_64" ;; + aarch64|arm64) arch="aarch64" ;; + *) + echo "Unsupported architecture: ${sys_arch}" + exit 1 + ;; +esac + +if [ ! -f openmpi.tar.gz ]; then + echo "[INFO] Downloading OpenMPI source code..." 
+ wget --no-check-certificate \ + https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/${arch}/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz \ + -O openmpi.tar.gz >> "$LOGFILE" 2>&1 + echo "[INFO] OpenMPI download completed" +else + echo "[INFO] openmpi.tar.gz already exists, skipping download." +fi + +echo "[INFO] Extracting OpenMPI source code..." +tar xzf openmpi.tar.gz >> "$LOGFILE" 2>&1 +cd openmpi-* +echo "[INFO] OpenMPI source extracted to $(pwd)" + +echo "[INFO] Creating build directory..." +mkdir -p build + +# Slurm detection +echo "[INFO] Detecting Slurm integration..." +if sinfo >/dev/null 2>&1; then + SLURM_FLAG="--with-slurm=yes --with-munge=/usr" + echo "[INFO] Slurm detected - enabling Slurm integration" +else + SLURM_FLAG="--with-slurm=no" + echo "[INFO] Slurm not detected - disabling Slurm integration" +fi + +# UCX detection +echo "[INFO] Detecting UCX integration..." +if [ -x "{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx/bin/ucx_info" ]; then + UCX_FLAG="--with-ucx={{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx" + echo "[INFO] UCX detected - enabling UCX integration" +else + UCX_FLAG="" + echo "[INFO] UCX not detected - proceeding without UCX" +fi + +cd build +echo "[INFO] Configuring OpenMPI build..." +echo "[INFO] Configure flags: --prefix=$OPENMPI_PREFIX --enable-mpi1-compatibility --enable-prte-prefix-by-default $SLURM_FLAG $UCX_FLAG" +../configure --prefix="$OPENMPI_PREFIX" \ + --enable-mpi1-compatibility \ + --enable-prte-prefix-by-default \ + $SLURM_FLAG $UCX_FLAG >> "$LOGFILE" 2>&1 + +echo "[INFO] Building OpenMPI with {{ openmpi_build_threads | default(8) }} threads..." +make -j {{ openmpi_build_threads | default(8) }} >> "$LOGFILE" 2>&1 + +echo "[INFO] Installing OpenMPI..." 
+make install >> "$LOGFILE" 2>&1 + +# Configure OpenMPI environment variables system-wide +OPENMPI_ENV_FILE="/etc/profile.d/openmpi.sh" + +echo "[INFO] Setting up OpenMPI environment variables in $OPENMPI_ENV_FILE..." +cat > "$OPENMPI_ENV_FILE" <> "$LOGFILE" 2>&1 + echo "[INFO] UCX download completed" +else + echo "[INFO] ucx.tar.gz already exists, skipping download." +fi + +echo "[INFO] Extracting UCX source code..." +tar xzf ucx.tar.gz >> "$LOGFILE" 2>&1 +cd ucx-* +echo "[INFO] UCX source extracted to $(pwd)" + +echo "[INFO] Creating build directory..." +mkdir -p build +cd build + +echo "[INFO] Configuring UCX build..." +../contrib/configure-release --prefix="$UCX_PREFIX" >> "$LOGFILE" 2>&1 + +echo "[INFO] Building UCX with {{ ucx_build_threads | default(8) }} threads..." +make -j {{ ucx_build_threads | default(8) }} >> "$LOGFILE" 2>&1 + +echo "[INFO] Installing UCX..." +make install >> "$LOGFILE" 2>&1 + +# Configure UCX environment variables system-wide +UCX_ENV_FILE="/etc/profile.d/ucx.sh" + +echo "[INFO] Setting up UCX environment variables in $UCX_ENV_FILE..." +cat > "$UCX_ENV_FILE" < >(tee -a "$LOGFILE") 2>&1 + +echo "===== NVHPC SDK setup (mount + wait) =====" + +PARENT_NFS="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" +PARENT_MOUNT="/shared-nvhpc-sdk" + +NVHPC_NFS_SHARE="$PARENT_MOUNT/nvhpc" +NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" + +mkdir -p "$PARENT_MOUNT" + + +if ! mountpoint -q "$PARENT_MOUNT"; then + mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT" +fi + +if ! mountpoint -q "$PARENT_MOUNT"; then + echo "[ERROR] Failed to mount NVHPC parent export" + exit 1 +fi + +echo "[INFO] Parent NVHPC export mounted" + +mkdir -p "$NVHPC_NFS_SHARE" +# 3. Ensure fstab entry exists (bind mount, NOT NFS) +if ! grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then + echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT none bind,_netdev 0 0" >> /etc/fstab + echo "[INFO] NVHPC bind-mount fstab entry added" +else + echo "[INFO] NVHPC fstab entry already present" +fi + +# 4. 
Mount NVHPC SDK +mkdir -p "$NVHPC_LOCAL_MOUNT" + +if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + mount --bind "$NVHPC_NFS_SHARE" "$NVHPC_LOCAL_MOUNT" +fi + +if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + echo "[ERROR] Failed to mount NVHPC SDK" + exit 1 +fi + +echo "[SUCCESS] NVHPC SDK mounted at $NVHPC_LOCAL_MOUNT" +echo "===== NVHPC setup completed =====" diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml index 7f75daa01d..053ee15c0d 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -108,3 +108,4 @@ cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cud # Usage: fetch_additional_images.yml input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" software_config_file_path: "{{ input_project_dir }}/software_config.json" +local_repo_config_path: "{{ input_project_dir }}/local_repo_config.yml" diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml index 43e7d3fc63..bd046032bc 100644 --- a/discovery/roles/discovery_validations/tasks/update_hosts.yml +++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml @@ -13,16 +13,25 @@ # limitations under the License. 
---
 
-- name: Add hosts file entry for cluster
+- name: Ensure 127.0.0.1 localhost entry exists
   ansible.builtin.shell: |
     set -o pipefail
-    grep -qxF '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' {{ hosts_file_path }} || \
-    echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }}
+    grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }}
+  changed_when: true
+
+- name: Remove stale entries for IPs and hostnames that are being updated
+  ansible.builtin.shell: |
+    set -e  # abort before the hosts file is rewritten if filtering fails
+    kept=$(grep -v -e '^{{ item.value.ADMIN_IP | regex_escape(re_type="posix_basic") }}[[:space:]]' \
+      -e '[[:space:]]{{ item.value.HOSTNAME | regex_escape(re_type="posix_basic") }}$' \
+      {{ hosts_file_path }}) || [ $? -eq 1 ]  # grep exits 1 when no line is kept; 2 is a real error
+    printf '%s\n' "$kept" > {{ hosts_file_path }}
   changed_when: true
   loop: "{{ read_mapping_file.dict | dict2items }}"
 
-- name: Ensure 127.0.0.1 localhost entry exists uniquely using echo
+- name: Add hosts file entry for cluster
   ansible.builtin.shell: |
     set -o pipefail
-    grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }}
+    echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }}
   changed_when: true
+  loop: "{{ read_mapping_file.dict | dict2items }}"
diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml
index 433b8e9f76..601cc07097 100644
--- a/discovery/roles/k8s_config/vars/main.yml
+++ b/discovery/roles/k8s_config/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -78,19 +78,10 @@ packages_base_dir_aarch64: "{{ k8s_client_mount_path }}/packages/aarch64" offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" packages_layout_x86_64: - - doca-ofed - cuda packages_layout_aarch64: - - doca-ofed - cuda print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" -offline_path_x86_64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" -offline_path_aarch64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" - +offline_path_x86_64: [] +offline_path_aarch64: [] ssh_private_key_path: /root/.ssh/oim_rsa diff --git a/discovery/roles/nfs_client/tasks/nfs_client.yml b/discovery/roles/nfs_client/tasks/nfs_client.yml index 079933c26b..ca8a3c7660 100644 --- a/discovery/roles/nfs_client/tasks/nfs_client.yml +++ b/discovery/roles/nfs_client/tasks/nfs_client.yml @@ -32,11 +32,6 @@ nfs_server_ip: "{{ hostvars['127.0.0.1']['admin_nic_ip'] }}" when: item.server_ip == "localhost" -- name: Package installation for NFS - ansible.builtin.package: - name: "{{ nfs_packages[ansible_os_family] }}" - state: present - - name: Mount facts items to dict ansible.builtin.set_fact: nfs_src: "{{ nfs_server_ip }}:{{ item.server_share_path }}" diff --git a/discovery/roles/nfs_client/vars/main.yml b/discovery/roles/nfs_client/vars/main.yml index b5e01fd82a..a3c20c054c 100644 --- a/discovery/roles/nfs_client/vars/main.yml +++ b/discovery/roles/nfs_client/vars/main.yml @@ -20,13 +20,6 @@ software_config_file: "{{ hostvars['localhost']['input_project_dir'] }}/software # Usage: nfs_client.yml mounted_dir_perm: "0755" default_client_mount_options: "nosuid,rw,sync,hard,intr" -nfs_packages: - RedHat: - - nfs-utils - 
- nfs4-acl-tools - Debian: - - nfs-common - - nfs4-acl-tools slurm_nfs_fail_msg: "Failed to mount NFS share. Please check if the NFS server is reachable or NFS is configured properly." omnia_config_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index 03ea48760c..955e4c2a37 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -25,231 +25,7 @@ default_corespersocket: 1 share_prefix: "/" conf_path_items: {} conf_dict_items: {} -# This validates just the keys and not the values, as native support for this schema is not available from slurm -__conf_keys: - slurm: -# updated from version 22.08 using cmd -> scontrol show config - - AccountingStorageBackupHost - - AccountingStorageEnforce - - AccountingStorageHost - - AccountingStorageExternalHost - - AccountingStorageParameters - - AccountingStoragePort - - AccountingStorageTRES - - AccountingStorageType - - AccountingStorageUser - - AccountingStoreFlags - - AcctGatherEnergyType - - AcctGatherFilesystemType - - AcctGatherInterconnectType - - AcctGatherNodeFreq - - AcctGatherProfileType - - AllowSpecResourcesUsage - - AuthAltTypes - - AuthAltParameters - - AuthInfo - - AuthType - - BatchStartTimeout - - BcastExclude - - BcastParameters - - BurstBufferType - - CliFilterPlugins - # - ClusterName # This will be set from the input "omnia_config.yml" - - CommunicationParameters - - CompleteWait - - CoreSpecPlugin - - CpuFreqDef - - CpuFreqGovernors - - CredType - - DebugFlags - - DefMemPerNode - - DependencyParameters - - DisableRootJobs - - EioTimeout - - EnforcePartLimits - - Epilog - - EpilogMsgTime - - EpilogSlurmctld - - ExtSensorsType - - ExtSensorsFreq - - FederationParameters - - FirstJobId - - GetEnvTimeout - - GresTypes - - GpuFreqDef - - GroupUpdateForce - - GroupUpdateTime - - HealthCheckInterval - - HealthCheckNodeState 
- - HealthCheckProgram - - InactiveLimit - - InteractiveStepOptions - - JobAcctGatherFrequency - - JobAcctGatherType - - JobAcctGatherParams - - JobCompHost - - JobCompLoc - - JobCompPort - - JobCompType - - JobCompUser - - JobContainerType - - JobCredentialPrivateKey - - JobCredentialPublicCertificate - - JobDefaults - - JobFileAppend - - JobRequeue - - JobSubmitPlugins - - KillOnBadExit - - KillWait - - LaunchParameters - - LaunchType - - Licenses - - LogTimeFormat - - MailDomain - - MailProg - - MaxArraySize - - MaxDBDMsgs - - MaxJobCount - - MaxJobId - - MaxMemPerNode - - MaxNodeCount - - MaxStepCount - - MaxTasksPerNode - - MCSPlugin - - MCSParameters - - MessageTimeout - - MinJobAge - - MpiDefault - - MpiParams - - NodeFeaturesPlugins - - OverTimeLimit - - PluginDir - - PlugStackConfig - - PowerParameters - - PowerPlugin - - PreemptType - - PreemptExemptTime - - PrEpParameters - - PrEpPlugins - - PriorityParameters - - PrioritySiteFactorParameters - - PrioritySiteFactorPlugin - - PriorityType - - PrivateData - - ProctrackType - - Prolog - - PrologEpilogTimeout - - PrologSlurmctld - - PrologFlags - - PropagatePrioProcess - - PropagateResourceLimits - - PropagateResourceLimitsExcept - - RebootProgram - - ReconfigFlags - - RequeueExit - - RequeueExitHold - - ResumeFailProgram - - ResumeProgram - - ResumeRate - - ResumeTimeout - - ResvEpilog - - ResvOverRun - - ResvProlog - - ReturnToService - - RoutePlugin - - SchedulerParameters - - SchedulerTimeSlice - - SchedulerType - - ScronParameters - - SelectType - - SelectTypeParameters - - SlurmUser - - SlurmctldAddr - - SlurmctldDebug - - SlurmctldLogFile - - SlurmctldPort - - SlurmctldSyslogDebug - - SlurmctldPrimaryOffProg - - SlurmctldPrimaryOnProg - - SlurmctldTimeout - - SlurmctldParameters - - SlurmdDebug - - SlurmdLogFile - - SlurmdParameters - - SlurmdPidFile - - SlurmdPort - - SlurmdSpoolDir - - SlurmdSyslogDebug - - SlurmdTimeout - - SlurmdUser - - SlurmSchedLogFile - - SlurmSchedLogLevel - - 
SlurmctldPidFile - - SlurmctldPlugstack - - SrunEpilog - - SrunPortRange - - SrunProlog - - StateSaveLocation - - SuspendExcNodes - - SuspendExcParts - - SuspendProgram - - SuspendRate - - SuspendTime - - SuspendTimeout - - SwitchParameters - - SwitchType - - TaskEpilog - - TaskPlugin - - TaskPluginParam - - TaskProlog - - TCPTimeout - - TmpFS - - TopologyParam - - TopologyPlugin - - TrackWCKey - - TreeWidth - - UsePam - - UnkillableStepProgram - - UnkillableStepTimeout - - VSizeFactor - - WaitTime - - X11Parameters - mpi: - - PMIxCliTmpDirBase - - PMIxCollFence - - PMIxDebug - - PMIxDirectConn - - PMIxDirectConnEarly - - PMIxDirectConnUCX - - PMIxDirectSameArch - - PMIxEnv - - PMIxFenceBarrier - - PMIxNetDevicesUCX - - PMIxShareServerTopology - - PMIxTimeout - - PMIxTlsUCX - cgroup: - - CgroupMountpoint - - CgroupPlugin - - CgroupSlice - - SystemdTimeout - - IgnoreSystemd - - IgnoreSystemdOnFailure - - EnableControllers - - EnableExtraControllers - - AllowedRAMSpace - - AllowedSwapSpace - - ConstrainCores - - ConstrainDevices - - ConstrainRAMSpace - - ConstrainSwapSpace - - MaxRAMPercent - - MaxSwapPercent - - MemorySwappiness - - MinRAMSpace - - SignalChildrenProcesses - slurmdbd: {} - gres: {} + __default_config: cgroup: # CgroupAutomount: true @@ -291,7 +67,6 @@ __default_config: PartitionName: - PartitionName: DEFAULT Nodes: ALL - Default: true MaxTime: INFINITE State: UP # S_P_ARRAY type paramater to be provided this way @@ -311,3 +86,10 @@ __default_config: DbdPort: "{{ slurm_dbd_port }}" gres: AutoDetect: nvml + acct_gather: {} + helpers: {} + job_container: {} + mpi: {} + oci: {} + topology: {} + burst_buffer: {} diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index cd72cf33f0..40b6137172 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -12,12 +12,18 @@ # See the License for the specific 
language governing permissions and # limitations under the License. --- +- name: Read NodeName parameters from iDRAC + ansible.builtin.include_tasks: read_node_idrac.yml + when: cmpt_list + loop: "{{ cmpt_list }}" + - name: Append node_params list into NodeName list ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + (node_params | default([]))}))}) }}" when: node_params is defined and node_params + no_log: true - name: Append login nodes to NodeName list ansible.builtin.set_fact: @@ -26,6 +32,7 @@ | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}" loop: "{{ login_list }}" when: login_list is defined and login_list + no_log: true - name: Append compiler login nodes to NodeName list ansible.builtin.set_fact: @@ -34,6 +41,7 @@ | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}" loop: "{{ compiler_login_list }}" when: compiler_login_list is defined and compiler_login_list + no_log: true - name: Append Partition ansible.builtin.set_fact: @@ -41,13 +49,16 @@ | combine({'slurm': (apply_config['slurm'] | combine({'PartitionName': (apply_config['slurm'].PartitionName | default([])) + [partition_params]}))}) }}" when: node_params is defined and node_params + no_log: true - name: Add gpu parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" when: gpu_params is defined and gpu_params + no_log: true - name: Add dbd parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(dbd_slurm_conf))}) }}" when: dbd_list is defined and dbd_list + no_log: true diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml 
b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 5f89e051b8..ce27d3c362 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -12,50 +12,90 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -- name: Initialize ctld_state dict - ansible.builtin.set_fact: - ctld_state: "{{ ctld_state | default({}) | combine({item: false}) }}" - - name: Check if remote host is reachable via SSH ansible.builtin.wait_for: - host: "{{ item }}" + host: "{{ ctld }}" port: 22 # TODO: make it configurable timeout: 10 state: started delegate_to: localhost register: ssh_check + ignore_errors: true -- name: Check if slurmctld is running on remote host - ansible.builtin.service_facts: - delegate_to: "{{ item }}" - register: service_facts - when: ssh_check is success +- name: Drain and remove nodes if any + ansible.builtin.include_tasks: drain_and_remove_node.yml + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - ssh_check is success + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 -- name: Update ctld_state if slurmctld is running - ansible.builtin.set_fact: - ctld_state: "{{ ctld_state | combine({item: true}) }}" +- name: Enter slurm controller when pingable when: - ssh_check is success - - service_facts is success - - ansible_facts.services['slurmctld.service'] is defined - - ansible_facts.services['slurmctld.service'].state == 'running' - -- name: Update /etc/hosts with controller hostname and IP - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '^{{ ip.value }}\s+{{ ip.key }}' - line: "{{ ip.value }} {{ ip.key }}" - state: present - loop: "{{ ip_name_map | dict2items }}" - loop_control: - loop_var: ip - delegate_to: "{{ item }}" - when: ssh_check is success - -- name: Trigger the scontrol reconfigure - ansible.builtin.command: scontrol 
reconfigure - changed_when: scontrol_reconfig.rc == 0 - failed_when: false - register: scontrol_reconfig - delegate_to: "{{ item }}" - when: ctld_state[item] is true + ignore_unreachable: true + block: + - name: Initialize ctld_state dict + ansible.builtin.set_fact: + ctld_state: "{{ ctld_state | default({}) | combine({ctld: false}) }}" + + - name: Check if slurmctld is running on remote host + ansible.builtin.service_facts: + delegate_to: "{{ ctld }}" + register: service_facts + ignore_unreachable: true + + - name: Check slurmctld is reachable + ansible.builtin.fail: + msg: "Failed to connect to {{ ctld }}." + when: service_facts is unreachable + + - name: Update ctld_state if slurmctld is running + ansible.builtin.set_fact: + ctld_state: "{{ ctld_state | combine({ctld: true}) }}" + when: + - service_facts is success + - ansible_facts.services['slurmctld.service'] is defined + - ansible_facts.services['slurmctld.service'].state == 'running' + + - name: Check reachability of hosts in ip_name_map + ansible.builtin.wait_for: + host: "{{ host }}" + port: 22 + timeout: 10 + state: started + delegate_to: localhost + loop: "{{ ip_name_map.values() | list }}" + loop_control: + loop_var: host + register: ip_map_ssh_check + ignore_errors: true + ignore_unreachable: true + + - name: Build list of reachable hosts from ip_name_map + ansible.builtin.set_fact: + reachable_hosts: "{{ ip_map_ssh_check.results | rejectattr('failed', 'true') | map(attribute='host') | list }}" + + - name: Update basics on reachable_hosts + ansible.builtin.include_tasks: update_hosts_munge.yml + loop: "{{ reachable_hosts }}" + loop_control: + loop_var: slurmhost_ip + + - name: Trigger the scontrol reconfigure + ansible.builtin.command: scontrol reconfigure + changed_when: scontrol_reconfig.rc == 0 + failed_when: false + register: scontrol_reconfig + delegate_to: "{{ ctld }}" + when: + - ctld_state[ctld] is true + + rescue: + - name: Fail if slurmctld is not running on any host + ansible.builtin.debug: 
+ msg: "Failed to 'scontrol reconfigure' on {{ ctld }}. + As task '{{ ansible_failed_task.name }}' failed. + results: {{ ansible_failed_result }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index f3228fa460..1e5a4e507e 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -15,20 +15,26 @@ - name: Slurm dict ops ansible.builtin.set_fact: apply_config: "{{ __default_config }}" + no_log: true -- name: Read NodeName parameters - ansible.builtin.include_tasks: read_node_idrac.yml - when: cmpt_list - loop: "{{ cmpt_list }}" +- name: Remove keys from conf_files if they have string values in configs_input (when skip_merge is true) + ansible.builtin.set_fact: + conf_files: "{{ conf_files | difference(configs_input | dict2items | selectattr('value', 'string') | map(attribute='key') | list) }}" + when: + - skip_merge | default(false) + - configs_input is defined - name: Build slurm.conf ansible.builtin.include_tasks: build_slurm_conf.yml + when: "'slurm' in conf_files" - name: Slurm dbd opts ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) - | combine({'slurmdbd': (apply_config['slurmdbd'] | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}" + | combine({'slurmdbd': (apply_config['slurmdbd'] + | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}" when: ctld_list + no_log: true - name: Check .conf files existence ansible.builtin.stat: @@ -37,28 +43,73 @@ loop: "{{ ctld_list | product(conf_files | default([])) }}" register: ctld_conf_files +- name: Parse configs_input files from localhost (if they are paths) + slurm_conf: + op: parse + conf_name: "{{ item.key }}" + path: "{{ item.value }}" + delegate_to: localhost + loop: "{{ configs_input | default({}) | dict2items }}" + register: parsed_configs_input_results + no_log: true + when: + - configs_input is defined + - configs_input + - item.value is 
string + - item.key in conf_files + +- name: Build parsed_configs_input dictionary from parsed files + ansible.builtin.set_fact: + parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.item.key: item.conf_dict}) }}" + loop: "{{ parsed_configs_input_results.results }}" + no_log: true + when: + - parsed_configs_input_results is defined + - not item.skipped | default(false) + +- name: Add configs_input dicts that are already parsed + ansible.builtin.set_fact: + parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.key: item.value}) }}" + loop: "{{ configs_input | default({}) | dict2items }}" + no_log: true + when: + - configs_input is defined + - configs_input + - item.value is mapping + - name: Create lists for conf_merge ansible.builtin.set_fact: conf_merge_dict: "{{ conf_merge_dict | default({}) | combine({ - conf_set.item.1: ( - [apply_config[conf_set.item.1]] - + ([conf_set.stat.path] if conf_set.stat.exists else []) - + ([configs_input.get(conf_set.item.1)] if configs_input.get(conf_set.item.1) else []) + existing_conf_set.item.1: ( + [apply_config[existing_conf_set.item.1]] + + ([existing_conf_set.stat.path] if existing_conf_set.stat.exists else []) + + ([parsed_configs_input.get(existing_conf_set.item.1)] + if parsed_configs_input is defined and parsed_configs_input.get(existing_conf_set.item.1) else []) ) }) }}" loop: "{{ ctld_conf_files.results }}" loop_control: - loop_var: conf_set + loop_var: existing_conf_set register: prepared_conf_lists + no_log: true +# All the updates to the confs follow after this point before merge - name: Prepend ClusterName and SlurmctldHost to slurm conf sources ansible.builtin.set_fact: # TODO: Change order if needed conf_merge_dict: "{{ conf_merge_dict - | combine({'slurm': [{'ClusterName': cluster_name, 'SlurmctldHost': ctld_list}] + conf_merge_dict['slurm']}) }}" + | combine({'slurm': [{'ClusterName': cluster_name, 'AccountingStorageHost': dbd_list[0], 'SlurmctldHost': ctld_list}] 
+ conf_merge_dict['slurm']}) }}" when: "'slurm' in conf_merge_dict" + no_log: true + +- name: Slurm dbd - DbdHost and StorageHost + ansible.builtin.set_fact: + conf_merge_dict: "{{ conf_merge_dict + | combine({'slurmdbd': [{'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}] + conf_merge_dict['slurmdbd']}) }}" + when: "'slurmdbd' in conf_merge_dict" + no_log: true - name: Merge the confs slurm_conf: @@ -67,12 +118,19 @@ conf_name: "{{ item.key }}" loop: "{{ conf_merge_dict | dict2items }}" register: merged_conf + no_log: true - name: Update slurm_conf_dict with merged configuration for cloud_init read. # TODO: Remove cloud init dependency ansible.builtin.set_fact: slurm_conf_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurm') | first).conf_dict }}" when: "'slurm' in conf_merge_dict" +- name: Extract effective path parameters from merged configs + ansible.builtin.include_tasks: extract_path_overrides.yml + +- name: Validate path parameters are absolute + ansible.builtin.include_tasks: validate_path_overrides.yml + - name: Get nodes from normal partition and compare with cmpt_list ansible.builtin.set_fact: normal_partition: "{{ slurm_conf_dict.PartitionName | default([]) | selectattr('PartitionName', 'equalto', slurm_partition_name) | first | default({}) }}" @@ -95,52 +153,47 @@ - nodes_in_normal_not_in_cmpt is defined - nodes_in_normal_not_in_cmpt | length > 0 -- name: Create directories from conf values +- name: Create directories from conf values (NFS server-side always uses defaults) ansible.builtin.include_tasks: exist_dir.yml loop: - "{{ ctld_list - | product([slurm_conf_dict.get('StateSaveLocation', '/var/spool/slurmctld'), - (slurm_conf_dict.get('SlurmctldLogFile', '/var/log/slurmctld.log') | dirname), - (slurm_conf_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid') | dirname)]) }}" + | product(['/var/spool/slurmctld', + '/var/log/slurm', + '/var/run']) }}" - "{{ (cmpt_list + login_list + compiler_login_list) - | 
product([slurm_conf_dict.get('SlurmdSpoolDir', '/var/spool/slurmd'), - (slurm_conf_dict.get('SlurmdLogFile', '/var/log/slurmd.log') | dirname), - (slurm_conf_dict.get('SlurmdPidFile', '/var/run/slurmd.pid') | dirname)]) }}" + | product(['/var/spool/slurmd', + '/var/log/slurm', + '/var/run']) }}" loop_control: loop_var: product -- name: Create backup directory with timestamp - ansible.builtin.file: - path: "{{ backup_dir }}" - state: directory - mode: '0755' - owner: "{{ slurm_user }}" - group: "{{ slurm_user_group }}" - when: ctld_list - -- name: Backup existing SLURM configuration files with timestamp - ansible.builtin.copy: - src: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - dest: "{{ backup_dir }}/{{ item.item.key }}.conf" - remote_src: true - mode: preserve - loop: "{{ merged_conf.results }}" - when: - - ctld_list - - item.item.key in conf_files - register: backup_results - failed_when: false +- name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS /etc/default/slurmd + ansible.builtin.set_fact: + conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}" + when: slurm_conf_dict is defined - name: Write merged .conf ansible.builtin.copy: content: "{{ item.ini_lines | join('\n') }}\n" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - mode: "{{ conf_file_mode }}" + mode: "{{ slurm_dbd_mode if item.item.key == 'slurmdbd' else slurm_mode }}" owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" loop: "{{ merged_conf.results }}" register: ctld_conf_files + no_log: true + when: + - item.ini_lines + +- name: Add extra confs which are not handled + ansible.builtin.include_tasks: handle_extra_confs.yml + when: + - configs_input is defined + - configs_input.keys() | difference(conf_files) | length > 0 + loop: "{{ configs_input.keys() | 
difference(conf_files) }}" + loop_control: + loop_var: extra_conf - name: Check if cluster running ansible.builtin.include_tasks: check_ctld_running.yml @@ -148,3 +201,5 @@ - ctld_list - ctld_conf_files is changed loop: "{{ ctld_list }}" + loop_control: + loop_var: ctld diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 9ce43dcd6a..b68bcbbded 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -60,6 +60,7 @@ ansible.builtin.set_fact: cluster_name: "{{ slurm_cluster[0].cluster_name }}" configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}" + skip_merge: "{{ slurm_cluster[0].skip_merge | default(false) }}" slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" controller_trackfile_path: "{{ share_path }}/ctld_track" @@ -84,11 +85,21 @@ share_prefix: "{{ slurm_config_path }}" when: conf_in_nfs -- name: Clear the share directory +- name: Clear Slurm-related files and directories ansible.builtin.file: - path: "{{ slurm_config_path }}" + path: "{{ slurm_config_path }}/{{ slurm_item }}" state: absent - when: clear_slurm_files + loop: "{{ (ctld_list | default([]) + + cmpt_list | default([]) + + login_list | default([]) + + compiler_login_list | default([]) + + dbd_list | default([]) + + ['munge.key']) | flatten }}" + loop_control: + loop_var: slurm_item + failed_when: false + when: + - clear_slurm_files - name: Create the slurm directory in share ansible.builtin.file: @@ -114,7 +125,7 @@ - "{{ (ctld_list + cmpt_list + login_list + compiler_login_list) | product(common_dir) }}" - "{{ ctld_list | product(ctld_dir) }}" - "{{ dbd_list | product(db_dir) }}" - - "{{ cmpt_list | product(cmpt_dir) }}" + - "{{ (cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}" loop_control: loop_var: product @@ -151,8 +162,9 @@ 
ansible.builtin.copy: src: "{{ slurm_config_path }}/munge.key" dest: "{{ slurm_config_path }}/{{ item }}/etc/munge/munge.key" - mode: "{{ common_mode }}" + mode: "0600" remote_src: true + register: munge_key_copy loop: "{{ (ctld_list | default([])) + (cmpt_list | default([])) + (compiler_login_list | default([])) + @@ -183,17 +195,7 @@ group: "{{ root_group }}" mode: "{{ common_mode }}" when: cmpt_list - loop: "{{ cmpt_list | product(['logout_user.sh', 'slurmd.service']) }}" - -- name: Create logout_user.sh and slurmd.service in login and login_compiler - ansible.builtin.template: - src: "{{ item.1 }}.j2" - dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}" - owner: "{{ root_user }}" - group: "{{ root_group }}" - mode: "{{ conf_file_mode }}" - when: login_list or compiler_login_list - loop: "{{ (login_list + compiler_login_list) | product(['slurmd.service']) }}" + loop: "{{ cmpt_list | product(['logout_user.sh']) }}" - name: Get the slurm NFS path ansible.builtin.debug: diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml new file mode 100644 index 0000000000..da1c41d3fe --- /dev/null +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -0,0 +1,109 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Check if node exists in Slurm cluster + ansible.builtin.command: scontrol show node {{ node_to_remove }} + register: node_exists_check + failed_when: false + ignore_unreachable: true + changed_when: false + delegate_to: "{{ ctld }}" + +- name: Skip if node does not exist + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} not found in cluster, skipping removal" + when: + - node_exists_check is reachable + - node_exists_check.rc != 0 + +- name: Process node removal + when: + - node_exists_check is reachable + - node_exists_check.rc == 0 + ignore_unreachable: true + block: + - name: Get current job count on node + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l + register: current_jobs + changed_when: false + delegate_to: "{{ ctld }}" + + - name: Display job information + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)" + + - name: Drain the node to prevent new job assignments + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DRAIN + Reason="Scheduled removal - waiting for jobs to complete" + changed_when: true + delegate_to: "{{ ctld }}" + + - name: Wait for all jobs to complete on the node + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l + register: job_count_check + until: job_count_check.stdout | int == 0 + retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" + delay: "{{ node_drain_delay }}" + changed_when: false + delegate_to: "{{ ctld }}" + when: current_jobs.stdout | int > 0 + + - name: Confirm jobs completed + ansible.builtin.debug: + msg: "All jobs on {{ node_to_remove }} have completed" + when: current_jobs.stdout | int > 0 + + - name: Log node removal + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} has been drained, jobs completed, and set to DOWN state" + + rescue: + - name: Log node removal failure + ansible.builtin.debug: 
+ msg: "Failed to drain node {{ node_to_remove }}" + + - name: Remove slurm node with running job after timeout + ansible.builtin.pause: + prompt: | + Node {{ node_to_remove }} has been DRAINED to prevent new job assignments. + Jobs are still running on {{ node_to_remove }} after wait of {{ node_drain_timeout }} seconds. + Options: + 1. Press Ctrl+C then 'A' to abort + 2. Press Enter to force removal (jobs will be killed) + when: not force_scancel_node + + - name: Force cancel jobs if timeout reached + ansible.builtin.command: scancel -f -w {{ node_to_remove }} + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + + always: + - name: Set node to DOWN state + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DOWN + Reason="Node removed from cluster" + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + when: node_exists_check.rc == 0 diff --git a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml new file mode 100644 index 0000000000..0efcf18962 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml @@ -0,0 +1,222 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +# ── Extract merged dicts ────────────────────────────────────────────── + +- name: Extract slurm.conf merged dict + ansible.builtin.set_fact: + slurm_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurm') | first).conf_dict }}" + when: "'slurm' in conf_merge_dict" + +- name: Extract slurmdbd.conf merged dict + ansible.builtin.set_fact: + slurmdbd_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurmdbd') | first).conf_dict }}" + when: "'slurmdbd' in conf_merge_dict" + no_log: true + +- name: Extract cgroup.conf merged dict + ansible.builtin.set_fact: + cgroup_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'cgroup') | first).conf_dict }}" + when: "'cgroup' in conf_merge_dict" + +# ── slurm.conf: controller path params ──────────────────────────────── + +- name: Extract effective controller directories from slurm.conf + ansible.builtin.set_fact: + slurm_ctld_log_dir_effective: >- + {{ (slurm_merged_dict.get('SlurmctldLogFile', ['/var/log/slurm/slurmctld.log']) + | first if slurm_merged_dict.get('SlurmctldLogFile') is iterable + and slurm_merged_dict.get('SlurmctldLogFile') is not string + else slurm_merged_dict.get('SlurmctldLogFile', '/var/log/slurm/slurmctld.log')) + | dirname }} + slurm_state_save_location_effective: >- + {{ (slurm_merged_dict.get('StateSaveLocation', ['/var/spool/slurmctld']) + | first if slurm_merged_dict.get('StateSaveLocation') is iterable + and slurm_merged_dict.get('StateSaveLocation') is not string + else slurm_merged_dict.get('StateSaveLocation', '/var/spool/slurmctld')) }} + slurm_ctld_pid_dir_effective: >- + {{ (slurm_merged_dict.get('SlurmctldPidFile', ['/var/run/slurmctld.pid']) + | first if slurm_merged_dict.get('SlurmctldPidFile') is iterable + and slurm_merged_dict.get('SlurmctldPidFile') is not string + else slurm_merged_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid')) + | dirname }} + slurm_sched_log_dir_effective: >- + {{ 
((slurm_merged_dict.get('SlurmSchedLogFile', ['']) + | first if slurm_merged_dict.get('SlurmSchedLogFile') is iterable + and slurm_merged_dict.get('SlurmSchedLogFile') is not string + else slurm_merged_dict.get('SlurmSchedLogFile', '')) + | default('', true) | dirname | default('', true)) }} + when: slurm_merged_dict is defined + +# ── slurm.conf: compute path params ────────────────────────────────── + +- name: Extract effective compute directories from slurm.conf + ansible.builtin.set_fact: + slurm_slurmd_log_dir_effective: >- + {{ (slurm_merged_dict.get('SlurmdLogFile', ['/var/log/slurm/slurmd.log']) + | first if slurm_merged_dict.get('SlurmdLogFile') is iterable + and slurm_merged_dict.get('SlurmdLogFile') is not string + else slurm_merged_dict.get('SlurmdLogFile', '/var/log/slurm/slurmd.log')) + | dirname }} + slurm_slurmd_spool_dir_effective: >- + {{ (slurm_merged_dict.get('SlurmdSpoolDir', ['/var/spool/slurmd']) + | first if slurm_merged_dict.get('SlurmdSpoolDir') is iterable + and slurm_merged_dict.get('SlurmdSpoolDir') is not string + else slurm_merged_dict.get('SlurmdSpoolDir', '/var/spool/slurmd')) }} + slurm_slurmd_pid_dir_effective: >- + {{ (slurm_merged_dict.get('SlurmdPidFile', ['/var/run/slurmd.pid']) + | first if slurm_merged_dict.get('SlurmdPidFile') is iterable + and slurm_merged_dict.get('SlurmdPidFile') is not string + else slurm_merged_dict.get('SlurmdPidFile', '/var/run/slurmd.pid')) + | dirname }} + slurm_epilog_dir_effective: >- + {{ (slurm_merged_dict.get('Epilog', ['/etc/slurm/epilog.d/logout_user.sh']) + | first if slurm_merged_dict.get('Epilog') is iterable + and slurm_merged_dict.get('Epilog') is not string + else slurm_merged_dict.get('Epilog', '/etc/slurm/epilog.d/logout_user.sh')) + | dirname }} + slurm_prolog_dir_effective: >- + {{ ((slurm_merged_dict.get('Prolog', ['']) + | first if slurm_merged_dict.get('Prolog') is iterable + and slurm_merged_dict.get('Prolog') is not string + else slurm_merged_dict.get('Prolog', '')) + | 
default('', true) | dirname | default('', true)) }} + when: slurm_merged_dict is defined + +# ── slurm.conf: all epilog/prolog dirs and custom file paths ───────── + +- name: Extract all epilog paths from merged Epilog list + ansible.builtin.set_fact: + slurm_epilog_paths_all: >- + {{ (slurm_merged_dict.get('Epilog', []) + if slurm_merged_dict.get('Epilog') is iterable + and slurm_merged_dict.get('Epilog') is not string + else [slurm_merged_dict.get('Epilog', '')]) + | reject('equalto', '') | list }} + slurm_epilog_dirs_all: >- + {{ (slurm_merged_dict.get('Epilog', []) + if slurm_merged_dict.get('Epilog') is iterable + and slurm_merged_dict.get('Epilog') is not string + else [slurm_merged_dict.get('Epilog', '')]) + | map('dirname') | unique | reject('equalto', '') | list }} + when: slurm_merged_dict is defined + +- name: Extract custom epilog paths (non-default) + ansible.builtin.set_fact: + slurm_epilog_custom_paths: >- + {{ slurm_epilog_paths_all | reject('search', '^/etc/slurm/epilog\\.d/') | list }} + when: slurm_merged_dict is defined + +- name: Extract all prolog paths from merged Prolog list + ansible.builtin.set_fact: + slurm_prolog_paths_all: >- + {{ (slurm_merged_dict.get('Prolog', []) + if slurm_merged_dict.get('Prolog') is iterable + and slurm_merged_dict.get('Prolog') is not string + else [slurm_merged_dict.get('Prolog', '')]) + | reject('equalto', '') | list }} + slurm_prolog_dirs_all: >- + {{ (slurm_merged_dict.get('Prolog', []) + if slurm_merged_dict.get('Prolog') is iterable + and slurm_merged_dict.get('Prolog') is not string + else [slurm_merged_dict.get('Prolog', '')]) + | map('dirname') | unique | reject('equalto', '') | list }} + when: slurm_merged_dict is defined + +- name: Extract custom prolog paths (non-default) + ansible.builtin.set_fact: + slurm_prolog_custom_paths: >- + {{ slurm_prolog_paths_all | list }} + when: slurm_merged_dict is defined + +# ── slurm.conf: plugin dir (both controller and compute) ───────────── + +- name: Extract 
effective plugin directory from slurm.conf + ansible.builtin.set_fact: + slurm_plugin_dir_effective: >- + {{ (slurm_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) + | first if slurm_merged_dict.get('PluginDir') is iterable + and slurm_merged_dict.get('PluginDir') is not string + else slurm_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }} + when: slurm_merged_dict is defined + +# ── slurmdbd.conf path params ──────────────────────────────────────── + +- name: Extract effective directories from slurmdbd.conf + ansible.builtin.set_fact: + slurmdbd_log_dir_effective: >- + {{ (slurmdbd_merged_dict.get('LogFile', ['/var/log/slurm/slurmdbd.log']) + | first if slurmdbd_merged_dict.get('LogFile') is iterable + and slurmdbd_merged_dict.get('LogFile') is not string + else slurmdbd_merged_dict.get('LogFile', '/var/log/slurm/slurmdbd.log')) + | dirname }} + slurmdbd_pid_dir_effective: >- + {{ (slurmdbd_merged_dict.get('PidFile', ['/var/run/slurmdbd.pid']) + | first if slurmdbd_merged_dict.get('PidFile') is iterable + and slurmdbd_merged_dict.get('PidFile') is not string + else slurmdbd_merged_dict.get('PidFile', '/var/run/slurmdbd.pid')) + | dirname }} + slurmdbd_plugin_dir_effective: >- + {{ (slurmdbd_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) + | first if slurmdbd_merged_dict.get('PluginDir') is iterable + and slurmdbd_merged_dict.get('PluginDir') is not string + else slurmdbd_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }} + when: slurmdbd_merged_dict is defined + +# ── cgroup.conf path params ────────────────────────────────────────── + +- name: Extract effective cgroup mountpoint from cgroup.conf + ansible.builtin.set_fact: + slurm_cgroup_mountpoint_effective: >- + {{ ((cgroup_merged_dict.get('CgroupMountpoint', ['']) + | first if cgroup_merged_dict.get('CgroupMountpoint') is iterable + and cgroup_merged_dict.get('CgroupMountpoint') is not string + else cgroup_merged_dict.get('CgroupMountpoint', '')) + | default('', true)) }} + when: cgroup_merged_dict 
is defined + +# ── Defaults when confs are not merged ──────────────────────────────── + +- name: Set default effective directories if slurm.conf not merged + ansible.builtin.set_fact: + slurm_ctld_log_dir_effective: "/var/log/slurm" + slurm_slurmd_log_dir_effective: "/var/log/slurm" + slurm_state_save_location_effective: "/var/spool/slurmctld" + slurm_slurmd_spool_dir_effective: "/var/spool/slurmd" + slurm_ctld_pid_dir_effective: "/var/run" + slurm_slurmd_pid_dir_effective: "/var/run" + slurm_epilog_dir_effective: "/etc/slurm/epilog.d" + slurm_prolog_dir_effective: "" + slurm_sched_log_dir_effective: "" + slurm_plugin_dir_effective: "/usr/lib64/slurm" + slurm_epilog_dirs_all: ["/etc/slurm/epilog.d"] + slurm_epilog_paths_all: ["/etc/slurm/epilog.d/logout_user.sh"] + slurm_epilog_custom_paths: [] + slurm_prolog_dirs_all: [] + slurm_prolog_paths_all: [] + slurm_prolog_custom_paths: [] + when: slurm_merged_dict is not defined + +- name: Set default effective directories if slurmdbd.conf not merged + ansible.builtin.set_fact: + slurmdbd_log_dir_effective: "/var/log/slurm" + slurmdbd_pid_dir_effective: "/var/run" + slurmdbd_plugin_dir_effective: "/usr/lib64/slurm" + when: slurmdbd_merged_dict is not defined + +- name: Set default effective cgroup mountpoint if cgroup.conf not merged + ansible.builtin.set_fact: + slurm_cgroup_mountpoint_effective: "" + when: cgroup_merged_dict is not defined diff --git a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml new file mode 100644 index 0000000000..544822ec28 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml @@ -0,0 +1,37 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Add extra confs which are not handled + slurm_conf: + op: merge + conf_sources: "{{ [configs_input[extra_conf]] }}" + conf_name: "{{ extra_conf }}" + register: ex_conf + delegate_to: localhost + no_log: true + when: + - "'.' not in extra_conf" + +- name: Write merged .conf + ansible.builtin.copy: + content: "{{ ex_conf.ini_lines | join('\n') }}\n" + dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ extra_conf }}.conf" + mode: "{{ conf_file_mode }}" + owner: "{{ slurm_user }}" + group: "{{ slurm_user_group }}" + remote_src: "{{ copy_from_oim }}" + no_log: true + when: + - "'.' 
not in extra_conf" + - ex_conf is success diff --git a/discovery/roles/slurm_config/tasks/hpc_tools.yml b/discovery/roles/slurm_config/tasks/hpc_tools.yml index c8bdb5d335..46260da267 100644 --- a/discovery/roles/slurm_config/tasks/hpc_tools.yml +++ b/discovery/roles/slurm_config/tasks/hpc_tools.yml @@ -25,6 +25,7 @@ - runfile - scripts - container_images + - nvidia_sdk - name: Deploy download_container_image.sh to NFS share ansible.builtin.template: @@ -122,34 +123,27 @@ ansible.builtin.set_fact: oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" -- name: Check if source directory exists - ansible.builtin.stat: - path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" - register: src_dir_check_x86_64 +- name: Build parallel copy list for HPC tools + ansible.builtin.set_fact: + parallel_copy_pairs: [] -- name: Check if source directory exists +- name: Check which parallel copy source directories exist ansible.builtin.stat: - path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" - register: src_dir_check_aarch64 - -- name: Copy cuda run file using copy module for aarch64 - ansible.builtin.copy: - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - mode: '0755' - owner: root - group: root - directory_mode: '0755' - remote_src: true - when: src_dir_check_aarch64.stat.exists and src_dir_check_aarch64.stat.isdir + path: "{{ item.src }}" + loop: "{{ parallel_copy_candidates }}" + register: copy_source_checks + failed_when: false -- name: Copy cuda run file using copy module for x86_64 - ansible.builtin.copy: - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - mode: '0755' - owner: root - group: root - directory_mode: '0755' - remote_src: true - when: src_dir_check_x86_64.stat.exists and 
src_dir_check_x86_64.stat.isdir +- name: Add only valid copy pairs (source exists) + ansible.builtin.set_fact: + parallel_copy_pairs: >- + {{ parallel_copy_pairs + + [[ item.item.src, item.item.dest ]] }} + loop: "{{ copy_source_checks.results }}" + when: item.stat.exists + +- name: Parallel copy HPC tool files + parallel_file_copy: + copy_pairs: "{{ parallel_copy_pairs }}" + max_workers: "{{ parallel_copy_max_workers }}" + when: parallel_copy_pairs | length > 0 diff --git a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml index df19821983..0f7b3a16b2 100644 --- a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml +++ b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml @@ -46,6 +46,7 @@ - name: Get bmc_ip ansible.builtin.set_fact: bmc_ip_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='bmc_ip') }}" + name_ip_map: "{{ dict(ip_name_map.values() | zip(ip_name_map.keys())) }}" - name: Assign slurm lists ansible.builtin.set_fact: diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index 4dc0217559..ba93bb086a 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -30,7 +30,7 @@ - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': cmpt_list | join(',')}) if item.PartitionName == slurm_partition_name else item] }}" + + [item | combine({'Nodes': (cmpt_list | join(',')) if cmpt_list | length > 0 else 'ALL'}) if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: - "'slurm' in conf_merge_dict" diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml new file mode 100644 index 
0000000000..64c36dbeaf --- /dev/null +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -0,0 +1,88 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Update /etc/hosts with controller hostname and IP + ansible.builtin.lineinfile: + path: /etc/hosts + regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}' + line: "{{ host_entry.value }} {{ host_entry.key }}" + state: present + loop: "{{ ip_name_map | dict2items | list }}" + loop_control: + loop_var: host_entry + ignore_unreachable: true + failed_when: false + delegate_to: "{{ slurmhost_ip }}" + +- name: Get munge changes + ansible.builtin.set_fact: + munge_key_changed: "{{ munge_key_copy.results | default([]) | rekey_on_member('item') }}" + when: munge_key_copy is defined + +# TODO: Clean unreachable handling +- name: Block when munge key changed + when: + - munge_key_changed is defined + - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false) + - restart_slurm_services + delegate_to: "{{ slurmhost_ip }}" + ignore_unreachable: true + block: + - name: Update munge key permissions + ansible.builtin.file: + path: /etc/munge/munge.key + owner: munge + group: munge + mode: '0600' + register: munge_key_permissions_result + + - name: Restart munge service if key changed + ansible.builtin.service: + name: munge + state: restarted + register: munge_restart_result + when: + - munge_key_permissions_result is 
defined + - munge_key_permissions_result is success + + - name: Restart slurmctld if munge restarted + ansible.builtin.service: + name: slurmctld + state: restarted + when: + - name_ip_map[slurmhost_ip] in ctld_list + - munge_restart_result is defined + - munge_restart_result is success + + - name: Restart slurmd if munge restarted + ansible.builtin.service: + name: slurmd + state: restarted + when: + - name_ip_map[slurmhost_ip] in (cmpt_list + login_list + compiler_login_list) + - munge_restart_result is defined + - munge_restart_result is success + + - name: Restart slurmdbd if munge restarted + ansible.builtin.service: + name: slurmdbd + state: restarted + when: + - name_ip_map[slurmhost_ip] in dbd_list + - munge_restart_result is defined + - munge_restart_result is success + rescue: + - name: Handle munge restart failure + ansible.builtin.debug: + msg: "Failed task {{ ansible_failed_task.name }} on {{ slurmhost_ip }}" diff --git a/discovery/roles/slurm_config/tasks/validate_path_overrides.yml b/discovery/roles/slurm_config/tasks/validate_path_overrides.yml new file mode 100644 index 0000000000..c4a1783b02 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/validate_path_overrides.yml @@ -0,0 +1,107 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +# ── slurm.conf path validation ─────────────────────────────────────── + +- name: Validate slurm.conf path parameters are absolute + ansible.builtin.fail: + msg: "slurm.conf {{ item }} must be an absolute path (start with /). Current value: {{ slurm_merged_dict.get(item) }}" + when: + - slurm_merged_dict is defined + - slurm_merged_dict.get(item) is defined + - slurm_merged_dict.get(item) is not none + - >- + (slurm_merged_dict.get(item) is string + and slurm_merged_dict.get(item) | length > 0) + or (slurm_merged_dict.get(item) is iterable + and slurm_merged_dict.get(item) | list | length > 0) + - >- + not ((slurm_merged_dict.get(item) is string + and slurm_merged_dict.get(item) | regex_search('^/')) + or (slurm_merged_dict.get(item) is iterable + and (slurm_merged_dict.get(item) | first) | regex_search('^/'))) + loop: + - SlurmctldLogFile + - SlurmdLogFile + - StateSaveLocation + - SlurmdSpoolDir + - SlurmctldPidFile + - SlurmdPidFile + - Epilog + - Prolog + - EpilogSlurmctld + - PrologSlurmctld + - SlurmSchedLogFile + - PluginDir + - PlugStackConfig + - SrunEpilog + - SrunProlog + - TaskEpilog + - TaskProlog + - HealthCheckProgram + - RebootProgram + - UnkillableStepProgram + - ResvEpilog + - ResvProlog + - TmpFS + - JobCompLoc + - JobCredentialPrivateKey + - JobCredentialPublicCertificate + +# ── slurmdbd.conf path validation ──────────────────────────────────── + +- name: Validate slurmdbd.conf path parameters are absolute + ansible.builtin.fail: + msg: "slurmdbd.conf {{ item }} must be an absolute path (start with /). 
Current value: {{ slurmdbd_merged_dict.get(item) }}" + when: + - slurmdbd_merged_dict is defined + - slurmdbd_merged_dict.get(item) is defined + - slurmdbd_merged_dict.get(item) is not none + - >- + (slurmdbd_merged_dict.get(item) is string + and slurmdbd_merged_dict.get(item) | length > 0) + or (slurmdbd_merged_dict.get(item) is iterable + and slurmdbd_merged_dict.get(item) | list | length > 0) + - >- + not ((slurmdbd_merged_dict.get(item) is string + and slurmdbd_merged_dict.get(item) | regex_search('^/')) + or (slurmdbd_merged_dict.get(item) is iterable + and (slurmdbd_merged_dict.get(item) | first) | regex_search('^/'))) + loop: + - LogFile + - PidFile + - PluginDir + +# ── cgroup.conf path validation ────────────────────────────────────── + +- name: Validate cgroup.conf path parameters are absolute + ansible.builtin.fail: + msg: "cgroup.conf {{ item }} must be an absolute path (start with /). Current value: {{ cgroup_merged_dict.get(item) }}" + when: + - cgroup_merged_dict is defined + - cgroup_merged_dict.get(item) is defined + - cgroup_merged_dict.get(item) is not none + - >- + (cgroup_merged_dict.get(item) is string + and cgroup_merged_dict.get(item) | length > 0) + or (cgroup_merged_dict.get(item) is iterable + and cgroup_merged_dict.get(item) | list | length > 0) + - >- + not ((cgroup_merged_dict.get(item) is string + and cgroup_merged_dict.get(item) | regex_search('^/')) + or (cgroup_merged_dict.get(item) is iterable + and (cgroup_merged_dict.get(item) | first) | regex_search('^/'))) + loop: + - CgroupMountpoint diff --git a/discovery/roles/slurm_config/templates/slurmd.service.j2 b/discovery/roles/slurm_config/templates/slurmd.service.j2 deleted file mode 100644 index 294d1fda75..0000000000 --- a/discovery/roles/slurm_config/templates/slurmd.service.j2 +++ /dev/null @@ -1,22 +0,0 @@ -[Unit] -Description=Slurm node daemon -After=munge.service network-online.target remote-fs.target sssd.service -Wants=network-online.target - -[Service] -Type=notify 
-EnvironmentFile=-/etc/sysconfig/slurmd -EnvironmentFile=-/etc/default/slurmd -RuntimeDirectory=slurm -RuntimeDirectoryMode=0755 -ExecStart=/usr/sbin/slurmd --systemd $SLURMD_OPTIONS {{ conf_server }} -ExecReload=/bin/kill -HUP $MAINPID -KillMode=process -LimitNOFILE=131072 -LimitMEMLOCK=infinity -LimitSTACK=infinity -Delegate=yes -TasksMax=infinity - -[Install] -WantedBy=multi-user.target \ No newline at end of file diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 3a8c43ad93..d708eb0777 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -21,6 +21,34 @@ conf_files: # Must match this MASTER list - slurmdbd - cgroup - gres + - acct_gather + - helpers + - job_container + - mpi + - oci + - topology + - burst_buffer + +# Supported configuration files are: + # slurm.conf + # slurmdbd.conf + # cgroup.conf + # gres.conf + # acct_gather.conf + # helpers.conf + # job_container.conf + # mpi.conf + # oci.conf + # topology.conf + # burst_buffer.conf + +# Non Conf files + # topology.yaml + # namespace.yaml + # plugstack.conf + # scrun.lua + # cli_filter.lua + copy_from_oim: false common_dir: - /etc/munge @@ -38,8 +66,9 @@ gpu_slurm_conf: SelectType: select/cons_tres SelectTypeParameters: CR_Core_Memory SlurmdParameters: l3cache_as_socket -innodb_buffer_pool_size: 1G -innodb_lock_wait_timeout: 120 +innodb_buffer_pool_size: 4G +innodb_lock_wait_timeout: 900 +conf_server: "--conf-server {{ ctld_list | join(',') }}" # TODO tmp nodes_yaml: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" bmc_username: "{{ hostvars['localhost']['bmc_username'] }}" @@ -51,7 +80,7 @@ cluster_name: cluster # TODO: direct load vars omnia_config.yml slurm_uid: 6001 slurm_user: slurm slurm_user_group: slurm -restart_slurm_services: "{{ hostvars['localhost']['restart_slurm_services'] }}" +restart_slurm_services: "{{ 
hostvars['localhost']['restart_slurm_services'] | default(true) }}" slurm_db_username: "{{ hostvars['localhost']['slurm_db_username'] | default('dbuser') }}" slurm_db_password: "{{ hostvars['localhost']['slurm_db_password'] }}" slurm_db_host: "{{ hostvars['localhost']['slurm_db_host'] | default(false) }}" @@ -89,15 +118,18 @@ munge_dir_mode: "0700" common_mode: "0755" slurm_dbd_mode: "0600" slurm_db_cnf_mode: "0600" +node_drain_timeout: 900 +node_drain_delay: 30 +force_scancel_node: false dbd_slurm_conf: - AccountingStorageHost: "{{ dbd_list[0] }}" AccountingStoragePort: "{{ slurm_dbd_port }}" AccountingStorageType: accounting_storage/slurmdbd partition_params: PartitionName: "{{ slurm_partition_name }}" - Nodes: "{{ cmpt_list | join(',') }}" + Nodes: "{{ cmpt_list | join(',') if cmpt_list else 'ALL' }}" MaxTime: "INFINITE" State: "UP" + Default: "YES" openldap_dir_name: "openldap/" software_config_file: "{{ input_project_dir }}/software_config.json" omnia_run_tags: "{{ hostvars['localhost']['omnia_run_tags'] }}" @@ -113,21 +145,52 @@ packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64" offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" packages_layout_x86_64: - - doca-ofed - cuda packages_layout_aarch64: - - doca-ofed - cuda print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" -offline_path_x86_64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" -offline_path_aarch64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" +offline_path_x86_64: [] +offline_path_aarch64: [] ssh_private_key_path: /root/.ssh/oim_rsa +# nvidia sdk vars +# Fully resolved tarball relative paths (no nested 
Jinja2) +# nvidia sdk vars +nvhpc_pkg_name_x86_64: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0" +nvhpc_pkg_name_aarch64: "nvhpc_2025_2511_Linux_aarch64_cuda_13.0" + +nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_pkg_name_x86_64 }}/{{ nvhpc_pkg_name_x86_64 }}.tar.gz" +nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/10.0/tarball/{{ nvhpc_pkg_name_aarch64 }}/{{ nvhpc_pkg_name_aarch64 }}.tar.gz" + +nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk" + +# parallel file copy +parallel_copy_max_workers: 4 + +# ------------------------------------------------------------ +# Parallel Copy Candidates (Only path existence matters) +# ------------------------------------------------------------ + +parallel_copy_candidates: + + # CUDA Runfile (aarch64 repo path) + - name: cuda_runfile_aarch64 + src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" + dest: "{{ slurm_config_path }}/hpc_tools/runfile/" + + # CUDA Runfile (x86_64 repo path) + - name: cuda_runfile_x86_64 + src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" + dest: "{{ slurm_config_path }}/hpc_tools/runfile/" + + # NVIDIA HPC SDK (x86_64 tarball extracted dir) + - name: nvhpc_sdk_x86_64 + src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" + dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" + + - name: nvhpc_sdk_aarch64 + src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_aarch64_relpath | dirname }}/" + dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" + backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" diff --git a/discovery/roles/telemetry/tasks/check_pxe_changes.yml b/discovery/roles/telemetry/tasks/check_pxe_changes.yml new file mode 100644 index 0000000000..398c831961 --- /dev/null +++ b/discovery/roles/telemetry/tasks/check_pxe_changes.yml @@ -0,0 +1,88 @@ +# 
Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if current PXE mapping file exists + ansible.builtin.stat: + path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + delegate_to: localhost + register: current_pxe_file + +- name: Check if backup PXE mapping file exists + ansible.builtin.stat: + path: "{{ backup_pxe_mapping_ldms_path }}" + delegate_to: localhost + register: backup_pxe_file + +- name: Handle first discovery run (no backup exists) + when: + - current_pxe_file.stat.exists + - not backup_pxe_file.stat.exists + block: + - name: Create backup of PXE mapping file + ansible.builtin.copy: + src: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + dest: "{{ backup_pxe_mapping_ldms_path }}" + remote_src: true + mode: preserve + delegate_to: localhost + + - name: Set pxe_changed to false for first run + ansible.builtin.set_fact: + pxe_changed: false + + - name: Display first run message + ansible.builtin.debug: + msg: "{{ pxe_first_run_msg }}" + +- name: Compare PXE mapping files when backup exists + when: + - current_pxe_file.stat.exists + - backup_pxe_file.stat.exists + block: + - name: Get checksum of current PXE mapping file + ansible.builtin.stat: + path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + checksum_algorithm: sha256 + delegate_to: localhost + register: current_pxe_checksum + + - name: Get checksum of backup PXE mapping file + 
ansible.builtin.stat: + path: "{{ backup_pxe_mapping_ldms_path }}" + checksum_algorithm: sha256 + delegate_to: localhost + register: backup_pxe_checksum + + - name: Set pxe_changed based on checksum comparison + ansible.builtin.set_fact: + pxe_changed: "{{ current_pxe_checksum.stat.checksum != backup_pxe_checksum.stat.checksum }}" + + - name: Update backup PXE mapping file when changed + ansible.builtin.copy: + src: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + dest: "{{ backup_pxe_mapping_ldms_path }}" + remote_src: true + mode: preserve + delegate_to: localhost + when: pxe_changed | bool + + - name: Display PXE change status + ansible.builtin.debug: + msg: "{{ pxe_changed_msg if (pxe_changed | bool) else pxe_no_change_msg }}" + +- name: Set pxe_changed to false when PXE file is missing + ansible.builtin.set_fact: + pxe_changed: false + when: not current_pxe_file.stat.exists diff --git a/discovery/roles/telemetry/tasks/main.yml b/discovery/roles/telemetry/tasks/main.yml index c5a3dbefba..e4e3d1846a 100644 --- a/discovery/roles/telemetry/tasks/main.yml +++ b/discovery/roles/telemetry/tasks/main.yml @@ -28,6 +28,10 @@ when: - hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support'] block: + - name: Set NFS info fact + ansible.builtin.set_fact: + oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" + - name: Service cluster prerequisite ansible.builtin.include_tasks: telemetry_prereq.yml @@ -51,3 +55,13 @@ - name: Update ldms agg configuration ansible.builtin.include_tasks: update_ldms_agg_config.yml when: hostvars['localhost']['ldms_support'] + +- name: Check if PXE mapping has changed since last run + ansible.builtin.include_tasks: check_pxe_changes.yml + when: hostvars['localhost']['ldms_support'] + +- name: Restart LDMS configs for node addition and deletion + ansible.builtin.include_tasks: restart_ldms_configs.yml + when: + - hostvars['localhost']['ldms_support'] + - pxe_changed | default(false) | bool 
diff --git a/discovery/roles/telemetry/tasks/restart_ldms_configs.yml b/discovery/roles/telemetry/tasks/restart_ldms_configs.yml new file mode 100644 index 0000000000..0a176118f0 --- /dev/null +++ b/discovery/roles/telemetry/tasks/restart_ldms_configs.yml @@ -0,0 +1,151 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Load high availability config + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['input_project_dir'] }}/high_availability_config.yml" + name: ha_config + +- name: Set kube_vip fact + ansible.builtin.set_fact: + kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" + +- name: Test SSH connectivity to kube VIP only when PXE has changed + when: + - kube_vip | length > 0 + - pxe_changed | default(false) | bool + block: + - name: SSH test to kube VIP + ansible.builtin.command: + cmd: "ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes {{ kube_vip }} echo reachable" + delegate_to: localhost + register: kube_vip_ssh_check + changed_when: false + + - name: Set kube VIP reachable fact + ansible.builtin.set_fact: + kube_vip_reachable: "{{ kube_vip_ssh_check.rc == 0 }}" + + rescue: + - name: Display kube VIP unreachable message + ansible.builtin.debug: + msg: "{{ kube_vip_unreachable_msg }}" + + - name: Set kube VIP reachable fact to false + ansible.builtin.set_fact: + kube_vip_reachable: false + +- name: Restart 
LDMS aggregator when PXE has changed + when: pxe_changed | default(false) | bool + block: + - name: Check if LDMS aggregator is running on service k8s cluster + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: StatefulSet + name: nersc-ldms-aggr + namespace: "{{ telemetry_namespace }}" + delegate_to: "{{ kube_vip }}" + register: ldms_statefulset_info + failed_when: false + when: + - kube_vip_reachable | default(false) | bool # fact is unset when kube_vip is empty and the SSH check was skipped + + - name: Set LDMS running state + ansible.builtin.set_fact: + ldms_running: "{{ ldms_statefulset_info.resources is defined and ldms_statefulset_info.resources | length > 0 }}" + when: + - kube_vip_reachable | default(false) | bool + + - name: Check if LDMS conf ConfigMap file exists + ansible.builtin.stat: + path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-conf.yaml" + register: ldms_conf_file + when: ldms_running | default(false) | bool + + - name: Check if LDMS bin ConfigMap file exists + ansible.builtin.stat: + path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-bin.yaml" + register: ldms_bin_file + when: ldms_running | default(false) | bool + + - name: Apply LDMS configuration ConfigMap + kubernetes.core.k8s: + state: present + src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-conf.yaml" + namespace: "{{ telemetry_namespace }}" + delegate_to: "{{ kube_vip }}" + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + + - name: Apply LDMS scripts ConfigMap + kubernetes.core.k8s: + state: present + src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-bin.yaml" + namespace: "{{ telemetry_namespace }}" + delegate_to: "{{ kube_vip }}" + failed_when: false + when: + - ldms_running | default(false)
| bool + - ldms_bin_file.stat.exists | default(false) + + - name: Restart LDMS aggregator StatefulSet + kubernetes.core.k8s: + state: present + definition: + apiVersion: apps/v1 + kind: StatefulSet + metadata: + name: nersc-ldms-aggr + namespace: "{{ telemetry_namespace }}" + spec: + template: + metadata: + annotations: + kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 }}" + delegate_to: "{{ kube_vip }}" + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + - ldms_bin_file.stat.exists | default(false) + + - name: Wait for LDMS aggregator pod to be ready after restart + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ telemetry_namespace }}" + label_selectors: + - "app=nersc-ldms-aggr" + wait: true + wait_condition: + type: Ready + status: "True" + wait_timeout: 120 + delegate_to: "{{ kube_vip }}" + register: ldms_pod_ready + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + - ldms_bin_file.stat.exists | default(false) + + - name: Display LDMS aggregator restart status + ansible.builtin.debug: + msg: "{{ ldms_pod_ready_msg if (ldms_pod_ready.resources | default([]) | length > 0) else ldms_pod_not_ready_msg }}" + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + - ldms_bin_file.stat.exists | default(false) diff --git a/discovery/roles/telemetry/tasks/telemetry_prereq.yml b/discovery/roles/telemetry/tasks/telemetry_prereq.yml index d720c57822..7eb45a89ab 100644 --- a/discovery/roles/telemetry/tasks/telemetry_prereq.yml +++ b/discovery/roles/telemetry/tasks/telemetry_prereq.yml @@ -47,23 +47,24 @@ state: directory mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" -- name: Git clone for iDRAC Telemetry script +- name: Ensure iDRAC Telemetry scripting destination exists + ansible.builtin.file: + path: "{{ 
idrac_telemetry_scripting_git_clone_path }}" + state: directory + mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" + +- name: Copy iDRAC Telemetry Scripting to NFS share block: - - name: Checkout iDRAC Telemetry GitHub repo - ansible.builtin.git: - repo: "{{ idrac_telemetry_scripting_repo }}" + - name: Copy pre-cloned iDRAC Telemetry Scripting directory + ansible.builtin.copy: + src: "{{ idrac_telemetry_scripting_src_path }}/" dest: "{{ idrac_telemetry_scripting_git_clone_path }}" - version: "{{ idrac_telemetry_scripting_stable_commit }}" - update: false - register: clone_idrac_script - until: clone_idrac_script is succeeded - retries: "{{ max_retries }}" - delay: "{{ delay_count }}" + remote_src: true + mode: preserve rescue: - - name: Fail if iDRAC telemetry Git clone fails + - name: Fail if iDRAC telemetry copy fails ansible.builtin.fail: - msg: "{{ idrac_script_git_clone_error_msg.splitlines() | join(' ') }}" - when: clone_idrac_script is failed + msg: "{{ idrac_telemetry_scripting_copy_fail_msg.splitlines() | join(' ') }}" - name: Set kafka_support to true ansible.builtin.set_fact: diff --git a/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml b/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml index db4d4b1d3f..ee6c0c7d75 100644 --- a/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml +++ b/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml @@ -13,11 +13,6 @@ # limitations under the License. --- -- name: Install make - ansible.builtin.package: - name: make - state: present - - name: Verify values.yaml exists ansible.builtin.stat: path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/values.yaml" diff --git a/discovery/roles/telemetry/vars/main.yml b/discovery/roles/telemetry/vars/main.yml index 473fd74e19..69b0c0c0ac 100644 --- a/discovery/roles/telemetry/vars/main.yml +++ b/discovery/roles/telemetry/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. 
All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,14 +32,12 @@ telemetry_namespace: "telemetry" idrac_telemetry_k8s_name: idrac-telemetry # iDRAC Telemetry scripting repository -idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git" -idrac_telemetry_scripting_stable_commit: "f6999f5" +idrac_telemetry_scripting_src_path: "{{ oim_shared_path }}/omnia/telemetry/iDRAC-Telemetry-Scripting" idrac_telemetry_scripting_git_clone_path: "{{ service_cluster_idrac_telemetry_dir_path }}/iDRAC-Telemetry-Scripting" -idrac_script_git_clone_error_msg: | - Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_scripting_repo }} - to {{ idrac_telemetry_scripting_git_clone_path }} directory in NFS share. -max_retries: 10 -delay_count: 5 +idrac_telemetry_scripting_copy_fail_msg: | + Failed to copy iDRAC Telemetry Scripting from {{ idrac_telemetry_scripting_src_path }} + to {{ idrac_telemetry_scripting_git_clone_path }}. Please ensure prepare_oim.yml has been + executed successfully before running discovery. # Pre-built container images for iDRAC telemetry components # These default to your published images but can be overridden via telemetry_images @@ -254,3 +252,24 @@ common_templates: skip_when: "{{ cluster_id_present | default(false) }}" - src: 'telemetry/kustomization.yaml.j2' dest: 'kustomization.yaml' + +# Usage: check_pxe_changes.yml +backup_pxe_mapping_ldms_path: "/opt/omnia/telemetry/backup_pxe_mapping_ldms.csv" +pxe_first_run_msg: "First discovery run detected. Saving PXE mapping backup. LDMS restart not required." +pxe_no_change_msg: "PXE mapping file has not changed since last run. Skipping LDMS restart." +pxe_changed_msg: "PXE mapping file has changed. LDMS restart will be triggered." 
+ +# Usage: restart_ldms_configs.yml +kube_vip_unreachable_msg: >- + Kube VIP ({{ kube_vip }}) is not reachable via SSH. + There might be issues with the k8s cluster. + LDMS aggregator restart will be skipped. + + After discovery completes, manually restart the LDMS aggregator pod with: + + ssh {{ kube_vip }} + kubectl rollout restart statefulset nersc-ldms-aggr -n {{ telemetry_namespace }} + kubectl get pods -n {{ telemetry_namespace }} -l app=nersc-ldms-aggr -w + +ldms_pod_ready_msg: "LDMS aggregator pod is ready." +ldms_pod_not_ready_msg: "WARNING: LDMS aggregator pod did not become ready within 120s." diff --git a/input/config/aarch64/rhel/10.0/additional_packages.json b/input/config/aarch64/rhel/10.0/additional_packages.json index b01c3f78b5..0d6d9a0452 100644 --- a/input/config/aarch64/rhel/10.0/additional_packages.json +++ b/input/config/aarch64/rhel/10.0/additional_packages.json @@ -4,6 +4,21 @@ ] }, + "service_kube_control_plane_first": { + "cluster": [ + + ] + }, + "service_kube_control_plane": { + "cluster": [ + + ] + }, + "service_kube_node": { + "cluster": [ + + ] + }, "slurm_control_node": { "cluster": [ diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 2483775495..db95f2f5fb 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -9,10 +9,7 @@ {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", - "type": "iso", - "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.aarch64.rpm" - } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { @@ -32,7 +29,13 @@ {"package": "cuda-run", "type": "iso", "url": 
"https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux_sbsa.run" + }, + { + "package": "nvhpc_2025_2511_Linux_aarch64_cuda_13.0", + "type": "tarball", + "url": "https://developer.download.nvidia.com/hpc-sdk/25.11/nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz" } + ] }, "login_node":{ diff --git a/input/config/x86_64/rhel/10.0/default_packages.json b/input/config/x86_64/rhel/10.0/default_packages.json index 813f9ad993..6002894568 100644 --- a/input/config/x86_64/rhel/10.0/default_packages.json +++ b/input/config/x86_64/rhel/10.0/default_packages.json @@ -34,7 +34,8 @@ {"package": "wget", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "cloud-init", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "glibc-langpack-en", "type": "rpm", "repo_name": "x86_64_baseos"}, - {"package": "gedit", "type": "rpm", "repo_name": "epel"} + {"package": "gedit", "type": "rpm", "repo_name": "epel"}, + {"package": "docker.io/dellhpcomniaaisolution/image-build-el10", "tag": "1.0", "type": "image" } ] } } diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index afc073a19f..0ef4408a7f 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -33,7 +33,7 @@ { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm"} + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "service_kube_control_plane": { diff --git 
a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 9531239fd2..2b33b0de90 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -7,10 +7,7 @@ {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", - "type": "iso", - "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm" - } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { @@ -34,6 +31,11 @@ {"package": "cuda-run", "type": "iso", "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run" + }, + { + "package": "nvhpc_2025_2511_Linux_x86_64_cuda_13.0", + "type": "tarball", + "url": "https://developer.download.nvidia.com/hpc-sdk/25.11/nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz" } ] }, diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 55583e1a07..8428e6d94c 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -18,7 +18,20 @@ # ================================ # VARIABLE DETAILS # ================================ -# 1. user_repo_url_x86_64 +# 1. 
user_registry +#-------------------------- +# Configuration for user registry to configure additional images in Pulp +# Fields: +# host : Registry IP and port in format "IP:port" +# cert_path : Path to SSL certificate file (.crt) - Required only if host is using HTTPS +# key_path : Path to SSL private key file (.key) - Required only if host is using HTTPS +# Notes: +# - If host is HTTPS, cert_path and key_path are required +# - If host is HTTP, cert_path and key_path can be left empty +# - cert_path should point to .crt files only +# - key_path should point to .key files only +# - cert and key paths are accessed from within the omnia_core container +# 2. user_repo_url_x86_64 #-------------------------- # Optional list of user-defined repository URLs for x86_64 architecture. # Each entry can include: url, gpgkey, sslcacert, sslclientkey, sslclientcert, name, policy. @@ -36,7 +49,7 @@ # - Omit SSL fields entirely if SSL is not in use. # - Its a madatory field in case of slurm_custom with name as '_slurm_custom' # -# 2. user_repo_url_aarch64 +# 3. user_repo_url_aarch64 #--------------------------- # Same as above but for aarch64 architecture. 
# @@ -106,7 +119,9 @@ # ================================ # VARIABLES # ================================ -# Example +# user_registry: +# - { host: "172.16.107.254:4000", cert_path: "/opt/omnia/domain.crt", key_path: "/opt/omnia/domain.key" } +user_registry: # user_repo_url_x86_64: # - { url: "", gpgkey: "", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_slurm_custom" } user_repo_url_x86_64: @@ -123,10 +138,12 @@ omnia_repo_url_rhel_x86_64: - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"} - - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key'", name: "cri-o"} + - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "cri-o"} + - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/repodata/repomd.xml.key", name: "doca"} omnia_repo_url_rhel_aarch64: - { url: "https://download.docker.com/linux/centos/10/aarch64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/aarch64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} + - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/", gpgkey: 
"https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/repodata/repomd.xml.key", name: "doca"} # Example: # additional_repos_x86_64: # - { url: "https://rpm.grafana.com/", gpgkey: "", name: "grafana" } diff --git a/input/omnia_config.yml b/input/omnia_config.yml index 3c4b3dbc35..943d70e530 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -27,27 +27,50 @@ # Storage name corresponding to the NFS share to be used by slurm cluster # This should match with exactly with a entry in storage_config.yml +# skip_merge +# Variable indicates whether a specific configuration file path +# under config_sources should be used as-is without merging +# If skip_merge is set to true for a configuration source path, +# that configuration file will be applied directly +# without merging with defaults or existing configurations +# It accepts true and false values +# Default value is false + # config_sources # defines how the Slurm configuration files are provided to the cluster. 
# : # or # Supply the configuration values directly as a key–value map -# Supply the absolute path to a custom configuration file on the OIM server +# Supply the absolute path to a custom configuration file # The conf files supported by slurm are # slurm # cgroup # slurmdbd # gres +# acct_gather +# helpers +# job_container # mpi +# oci +# topology +# burst_buffer # Thes files will be written into the slurm_config directory with .conf suffix slurm_cluster: - cluster_name: slurm_cluster nfs_storage_name: nfs_slurm + # skip_merge: true # config_sources: # slurm: # SlurmctldTimeout: 60 # SlurmdTimeout: 150 + # NodeName: + # - NodeName: newnode1 + # CPUs: 16 + # RealMemory: 64000 + # - NodeName: newnode2 + # CPUs: 16 + # RealMemory: 64000 # cgroup: # CgroupPlugin: autodetect # ConstrainCores: True @@ -62,7 +85,6 @@ slurm_cluster: # cgroup: /path/to/custom_cgroup.conf # slurmdbd: /path/to/custom_slurmdbd.conf # gres: /path/to/custom_gres.conf - # mpi: /path/to/custom_mpi.conf # ----------------------------SERVICE K8S------------------------------------------------------ # For service k8s cluster below parameters are required,(List) diff --git a/input/storage_config.yml b/input/storage_config.yml index 48eac2d5cc..399bf42fd6 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,28 +19,23 @@ # -----------------------------Powervault------------------------------------------- # powervault_config -# ip: ipv4 -# A list of PowerVault controller IP addresses used for iSCSI target discovery and login. -# In this configuration, a single controller portal is provided. - -# port: -# Defines the TCP port for the iSCSI target service. 
-# Port 3260 is the standard port for iSCSI communication. +# Mandatory when using PowerVault for persistent storage. +# Below parameters are mandatory when powervault_config is defined + # ip: A list of PowerVault controller ipv4 addresses used for iSCSI target discovery and login. + # iscsi_initiator: Specifies the InitiatorName used by the host when connecting to the iSCSI target. This IQN uniquely identifies the host to the storage array. + # volume_id: This is the unique WWN/identifier for the specific volume that should be used for persistent storage. This value is used for multipath scanning to select the correct mapped device. -# isci_initiators: -# Specifies the InitiatorName used by the host when connecting to the iSCSI target. -# This IQN uniquely identifies the host to the storage array. +# Below are the optional parameters when powervault_config is defined + # port: Defines the TCP port for the iSCSI target service. When port is not specified, default port used will be 3260 -# volume_id: -# This is the unique WWN/identifier for the -# specific volume that should be used for persistent storage. -# The script uses this value during multipath scanning to select the correct mapped device +# Below is an example on how to configure powervault_config +# In this configuration, a single controller portal is provided. 
#powervault_config: # ip: # - 172.1.2.3 # port: 3260 -# isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7 +# iscsi_initiator: iqn.2025-01.com.dell:scontrol-node # volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 diff --git a/input_validation/roles/validate_input/tasks/main.yml b/input_validation/roles/validate_input/tasks/main.yml index ff11c79950..de6e9f48e9 100644 --- a/input_validation/roles/validate_input/tasks/main.yml +++ b/input_validation/roles/validate_input/tasks/main.yml @@ -17,20 +17,26 @@ omnia_run_tags: "{{ ansible_run_tags | default([]) }}" when: omnia_run_tags is not defined +- name: Set validation messages + ansible.builtin.set_fact: + validation_success_msg: "{{ messages.validation_success }}" + validation_error_msg: "{{ messages.validation_error }}" + - name: Validate omnia input config - vars: - # Note: When running a specific playbook without tags ansible run tags will default to ["all"], thus if two or more tags are present - # then the "all" tag should be removed so that only the config files related to that playbook are validated. 
- input_validate_tags: "{{ omnia_run_tags | default([]) | difference(['all']) if (omnia_run_tags | length) >= 2 - else omnia_run_tags | default([]) }}" - validate_input: - omnia_base_dir: "{{ (input_dir + '/../') | ansible.builtin.realpath }}" - project_name: "{{ project_name }}" - tag_names: "{{ input_validate_tags }}" - module_utils_path: "{{ (role_path + '/../../../common/library/module_utils/') | ansible.builtin.realpath }}" - register: validation_status - when: (input_validate_tags | length) > 0 + block: + - name: Run validation + validate_input: + omnia_base_dir: "{{ (input_dir + '/../') | ansible.builtin.realpath }}" + project_name: "{{ project_name }}" + tag_names: "{{ input_validate_tags }}" + module_utils_path: "{{ (role_path + '/../../../common/library/module_utils/') | ansible.builtin.realpath }}" + register: validation_status + when: (input_validate_tags | length) > 0 -- name: Debug validation status - ansible.builtin.debug: - msg: "{{ messages.validation_success }}" + - name: Debug validation status + ansible.builtin.debug: + msg: "{{ validation_success_msg }}" + rescue: + - name: Failed due to validation failure + ansible.builtin.fail: + msg: "{{ validation_error_msg }}" diff --git a/input_validation/roles/validate_input/vars/main.yml b/input_validation/roles/validate_input/vars/main.yml index 3c6f2b1aff..698eb4da29 100644 --- a/input_validation/roles/validate_input/vars/main.yml +++ b/input_validation/roles/validate_input/vars/main.yml @@ -16,5 +16,11 @@ input_dir: "{{ hostvars['localhost']['input_project_dir'] }}" project_name: "{{ hostvars['localhost']['project_name'] }}" +# Note: When running a specific playbook without tags ansible run tags will default to ["all"], thus if two or more tags are present +# then the "all" tag should be removed so that only the config files related to that playbook are validated. 
+input_validate_tags: "{{ omnia_run_tags | default([]) | difference(['all']) if (omnia_run_tags | length) >= 2 + else omnia_run_tags | default([]) }}" + messages: validation_success: "Successfully validated Omnia input config file(s)" + validation_error: "Input validation failed. Please check the validation output above for detailed error information." diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml index cb394fa845..963715b5e3 100644 --- a/local_repo/local_repo.yml +++ b/local_repo/local_repo.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local @@ -114,7 +117,7 @@ connection: ssh gather_facts: false tasks: - - name: Read network_spec vars + - name: Validate Pulp Container and Endpoint ansible.builtin.include_role: name: pulp_validation diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml new file mode 100644 index 0000000000..6f54e5f45f --- /dev/null +++ b/local_repo/pulp_cleanup.yml @@ -0,0 +1,101 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# Pulp Cleanup Playbook - Clean Architecture +# +# Usage: +# # Repository cleanup (include architecture prefix) +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel,aarch64_epel" +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_appstream" +# ansible-playbook pulp_cleanup.yml -e "cleanup_containers=nginx,redis" +# ansible-playbook pulp_cleanup.yml -e "cleanup_files=git,chart-0.48.0" +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel" -e "cleanup_containers=nginx" -e "force=true" +# +# # Examples: x86_64_epel, aarch64_epel, x86_64_appstream, aarch64_baseos +# # Note: Use architecture prefix (x86_64_ or aarch64_) for repository names + +- name: Pulp Cleanup + hosts: localhost + connection: local + gather_facts: false + + pre_tasks: + # Step 1: Input Validation + - name: Validate input - at least one cleanup type must be specified + ansible.builtin.assert: + that: + - (cleanup_repos | default([]) | length > 0) or (cleanup_containers | default([]) | length > 0) or (cleanup_files | default([]) | length > 0) + fail_msg: | + No cleanup items specified.
Please provide at least one of: + cleanup_repos: ['repo1', 'repo2'] + cleanup_containers: ['container1', 'container2'] + cleanup_files: ['file1', 'file2'] + + # Step 2: User Confirmation + - name: Parse cleanup lists + ansible.builtin.set_fact: + repo_list: "{{ cleanup_repos.split(',') | map('trim') | list if cleanup_repos is string else (cleanup_repos | default([])) }}" + container_list: "{{ cleanup_containers.split(',') | map('trim') | list if cleanup_containers is string else (cleanup_containers | default([])) }}" + file_list: "{{ cleanup_files.split(',') | map('trim') | list if cleanup_files is string else (cleanup_files | default([])) }}" + + - name: Display cleanup summary + ansible.builtin.debug: + msg: + - "========== CLEANUP SUMMARY ==========" + - "Repositories : {{ (repo_list | default([]) | join(', ')) if repo_list | default([]) | length > 0 else 'None' }}" + - "Containers : {{ (container_list | default([]) | join(', ')) if cleanup_containers | default([]) | length > 0 else 'None' }}" + - "Files : {{ (file_list | default([]) | join(', ')) if cleanup_files | default([]) | length > 0 else 'None' }}" + - "=====================================" + - name: Get user confirmation + ansible.builtin.pause: + prompt: | + + ⚠️ WARNING: This will permanently delete the specified artifacts. + This action cannot be undone. 
+ + Type 'yes' to continue or press Ctrl+C to abort + register: user_input + when: not (force | default(false)) | bool + + - name: Abort if not confirmed + ansible.builtin.fail: + msg: "Cleanup cancelled by user" + when: + - not (force | default(false)) | bool + - user_input.user_input | default('') | lower != 'yes' + + tasks: + # Step 3: Call Python Module + - name: Execute cleanup + pulp_cleanup: + cleanup_repos: "{{ repo_list | default([]) }}" + cleanup_containers: "{{ container_list | default([]) }}" + cleanup_files: "{{ file_list | default([]) }}" + base_path: "{{ base_path | default('/opt/omnia/log/local_repo') }}" + repo_store_path: "{{ repo_store_path | default('/opt/omnia') }}" + register: cleanup_result + + post_tasks: + # Step 4: Display Results + - name: Display cleanup results + ansible.builtin.debug: + msg: "{{ cleanup_result.pretty_table_lines }}" + + - name: Display summary + ansible.builtin.debug: + msg: + - "========== CLEANUP COMPLETED ==========" + - "Total: {{ cleanup_result.total }}, Success: {{ cleanup_result.success_count }}, Failed: {{ cleanup_result.failed_count }}" + - "Status file: {{ cleanup_result.status_file }}" + - "========================================" diff --git a/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml b/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml index 9df565f229..3f44ccdeb0 100644 --- a/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml +++ b/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -28,8 +28,8 @@ local_repo_config_path: "{{ local_repo_config_path }}" arch: "{{ item.arch }}" overall_status_dict: {} - user_reg_cred_input: "{{ user_reg_cred_input }}" - user_reg_key_path: "{{ user_reg_key_path }}" + # user_reg_cred_input: "{{ user_reg_cred_input }}" + # user_reg_key_path: "{{ user_reg_key_path }}" omnia_credentials_yaml_path: "{{ omnia_credentials_yaml_path }}" omnia_credentials_vault_path: "{{ omnia_credentials_vault_path }}" nthreads: "{{ (local_repo_py_module_vars[item.key].nthreads | default(local_repo_py_module_vars.default_vars.nthreads)) }}" diff --git a/local_repo/roles/parse_and_download/vars/main.yml b/local_repo/roles/parse_and_download/vars/main.yml index 90141225b6..74b24cd1c2 100644 --- a/local_repo/roles/parse_and_download/vars/main.yml +++ b/local_repo/roles/parse_and_download/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -27,8 +27,8 @@ local_repo_config_path: "{{ project_input_path }}/local_repo_config.yml" sw_config_json_path: "{{ project_input_path }}/software_config.json" functional_groups_config_path: "{{ nfs_shared_path }}/.data/functional_groups_config.yml" user_json_file: "{{ project_input_path }}/software_config.json" -user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml" -user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key" +# user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml" +# user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key" omnia_credentials_yaml_path: "{{ project_input_path }}/omnia_config_credentials.yml" omnia_credentials_vault_path: "{{ project_input_path }}/.omnia_config_credentials_key" clean_rpms: true diff --git a/local_repo/roles/validation/tasks/check_additional_packages_images.yml b/local_repo/roles/validation/tasks/check_additional_packages_images.yml new file mode 100644 index 0000000000..3b5663095b --- /dev/null +++ b/local_repo/roles/validation/tasks/check_additional_packages_images.yml @@ -0,0 +1,50 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Load local_repo_config.yml + ansible.builtin.include_vars: + file: "{{ local_repo_config_file }}" + name: local_repo_config + +- name: Check if additional_packages is enabled in software_config + ansible.builtin.set_fact: + additional_packages_enabled: "{{ software | selectattr('name', 'equalto', 'additional_packages') | list | length > 0 }}" + +- name: Get additional_packages architectures + ansible.builtin.set_fact: + additional_packages_archs: "{{ (software | selectattr('name', 'equalto', 'additional_packages') | first).arch | default([]) }}" + when: additional_packages_enabled + +- name: Check for image packages in additional_packages.json + when: additional_packages_enabled + block: + - name: Initialize image found flag + ansible.builtin.set_fact: + has_image_packages: false + + - name: Check each architecture for image packages + ansible.builtin.include_tasks: check_images_per_arch.yml + loop: "{{ additional_packages_archs }}" + loop_control: + loop_var: arch_item + when: additional_packages_archs is defined + + - name: Display warning if images found in additional_packages.json but user_registry not defined + ansible.builtin.pause: + prompt: "{{ additional_packages_image_warning_msg }}" + seconds: "{{ warning_wait_time_warning }}" + when: + - has_image_packages | bool + - local_repo_config.user_registry is not defined or local_repo_config.user_registry is none or local_repo_config.user_registry | length == 0 diff --git a/local_repo/roles/validation/tasks/check_images_per_arch.yml b/local_repo/roles/validation/tasks/check_images_per_arch.yml new file mode 100644 index 0000000000..aa20840e3e --- /dev/null +++ b/local_repo/roles/validation/tasks/check_images_per_arch.yml @@ -0,0 +1,43 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Set additional_packages.json path for {{ arch_item }} + ansible.builtin.set_fact: + additional_packages_path: "{{ project_input_path }}/config/{{ arch_item }}/{{ cluster_os_type }}/{{ cluster_os_version }}/additional_packages.json" + +- name: Check if additional_packages.json exists for {{ arch_item }} + ansible.builtin.stat: + path: "{{ additional_packages_path }}" + register: additional_packages_file + +- name: Load and check additional_packages.json for {{ arch_item }} + when: additional_packages_file.stat.exists + block: + - name: Load additional_packages.json + ansible.builtin.include_vars: + file: "{{ additional_packages_path }}" + name: additional_packages_data + + - name: Check for image type packages in additional_packages + ansible.builtin.set_fact: + has_image_packages: true + when: > + additional_packages_data | dict2items | + selectattr('value.cluster', 'defined') | + map(attribute='value.cluster') | + flatten | + selectattr('type', 'defined') | + selectattr('type', 'equalto', 'image') | + list | length > 0 diff --git a/local_repo/roles/validation/tasks/main.yml b/local_repo/roles/validation/tasks/main.yml index 6087ab200b..0f578af349 100644 --- a/local_repo/roles/validation/tasks/main.yml +++ b/local_repo/roles/validation/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -22,6 +22,9 @@ - name: Validate software_config.json ansible.builtin.include_tasks: validate_software_config_json.yml +- name: Check for images in additional_packages + ansible.builtin.include_tasks: check_additional_packages_images.yml + - name: Validate metadata ansible.builtin.include_tasks: validate_metadata.yml @@ -36,15 +39,14 @@ - name: Check user registry reachability check_user_registry: config_file: "{{ local_repo_config_file }}" - user_reg_cred_input: "{{ user_reg_cred_input }}" - user_reg_key_path: "{{ user_reg_key_path }}" + # user_reg_cred_input: "{{ user_reg_cred_input }}" + # user_reg_key_path: "{{ user_reg_key_path }}" timeout: "{{ time_out }}" register: registry_check_result -- name: Warning - Display unreachable registries - ansible.builtin.pause: - prompt: "{{ registry_check_result.unreachable_registries | join(', ') }}\n{{ user_registry_msg }}" - seconds: "{{ warning_wait_time_warning }}" +- name: Fail - Unreachable registries detected + ansible.builtin.fail: + msg: "{{ unreachable_registries_fail_msg }}" when: - registry_check_result.unreachable_registries is defined - registry_check_result.unreachable_registries | length > 0 diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml index 83c0523e47..87e8733498 100644 --- a/local_repo/roles/validation/vars/main.yml +++ b/local_repo/roles/validation/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -44,8 +44,8 @@ softwares_invalid_msg: "Invalid software_name(s) found: {{ softwares_list | diff # Usage: main.yml nfs_shared_path: "/opt/omnia" local_repo_config_file: "{{ project_input_path }}/local_repo_config.yml" -user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml" -user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key" +# user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml" +# user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key" var_mount_percentage_limit: 80 var_mount_overuse_msg: | [WARNING] local_repo.yml may fail as /var mount usage has exceeded the limit of {{ var_mount_percentage_limit }}%. @@ -144,8 +144,12 @@ user_registry_fail_msg: "Failed. Please ensure user_registry is non empty list a check if there is any indentation error in {{ project_input_path }}/local_repo_config.yml" user_registry_fail_host_cert_path_msg: "Failed. Each item in user_registry should have 'host' and 'cert_path' keys defined" time_out: 30 -user_registry_msg: "Above host registries are not reachable. If the user registry is not accessible from the Omnia Infrastructure Manager, Omnia will download all the images for the software listed in software_config.json." # noqa: yaml[line-length] +user_registry_msg: "Above user registries is/are not reachable. Please make sure the user registry is accessible from the Omnia Infrastructure Manager." # noqa: yaml[line-length] +unreachable_registries_fail_msg: "Unreachable registries detected: {{ registry_check_result.unreachable_registries | join(', ') }}. {{ user_registry_msg }} Please check registry connectivity and configuration before proceeding." # noqa: yaml[line-length] cert_path_failure_msg: "Certificate file path {{ item.item.cert_path }} does not exist on the Omnia Infrastructure Manager for host {{ item.item.host }}. 
Please verify that correct cert_path is given in {{ project_input_path }}/local_repo_config.yml" # noqa: yaml[line-length] +additional_packages_image_warning_msg: | + WARNING: additional_packages.json contains packages of type 'image', but 'user_registry' is not defined in local_repo_config.yml. + Please specify 'user_registry' in local_repo_config.yml if these images are coming from a user registry. # Usage: validate_user_repo_url.yml user_repo_url_fail_msg: "Failed. Please ensure user_repo_url is proper and should not have jinja variables. diff --git a/omnia.sh b/omnia.sh index c997d2ff97..530c168e7d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright © 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -52,6 +52,266 @@ is_local_ip() { fi } +# Version configuration variables +OMNIA_CORE_CONTAINER_TAG="2.1" # Default container tag +OMNIA_VERSION="" # Will be read from metadata +TARGET_OMNIA_VERSION="" # Target version for upgrade +TARGET_CONTAINER_TAG="" # Target container tag for upgrade + +# Centralized version list (in chronological order) +ALL_OMNIA_VERSIONS=("2.0.0.0" "2.1.0.0") + +# Container-side paths (used inside podman exec commands) +CONTAINER_INPUT_DIR="/opt/omnia/input" +CONTAINER_BACKUPS_DIR="/opt/omnia/backups" +CONTAINER_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml" + +# Function to get available upgrade versions (higher than current) +get_available_upgrade_versions() { + local current_version="$1" + local available_versions=() + local version_descriptions=() + + # Find versions higher than current + local found_current=false + for version in "${ALL_OMNIA_VERSIONS[@]}"; do + if [ "$version" = "$current_version" ]; then + found_current=true + continue + fi + + if [ "$found_current" = true ]; then + 
available_versions+=("$version") + + # Generate description based on upgrade type + local current_tag=$(get_container_tag_from_version "$current_version") + local target_tag=$(get_container_tag_from_version "$version") + + if [ "$current_tag" = "$target_tag" ]; then + version_descriptions+=("Patch upgrade to $version (container restart only)") + else + version_descriptions+=("Major upgrade to $version (container swap required)") + fi + fi + done + + # Return arrays + printf '%s\n' "${available_versions[@]}" + printf '%s\n' "${version_descriptions[@]}" +} + +# Function to get available rollback versions (lower than current) +get_available_rollback_versions() { + local current_version="$1" + local available_versions=() + + # Find versions lower than current + for version in "${ALL_OMNIA_VERSIONS[@]}"; do + if [ "$version" = "$current_version" ]; then + break + fi + available_versions+=("$version") + done + + # Return array (reverse order for rollback - newest first) + local reversed_versions=() + for ((i=${#available_versions[@]}-1; i>=0; i--)); do + reversed_versions+=("${available_versions[$i]}") + done + + printf '%s\n' "${reversed_versions[@]}" +} + +# Function to perform same-tag rollback (container restart only) +rollback_same_tag() { + local target_version="$1" + local current_version="$2" + + echo "[INFO] [ROLLBACK] Phase: Same-Tag Rollback" + echo "[INFO] [ROLLBACK] Rolling back to $target_version within same container tag" + + # Verify container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ROLLBACK] Container is not running for same-tag rollback" + return 1 + fi + + echo "[INFO] [ROLLBACK] Updating metadata to version $target_version" + + # Update version metadata + if ! podman exec -u root omnia_core bash -c " + set -e + if [ ! 
-f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE' + else + echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE' + fi + "; then + echo "[ERROR] [ROLLBACK] Failed to update metadata version" + echo "[ERROR] [ROLLBACK] Rollback failed: Could not update version metadata" + return 1 + fi + + echo "[INFO] [ROLLBACK] Restarting container to apply changes..." + + # Restart container to apply changes + if ! systemctl restart omnia_core.service; then + echo "[ERROR] [ROLLBACK] Failed to restart container service" + echo "[ERROR] [ROLLBACK] Rollback failed: Container restart failed" + return 1 + fi + + # Wait for container to be healthy after restart + echo "[INFO] [ROLLBACK] Waiting for container health check after restart (30s)" + local health_timeout=30 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ROLLBACK] Container is healthy after restart" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." 
+ done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo "[ERROR] [ROLLBACK] Container failed to become healthy within 30 seconds after restart" + echo "[ERROR] [ROLLBACK] Rollback failed: Container health check failed" + return 1 + fi + + # Verify version update + local updated_version=$(get_current_omnia_version) + if [ "$updated_version" != "$target_version" ]; then + echo "[ERROR] [ROLLBACK] Version update verification failed" + echo "[ERROR] [ROLLBACK] Expected: $target_version, Found: $updated_version" + return 1 + fi + + echo "[INFO] [ROLLBACK] Same-tag rollback completed successfully" + echo "[INFO] [ROLLBACK] Version rolled back to: $target_version" + return 0 +} + +# Function to validate container image availability and show build instructions +validate_container_image() { + local target_version="$1" + local target_container_tag="$2" + local operation="${3:-upgrade}" + + echo -e "${BLUE}Validating target container image: omnia_core:$target_container_tag${NC}" + if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then + echo -e "${RED}ERROR: Target image missing locally: omnia_core:$target_container_tag${NC}" + echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" + echo -e "1. Clone the Omnia Artifactory repository:" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-$target_version" + echo -e "2. Navigate to the repository directory:" + echo -e " cd omnia-artifactory" + echo -e "3. 
Build the core image locally (loads into local Podman by default):"
+        echo -e " ./build_images.sh core core_tag=$target_container_tag omnia_branch=$target_version"
+        echo -e "Then re-run:"
+        echo -e " ./omnia.sh --$operation"
+        return 1
+    fi
+
+    echo -e "${GREEN}✓ Target image available locally: omnia_core:$target_container_tag${NC}"
+    return 0
+}
+
+# Print the omnia_core container tag for an Omnia version: 2.0.x maps to 1.0, anything else to MAJOR.MINOR
+get_container_tag_from_version() {
+    local version="$1"
+    case "$version" in
+        2.0.*)
+            echo "1.0"
+            ;;
+        *)
+            echo "$version" | awk -F. '{print $1"."$2}'
+            ;;
+    esac
+}
+
+# Print the omnia_version value from the container metadata file (empty line when omnia_core is not running)
+get_current_omnia_version() {
+    if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        podman exec omnia_core cat "$CONTAINER_METADATA_FILE" 2>/dev/null | awk '/^omnia_version:/ {print $2; exit}' | tr -d '"'
+    else
+        echo ""
+    fi
+}
+
+show_post_upgrade_instructions() {
+    local upgraded_version="$1"
+
+    echo ""
+    echo -e "${YELLOW}================================================================================${NC}"
+    echo -e "${YELLOW} IMPORTANT POST-UPGRADE STEP${NC}"
+    echo -e "${YELLOW}================================================================================${NC}"
+    echo ""
+    echo -e "${GREEN}✓ Omnia core container has been successfully upgraded${NC}"
+    echo -e "${GREEN}✓ Version updated to: $upgraded_version${NC}"
+    echo ""
+    echo -e "${BLUE}NEXT REQUIRED ACTION:${NC}"
+    echo -e "${YELLOW}You must now run the upgrade playbook inside the omnia_core container:${NC}"
+    echo ""
+    echo -e "${GREEN}podman exec -it omnia_core ansible-playbook /omnia/upgrade/upgrade_omnia.yml${NC}"
+    echo ""
+    echo -e "${BLUE}This playbook will:${NC}"
+    echo -e "• Update input files"
+    echo -e "• Update internal configurations"
+    echo ""
+    echo -e "${YELLOW}Note: Run this command after the container is fully healthy and stable${NC}"
+    echo -e
"${YELLOW}================================================================================${NC}" + echo "" +} + +# Host-side paths (initialized dynamically after omnia_path is set) +OMNIA_INPUT_DIR="" +OMNIA_METADATA_DIR="" +OMNIA_METADATA_FILE="" + +update_metadata_upgrade_backup_dir() { + local backup_dir="$1" + + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running" + return 1 + fi + + podman exec -u root omnia_core bash -c " + set -e + if [ ! -f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^upgrade_backup_dir:' '$CONTAINER_METADATA_FILE'; then + sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$CONTAINER_METADATA_FILE' + else + echo 'upgrade_backup_dir: ${backup_dir}' >> '$CONTAINER_METADATA_FILE' + fi + " +} + +# Resolve the upgrade guard lock path (container or host shared path) +get_upgrade_guard_lock_path() { + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + echo "$upgrade_guard_lock_host" +} + check_internal_nfs_export() { nfs_server_ip=$1 nfs_server_share_path=$2 @@ -130,7 +390,7 @@ setup_omnia_core() { # It removes the container and performs the necessary cleanup steps. 
cleanup_omnia_core() { # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}" echo "$critical_running" @@ -149,6 +409,11 @@ cleanup_omnia_core() { # Fetch the configuration from the Omnia core container. fetch_config + # Clear upgrade guard lock if present (shared path visible to container and host) + local upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_path" + # Remove the container remove_container @@ -238,7 +503,7 @@ cleanup_config(){ # Otherwise, it prints an error message. remove_container() { # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. 
There are other critical service containers still running:${NC}" echo "$critical_running" @@ -530,6 +795,11 @@ init_container_config() { # Create the pulp_ha directory if it does not exist. echo -e "${GREEN} Creating the pulp HA directory if it does not exist.${NC}" mkdir -p "$omnia_path/omnia/pulp/pulp_ha" + + # Initialize host-side path variables based on user-provided omnia_path + OMNIA_INPUT_DIR="$omnia_path/omnia/input" + OMNIA_METADATA_DIR="$omnia_path/omnia/.data" + OMNIA_METADATA_FILE="$omnia_path/omnia/.data/oim_metadata.yml" } @@ -587,6 +857,11 @@ fetch_config() { else echo -e "${GREEN} Successfully fetched data from metadata file.${NC}" fi + + # Initialize host-side path variables based on fetched omnia_path + OMNIA_INPUT_DIR="$omnia_path/omnia/input" + OMNIA_METADATA_DIR="$omnia_path/omnia/.data" + OMNIA_METADATA_FILE="$omnia_path/omnia/.data/oim_metadata.yml" } # Validates the OIM (Omnia Infrastructure Manager) by checking if the hostname is @@ -722,7 +997,7 @@ Description=${container_name^} Container [Container] ContainerName=${container_name} HostName=${container_name} -Image=${container_name}:1.1 +Image=${container_name}:2.1 Network=host # Capabilities @@ -757,9 +1032,9 @@ EOF # Create the .data directory if it does not exist. # This is where the oim_metadata.yml file is stored. echo -e "${GREEN} Creating the .data directory if it does not exist.${NC}" - mkdir -p "$omnia_path/omnia/.data" + mkdir -p "$OMNIA_METADATA_DIR" - oim_metadata_file="$omnia_path/omnia/.data/oim_metadata.yml" + oim_metadata_file="$OMNIA_METADATA_FILE" if [ ! 
-f "$oim_metadata_file" ]; then echo -e "${GREEN} Creating oim_metadata file${NC}" @@ -781,6 +1056,13 @@ EOF echo "nfs_type: $nfs_type" } >> "$oim_metadata_file" fi + else + sed -i '/^upgrade_backup_dir:/d' "$oim_metadata_file" >/dev/null 2>&1 || true + if grep -q '^omnia_version:' "$oim_metadata_file"; then + sed -i "s/^omnia_version:.*/omnia_version: $omnia_release/" "$oim_metadata_file" >/dev/null 2>&1 || true + else + echo "omnia_version: $omnia_release" >> "$oim_metadata_file" + fi fi # --- Remove old service if exists --- @@ -811,7 +1093,7 @@ EOF if ! podman ps --format '{{.Names}}' | grep -qw "$container_name"; then echo -e "${RED}Error: $container_name container failed to start.${NC}" - rm -rf "$omnia_path/omnia/.data/oim_metadata.yml" + rm -rf "$OMNIA_METADATA_FILE" exit 1 fi @@ -832,17 +1114,17 @@ post_setup_config() { chmod 757 "$omnia_path/omnia/tmp/.ansible/tmp" # Create the input directory if it does not exist. echo -e "${GREEN} Creating the input directory if it does not exist.${NC}" - mkdir -p "$omnia_path/omnia/input/" + mkdir -p "$OMNIA_INPUT_DIR/" # Create the default.yml file if it does not exist. # This file contains the name of the project. - if [ ! -f "$omnia_path/omnia/input/default.yml" ]; then + if [ ! -f "$OMNIA_INPUT_DIR/default.yml" ]; then echo -e "${BLUE} Creating default.yml file.${NC}" { echo "# This file defines the project name." echo "# The name of the project should be set in a directory under input." 
echo "project_name: project_default" - } >> "$omnia_path/omnia/input/default.yml" + } >> "$OMNIA_INPUT_DIR/default.yml" fi # Copy input files from /omnia to /opt/omnia/project_default/ inside omnia_core container @@ -880,6 +1162,7 @@ validate_nfs_server() { } init_ssh_config() { + mkdir -p "$HOME/.ssh" touch $HOME/.ssh/known_hosts # Add entry to /root/.ssh/known_hosts file to prevent errors caused by Known host ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1 # Remove existing entry if it exists @@ -920,71 +1203,43 @@ start_container_session() { -------------------------------------------------------------------------------------------------------------------------------------------------- ${NC}" + init_ssh_config + # Entering Omnia-core container ssh omnia_core } show_help() { - echo "Usage: $0 [--install | --uninstall | --version | --help]" + echo "Usage: $0 [--install | --uninstall | --upgrade | --rollback | --version | --help]" echo " -i, --install Install and start the Omnia core container" echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" + echo " --upgrade Upgrade the Omnia core container to newer version + echo " --rollback Rollback the Omnia core container to previous version echo " -v, --version Display Omnia version information" echo " -h, --help More information about usage" } install_omnia_core() { - local omnia_core_tag="1.1" - local omnia_core_registry="docker.io/dellhpcomniaaisolution" - - # Check if local omnia_core:1.1 exists - if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" - # Check if latest exists for backward compatibility - elif podman inspect omnia_core:latest >/dev/null 2>&1; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}" - # Tag it as 1.1 for consistency - podman tag omnia_core:latest omnia_core:${omnia_core_tag} - else - # Try pulling from Docker Hub with 
retry logic - echo -e "${BLUE}Omnia core image not found locally. Attempting to pull from Docker Hub...${NC}" - pull_success=false - max_retries=3 - retry_count=0 - - while [ $retry_count -lt $max_retries ]; do - retry_count=$((retry_count + 1)) - echo -e "${BLUE}Attempt $retry_count of $max_retries...${NC}" - - if podman pull ${omnia_core_registry}/omnia_core:${omnia_core_tag} 2>/dev/null; then - echo -e "${GREEN}✓ Successfully pulled omnia_core:${omnia_core_tag} from Docker Hub.${NC}" - # Tag it without registry prefix for local use - podman tag ${omnia_core_registry}/omnia_core:${omnia_core_tag} omnia_core:${omnia_core_tag} - pull_success=true - break - else - if [ $retry_count -lt $max_retries ]; then - echo -e "${YELLOW}Pull failed. Retrying in 5 seconds...${NC}" - sleep 5 - fi - fi - done - - if [ "$pull_success" = false ]; then - echo -e "${RED}ERROR: Failed to pull omnia_core image after $max_retries attempts.${NC}" - echo "" - echo -e "${YELLOW}To resolve this, please follow these steps:${NC}" - echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container" - echo -e "2. Navigate to the repository directory:" - echo -e " cd omnia-artifactory" - echo -e "3. Build the core image locally:" - echo -e " ./build_images.sh core omnia_branch=" - echo -e "4. 
After building the image, re-run this script:"
-            echo -e " ./omnia.sh --install"
+    # Detect existing Omnia 2.0 installation
+    if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then
+        # Read version from metadata inside container
+        current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r')
+        if [ "$current_version" = "2.0.0.0" ]; then
+            echo -e "${RED}ERROR: Existing Omnia 2.0 installation detected.${NC}"
+            echo -e "${YELLOW}To upgrade, run: $0 --upgrade${NC}"
+            echo -e "${YELLOW}For a fresh install, first run: $0 --uninstall${NC}"
             exit 1
         fi
     fi
 
+    local omnia_core_tag="2.1"
+    local omnia_core_registry=""
+
+    # Abort when the omnia_core image is missing locally; validate_container_image
+    # prints the build instructions and returns non-zero in that case.
+    validate_container_image "" "$omnia_core_tag" "install" || exit 1
+    echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}"
+
     # Check if any other containers with 'omnia' in their name are running
     other_containers=$(podman ps -a --format '{{.Names}}' | grep -E 'omnia' | grep -v 'omnia_core')
 
@@ -1039,7 +1294,7 @@ install_omnia_core() {
         # If the user wants to reinstall, call the remove_container function, and then call the setup_omnia_core function
         if [ "$choice" = "2" ]; then
             # Block if critical service containers exist
-            critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd')
+            critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$')
             if [ -n "$critical_running" ]; then
                 echo -e "${RED}Failed to intiatiate omnia_core container cleanup.
There are other critical service containers still running:${NC}" echo "$critical_running" @@ -1104,9 +1359,6 @@ install_omnia_core() { # If core container is not present else - - # Start the container setup - echo -e "${GREEN}Starting Omnia core container setup.${NC}" setup_omnia_core fi } @@ -1139,6 +1391,942 @@ display_version() { exit 0 } +phase1_validate() { + local current_image + local core_config + local previous_omnia_version + local shared_path + + echo "[INFO] [ORCHESTRATOR] Phase 1: Pre-Upgrade Validation" + + if [ "$(id -u)" -ne 0 ]; then + if ! sudo -n true >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: run as root or configure passwordless sudo" + return 1 + fi + fi + + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running" + display_cleanup_instructions + return 1 + fi + + core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml' 2>/dev/null) + if [ -z "$core_config" ]; then + echo "[ERROR] [ORCHESTRATOR] Unable to read oim_metadata.yml from omnia_core container" + return 1 + fi + + previous_omnia_version=$(echo "$core_config" | grep "^omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r') + if [ -z "$previous_omnia_version" ]; then + echo "[ERROR] [ORCHESTRATOR] omnia_version not found in oim_metadata.yml" + return 1 + fi + + shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -z "$shared_path" ]; then + echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml" + return 1 + fi + + omnia_path="$shared_path" + + if [ ! -d "$omnia_path" ]; then + echo "[ERROR] [ORCHESTRATOR] Shared path from metadata does not exist on host: $omnia_path" + return 1 + fi + + if [ ! 
-w "$omnia_path" ]; then + echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on shared path: $omnia_path" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed" + return 0 +} + +phase2_approval() { + local backup_base default_backup_dir current_omnia_version + + echo "[INFO] [ORCHESTRATOR] Phase 2: Approval Gate" + echo "============================================" + echo "OMNIA UPGRADE SUMMARY" + echo "============================================" + echo "Current Container Tag: $OMNIA_CORE_CONTAINER_TAG" + echo "Target Container Tag: $TARGET_CONTAINER_TAG" + echo "Current Omnia Release: $OMNIA_VERSION" + echo "Target Omnia Release: $TARGET_OMNIA_VERSION" + + # Show upgrade type + if [ "$OMNIA_CORE_CONTAINER_TAG" = "$TARGET_CONTAINER_TAG" ]; then + echo "Upgrade Type: Same-tag upgrade (container restart)" + else + echo "Upgrade Type: Cross-tag upgrade (container swap)" + fi + + echo "============================================" + + current_omnia_version=$(podman exec -u root omnia_core /bin/bash -c "grep '^omnia_version:' '$CONTAINER_METADATA_FILE' | cut -d':' -f2 | tr -d ' \t\n\r'" 2>/dev/null) + if [ -z "$current_omnia_version" ]; then + echo "[ERROR] [ORCHESTRATOR] Failed to read omnia_version from metadata inside container" + return 1 + fi + + default_backup_dir="$CONTAINER_BACKUPS_DIR/upgrade/version_${current_omnia_version}" + backup_base="$default_backup_dir" + + echo "[INFO] [ORCHESTRATOR] Backup destination (inside omnia_core container): $backup_base" + + if ! update_metadata_upgrade_backup_dir "$backup_base"; then + echo "[ERROR] [ORCHESTRATOR] Failed to update upgrade backup directory in metadata" + return 1 + fi + + read -p "Proceed with upgrade? 
(y/N): " confirm + if [ "$confirm" != "y" ] && [ "$confirm" != "Y" ]; then + echo "[INFO] [ORCHESTRATOR] Upgrade cancelled by user" + return 1 + fi + + OMNIA_UPGRADE_BACKUP_PATH="$backup_base" + export OMNIA_UPGRADE_BACKUP_PATH + + echo "[INFO] [ORCHESTRATOR] Phase 2: Approval granted" + return 0 +} + +phase3_backup_creation() { + local backup_base="$1" + + echo "[INFO] [ORCHESTRATOR] Phase 3: Backup Creation" + + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Cannot create backup because omnia_core is not running" + return 1 + fi + + if [ -z "$backup_base" ]; then + echo "[ERROR] [ORCHESTRATOR] Backup destination is empty" + return 1 + fi + + if ! podman exec -u root omnia_core bash -c " + set -e + rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs' + mkdir -p '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs' + + if [ -f '$CONTAINER_INPUT_DIR/default.yml' ]; then + cp -a '$CONTAINER_INPUT_DIR/default.yml' '${backup_base%/}/input/' + fi + + if [ -d '$CONTAINER_INPUT_DIR/project_default' ]; then + cp -a '$CONTAINER_INPUT_DIR/project_default' '${backup_base%/}/input/' + fi + + if [ ! -f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + cp -a '$CONTAINER_METADATA_FILE' '${backup_base%/}/metadata/oim_metadata.yml' + "; then + echo "[ERROR] [ORCHESTRATOR] Backup failed; cleaning up partial backup" + podman exec -u root omnia_core bash -c "rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'" >/dev/null 2>&1 || true + return 1 + fi + + if [ -f "/etc/containers/systemd/omnia_core.container" ]; then + if ! 
podman cp "/etc/containers/systemd/omnia_core.container" "omnia_core:${backup_base%/}/configs/omnia_core.container" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Failed to backup quadlet container file" + podman exec -u root omnia_core bash -c "rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'" >/dev/null 2>&1 || true + return 1 + fi + fi + + echo "[INFO] [ORCHESTRATOR] Backup created at: $backup_base" + echo "[INFO] [ORCHESTRATOR] Phase 3: Backup completed" + return 0 +} + +phase4_same_tag_upgrade() { + local target_version="$1" + + echo "[INFO] [ORCHESTRATOR] Phase 4: Same-Tag Upgrade" + echo "[INFO] [ORCHESTRATOR] Upgrading to $target_version within same container tag" + + # Verify container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Container is not running for same-tag upgrade" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Updating metadata to version $target_version" + + # Update version metadata + if ! podman exec -u root omnia_core bash -c " + set -e + if [ ! -f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE' + else + echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE' + fi + "; then + echo "[ERROR] [ORCHESTRATOR] Failed to update metadata version" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Restarting container to apply changes..." + + # Restart container to apply changes + if ! 
systemctl restart omnia_core.service; then + echo "[ERROR] [ORCHESTRATOR] Failed to restart container service" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Container restart failed" + return 1 + fi + + # Wait for container to be healthy after restart + echo "[INFO] [ORCHESTRATOR] Waiting for container health check after restart (30s)" + local health_timeout=30 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ORCHESTRATOR] Container is healthy after restart" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." + done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo "[ERROR] [ORCHESTRATOR] Container failed to become healthy within 30 seconds after restart" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Container health check failed" + return 1 + fi + + # Verify version update + local updated_version=$(get_current_omnia_version) + if [ "$updated_version" != "$target_version" ]; then + echo "[ERROR] [ORCHESTRATOR] Version update verification failed" + echo "[ERROR] [ORCHESTRATOR] Expected: $target_version, Found: $updated_version" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Same-tag upgrade completed successfully" + echo "[INFO] [ORCHESTRATOR] Version updated to: $target_version" + + show_post_upgrade_instructions "$target_version" + + return 0 +} + +phase4_container_swap() { + local quadlet_file="/etc/containers/systemd/omnia_core.container" + local i + + echo "[INFO] [ORCHESTRATOR] Phase 4: Container Swap" + + if [ ! -f "$quadlet_file" ]; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Quadlet configuration file missing" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." 
+ rollback_omnia_core + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Stopping omnia_core $OMNIA_CORE_CONTAINER_TAG container" + systemctl stop omnia_core.service >/dev/null 2>&1 || true + + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[WARN] [ORCHESTRATOR] omnia_core still running; forcing stop" + podman stop -t 30 omnia_core >/dev/null 2>&1 || true + fi + + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop $OMNIA_CORE_CONTAINER_TAG container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." + rollback_omnia_core + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Starting omnia_core $TARGET_CONTAINER_TAG Quadlet unit" + if ! podman inspect "omnia_core:$TARGET_CONTAINER_TAG" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:$TARGET_CONTAINER_TAG" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG image not available" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." + rollback_omnia_core + return 1 + fi + + if ! sed -i "s/^Image=omnia_core:.*/Image=omnia_core:$TARGET_CONTAINER_TAG/" "$quadlet_file"; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to $TARGET_CONTAINER_TAG in quadlet file" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." + rollback_omnia_core + return 1 + fi + + systemctl daemon-reload || { + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: System daemon reload failed" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." 
+ rollback_omnia_core + return 1 + } + + systemctl start omnia_core.service || { + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start $TARGET_CONTAINER_TAG container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." + rollback_omnia_core + return 1 + } + + echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core $TARGET_CONTAINER_TAG health check (60s)" + for i in $(seq 1 60); do + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + break + fi + sleep 1 + done + + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG container failed health check" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." + rollback_omnia_core + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to $TARGET_OMNIA_VERSION" + if ! podman exec -u root omnia_core bash -c " + set -e + if [ ! -f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: $TARGET_OMNIA_VERSION/' '$CONTAINER_METADATA_FILE' + else + echo 'omnia_version: $TARGET_OMNIA_VERSION' >> '$CONTAINER_METADATA_FILE' + fi + "; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." 
+ rollback_omnia_core + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Phase 4: Container swap completed" + return 0 +} + +upgrade_omnia_core() { + echo -e "${BLUE}=================== Omnia Core Upgrade ====================${NC}" + echo -e "${BLUE}This script will upgrade Omnia core container.${NC}" + echo -e "${BLUE}Current version will be backed up and upgraded to target version.${NC}" + echo -e "${BLUE}=============================================================${NC}" + + # Read current version + OMNIA_VERSION=$(get_current_omnia_version) + if [ -z "$OMNIA_VERSION" ]; then + echo -e "${RED}ERROR: Could not determine current Omnia version${NC}" + echo -e "${YELLOW}Please ensure omnia_core container is running and metadata is accessible${NC}" + exit 1 + fi + + # Get current container tag + OMNIA_CORE_CONTAINER_TAG=$(get_container_tag_from_version "$OMNIA_VERSION") + + echo -e "${GREEN}Current Omnia version: $OMNIA_VERSION${NC}" + echo -e "${GREEN}Current container tag: $OMNIA_CORE_CONTAINER_TAG${NC}" + + # Show available upgrade options + echo "" + echo "Available upgrade options:" + echo "=========================" + + # Get available upgrade versions dynamically + local upgrade_output + upgrade_output=$(get_available_upgrade_versions "$OMNIA_VERSION") + + # Parse output into versions and descriptions + local available_versions=() + local version_descriptions=() + local line_count=0 + local total_lines + + # Count total lines + total_lines=$(echo "$upgrade_output" | wc -l) + + # Split into versions and descriptions (first half = versions, second half = descriptions) + local mid_line=$((total_lines / 2)) + local line_num=0 + + while IFS= read -r line; do + line_num=$((line_num + 1)) + if [ $line_num -le $mid_line ]; then + available_versions+=("$line") + else + version_descriptions+=("$line") + fi + done <<< "$upgrade_output" + + # Check if any upgrade options are available + if [ ${#available_versions[@]} -eq 0 ]; then + echo -e "${GREEN}Already at latest version 
$OMNIA_VERSION${NC}" + echo "No upgrade options available." + exit 0 + fi + + # Display upgrade options + for i in "${!available_versions[@]}"; do + local target_version="${available_versions[$i]}" + local target_container_tag=$(get_container_tag_from_version "$target_version") + + # Check if target image exists locally + local image_status="✓ Available" + if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then + image_status="✗ Missing (build required)" + fi + + echo "$((i+1)). Upgrade to $target_version (container tag: $target_container_tag) [$image_status]" + done + + # Prompt user to select upgrade version + echo -n "Select upgrade option (1-${#available_versions[@]}) or press Enter to cancel: " + read -r selection + + # Validate selection + if [ -z "$selection" ]; then + echo "Upgrade cancelled by user." + exit 0 + fi + + if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#available_versions[@]} ]; then + echo -e "${RED}ERROR: Invalid selection.${NC}" + exit 1 + fi + + # Set target version based on user selection + TARGET_OMNIA_VERSION="${available_versions[$((selection-1))]}" + TARGET_CONTAINER_TAG=$(get_container_tag_from_version "$TARGET_OMNIA_VERSION") + + # Pre-validation: Check if target container image exists locally + if ! 
validate_container_image "$TARGET_OMNIA_VERSION" "$TARGET_CONTAINER_TAG" "upgrade"; then + exit 1 + fi + + echo -e "${GREEN}Target Omnia version: $TARGET_OMNIA_VERSION${NC}" + echo -e "${GREEN}Target container tag: $TARGET_CONTAINER_TAG${NC}" + + # Check if container tag change is needed + if [ "$OMNIA_CORE_CONTAINER_TAG" = "$TARGET_CONTAINER_TAG" ]; then + echo -e "${BLUE}Upgrade within same container tag ($TARGET_CONTAINER_TAG)${NC}" + echo -e "${BLUE}Will restart container instead of swapping${NC}" + SAME_TAG_UPGRADE=true + else + echo -e "${BLUE}Container tag change required ($OMNIA_CORE_CONTAINER_TAG -> $TARGET_CONTAINER_TAG)${NC}" + echo -e "${BLUE}Will perform full container swap${NC}" + SAME_TAG_UPGRADE=false + fi + + # Pre-validation: Check if target container image exists locally + if ! validate_container_image "$TARGET_OMNIA_VERSION" "$TARGET_CONTAINER_TAG" "upgrade"; then + exit 1 + fi + local lock_file="/tmp/omnia_upgrade.lock" + if [ -f "$lock_file" ]; then + echo -e "${RED}ERROR: Another upgrade process is already running${NC}" + echo -e "${YELLOW}If this is incorrect, remove the lock file: rm -f $lock_file${NC}" + exit 1 + fi + touch "$lock_file" + trap 'rm -f "$lock_file"' EXIT + + # Create upgrade guard lock in shared path so other playbooks can block during upgrade + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + + mkdir -p "$(dirname "$upgrade_guard_lock_path")" 2>/dev/null || true + echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." > "$upgrade_guard_lock_path" || { + echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_path${NC}" + exit 1 + } + + # Run upgrade phases + if ! phase1_validate; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" + exit 1 + fi + + if ! 
phase2_approval; then + exit 0 + fi + + local backup_base="$OMNIA_UPGRADE_BACKUP_PATH" + if [ -z "$backup_base" ]; then + echo "[ERROR] [ORCHESTRATOR] Backup path is empty" + exit 1 + fi + + if ! phase3_backup_creation "$backup_base"; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 3" + exit 1 + fi + + # Choose upgrade path based on container tag + if [ "$SAME_TAG_UPGRADE" = "true" ]; then + if ! phase4_same_tag_upgrade "$TARGET_OMNIA_VERSION"; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in same-tag upgrade" + exit 1 + fi + else + if ! phase4_container_swap; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4" + exit 1 + fi + fi + + echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" + echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base" + + # Seed inputs and defaults after upgrade + post_setup_config + + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" + # Initialize SSH config and start container session + init_ssh_config + start_container_session + exit 0 +} + +# Validate backup directory structure and files +validate_backup_directory() { + local backup_path="$1" + + echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path" + + # Check if backup directory exists + if ! podman exec -u root omnia_core test -d "$backup_path"; then + echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path" + return 1 + fi + + # Check for required subdirectories + for subdir in input metadata configs; do + if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then + echo "[ERROR] [ROLLBACK] Missing required subdirectory: $backup_path/$subdir" + return 1 + fi + done + + # Check for required files + if ! podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then + echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml" + return 1 + fi + + if ! 
podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then + echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container" + return 1 + fi + + # Verify metadata contains version information + if ! podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then + echo "[ERROR] [ROLLBACK] Metadata file does not contain version information" + return 1 + fi + + echo "[INFO] [ROLLBACK] Backup validation successful" + return 0 +} + +# Stop container gracefully with timeout +stop_container_gracefully() { + local container_name="$1" + local timeout="${2:-30}" + + echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..." + + # Try graceful stop first + if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then + echo "[INFO] [ROLLBACK] Container stopped gracefully" + return 0 + fi + + # Check if container is still running + if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then + echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..." + if podman stop "$container_name" >/dev/null 2>&1; then + echo "[INFO] [ROLLBACK] Container force stopped" + return 0 + else + echo "[ERROR] [ROLLBACK] Failed to stop container" + return 1 + fi + fi + + return 0 +} + +# Restore files from backup +restore_from_backup() { + local backup_path="$1" + + echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path" + + # Restore input files + if ! podman exec -u root omnia_core bash -c " + set -e + rm -rf /opt/omnia/input + cp -a '$backup_path/input' /opt/omnia/ + "; then + echo "[ERROR] [ROLLBACK] Failed to restore input files" + return 1 + fi + + # Restore metadata + if ! podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then + echo "[ERROR] [ROLLBACK] Failed to restore metadata" + return 1 + fi + + # Restore container config on host + if ! 
podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then + echo "[ERROR] [ROLLBACK] Failed to restore container config" + return 1 + fi + + echo "[INFO] [ROLLBACK] Files restored successfully" + return 0 +} + +# Display cleanup instructions for failed upgrade/rollback +display_cleanup_instructions() { + echo "" + echo -e "${RED}================================================================================${NC}" + echo -e "${RED} UPGRADE/ROLLBACK FAILED${NC}" + echo -e "${RED}================================================================================${NC}" + echo "" + echo -e "${YELLOW}Operation failed. Manual cleanup is required to restore a clean state before retrying.${NC}" + echo "" + echo -e "${BLUE}Choose the appropriate cleanup scenario:${NC}" + echo "" + echo -e "${GREEN}CASE 1: If you can log into omnia_core container:${NC}" + echo -e "${YELLOW}1. Enter omnia_core container: podman exec -it omnia_core bash${NC}" + echo -e "${YELLOW}2. Run oim cleanup: ansible-playbook /omnia/oim_cleanup.yml${NC}" + echo -e "${YELLOW}3. Run uninstall inside container: ./omnia.sh --uninstall${NC}" + echo -e "${YELLOW}4. Exit container: exit${NC}" + echo -e "${YELLOW}5. Clean shared path: rm -rf ${NC}" + echo -e "${YELLOW}6. Install required version: ./omnia.sh --install${NC}" + echo "" + echo -e "${GREEN}CASE 2: If you cannot log into omnia_core container (but other containers are running):${NC}" + echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}" + echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}" + echo -e "${YELLOW}3. Reload systemd daemon: systemctl daemon-reload${NC}" + echo -e "${YELLOW}4. Stop all containers: podman stop $(podman ps -aq)${NC}" + echo -e "${YELLOW}5. Remove all containers: podman rm -f $(podman ps -aq)${NC}" + echo -e "${YELLOW}6. Clean shared path: rm -rf ${NC}" + echo -e "${YELLOW}7. 
Install required version: ./omnia.sh --install${NC}" + echo "" + echo -e "${BLUE}Note: Replace with your actual Omnia shared path.${NC}" + echo "" +} + +rollback_omnia_core() { + echo -e "${GREEN}================================================================================${NC}" + echo -e "${GREEN} OMNIA CORE ROLLBACK${NC}" + echo -e "${GREEN}================================================================================${NC}" + echo "" + + # Audit log start + local rollback_start=$(date -Iseconds) + echo "[AUDIT] Rollback operation started at: $rollback_start" + + # Check if omnia_core container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo -e "${RED}ERROR: Omnia core container is not running.${NC}" + exit 1 + fi + + # Create lock file to prevent concurrent rollbacks + local lock_file="/tmp/omnia_rollback.lock" + if [ -f "$lock_file" ]; then + local existing_pid + existing_pid=$(cat "$lock_file" 2>/dev/null | tr -d ' \t\n\r') + + if [ -n "$existing_pid" ] && kill -0 "$existing_pid" >/dev/null 2>&1; then + echo -e "${RED}ERROR: Another rollback process is already running (PID: $existing_pid)${NC}" + echo -e "${YELLOW}If this is incorrect, remove the lock file: rm -f $lock_file${NC}" + exit 1 + fi + + if [ -n "$existing_pid" ]; then + echo -e "${YELLOW}[WARN] Stale rollback lock file found (PID: $existing_pid); removing: $lock_file${NC}" + fi + rm -f "$lock_file" >/dev/null 2>&1 || true + fi + + echo "$$" > "$lock_file" + trap 'rm -f "$lock_file"' EXIT INT TERM + + # Get current version + if ! 
podman exec -u root omnia_core test -f "/opt/omnia/.data/oim_metadata.yml"; then + echo -e "${RED}ERROR: Metadata file not found: /opt/omnia/.data/oim_metadata.yml${NC}" + exit 1 + fi + + local current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') + + # Get available rollback versions dynamically + local rollback_versions + rollback_versions=$(get_available_rollback_versions "$current_version") + + # Convert to array + local available_versions=() + while IFS= read -r line; do + available_versions+=("$line") + done <<< "$rollback_versions" + + # Check if any rollback options are available + if [ ${#available_versions[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No rollback versions available from $current_version.${NC}" + exit 1 + fi + + echo "" + echo "Available rollback versions:" + echo "===========================" + for i in "${!available_versions[@]}"; do + local version="${available_versions[$i]}" + local container_tag=$(get_container_tag_from_version "$version") + + # Check if target image exists locally + local image_status="✓ Available" + if ! podman inspect "omnia_core:$container_tag" >/dev/null 2>&1; then + image_status="✗ Missing (build required)" + fi + + echo " $((i+1)). Rollback to version $version (container tag: $container_tag) [$image_status]" + done + + # Prompt for rollback selection + echo "" + echo -n "Select rollback version (1-${#available_versions[@]}): " + read -r selection + + # Validate selection + if ! 
[[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#available_versions[@]} ]; then + echo -e "${RED}ERROR: Invalid selection.${NC}" + exit 1 + fi + + local selected_version="${available_versions[$((selection-1))]}" + local selected_container_tag=$(get_container_tag_from_version "$selected_version") + + echo "" + echo "Selected rollback: Version $selected_version" + echo -n "Are you sure you want to rollback to version $selected_version? [y/N]: " + read -r confirm + + if [[ ! "$confirm" =~ ^[yY] ]]; then + echo "Rollback cancelled by user." + exit 0 + fi + + # Pre-validation: Check if target container image exists locally + if ! validate_container_image "$selected_version" "$selected_container_tag" "rollback"; then + exit 1 + fi + + # Check if container tag change is needed + local current_container_tag=$(get_container_tag_from_version "$current_version") + if [ "$current_container_tag" = "$selected_container_tag" ]; then + echo -e "${BLUE}Rollback within same container tag ($selected_container_tag)${NC}" + echo -e "${BLUE}Will restart container instead of swapping${NC}" + + # Perform same-tag rollback (container restart only) + if ! rollback_same_tag "$selected_version" "$current_version"; then + echo "[ERROR] [ROLLBACK] Rollback failed in same-tag rollback" + exit 1 + fi + + echo "[INFO] [ROLLBACK] Rollback completed successfully" + echo "[INFO] [ROLLBACK] Version rolled back to: $selected_version" + exit 0 + else + echo -e "${BLUE}Container tag change required ($current_container_tag -> $selected_container_tag)${NC}" + echo -e "${BLUE}Will perform full container swap${NC}" + # Continue with existing container swap logic + fi + + # List available backups for selected version + echo "[INFO] [ROLLBACK] Scanning for available backups for version $selected_version..." 
+ local backup_dirs=() + while IFS= read -r line; do + backup_dirs+=("$line") + done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_${selected_version}*" 2>/dev/null | sort -r) + + if [ ${#backup_dirs[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No backup directories found for version $selected_version.${NC}" + exit 1 + fi + + echo "" + echo "Available backups for version $selected_version:" + for i in "${!backup_dirs[@]}"; do + local backup_path="${backup_dirs[$i]}" + local backup_date=$(podman exec -u root omnia_core stat -c '%y' "$backup_path" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) + echo " $((i+1)). Backup created: $backup_date" + done + + # Prompt for backup selection + echo "" + echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " + read -r backup_selection + + # Validate backup selection + if ! [[ "$backup_selection" =~ ^[0-9]+$ ]] || [ "$backup_selection" -lt 1 ] || [ "$backup_selection" -gt ${#backup_dirs[@]} ]; then + echo -e "${RED}ERROR: Invalid backup selection.${NC}" + exit 1 + fi + + local selected_backup="${backup_dirs[$((backup_selection-1))]}" + + # Validate selected backup exists + if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then + echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}" + exit 1 + fi + + echo "" + echo "[INFO] [ROLLBACK] Starting rollback process..." + + # Step 1: Stop current container gracefully + echo "" + echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core $current_container_tag container..." + if ! stop_container_gracefully "omnia_core" 30; then + echo -e "${RED}ERROR: Failed to stop container.${NC}" + display_cleanup_instructions + exit 1 + fi + + # Step 2: Update Quadlet file to use target container tag + echo "" + echo "[INFO] [ROLLBACK] Step 2: Updating Quadlet file to use container tag $selected_container_tag..." + local quadlet_file="/etc/containers/systemd/omnia_core.container" + + if ! 
sed -i "s/^Image=omnia_core:.*/Image=omnia_core:$selected_container_tag/" "$quadlet_file"; then + echo -e "${RED}ERROR: Failed to update Image to $selected_container_tag in quadlet file${NC}" + display_cleanup_instructions + exit 1 + fi + + echo "[INFO] [ROLLBACK] Quadlet file updated to use omnia_core:$selected_container_tag" + + # Step 3: Start target container + echo "" + echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core $selected_container_tag container..." + systemctl daemon-reload + if ! systemctl start omnia_core.service; then + echo -e "${RED}ERROR: Failed to start container service.${NC}" + display_cleanup_instructions + exit 1 + fi + + # Step 4: Wait for container to be healthy + echo "" + echo "[INFO] [ROLLBACK] Step 4: Waiting for container to be healthy..." + local health_timeout=60 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ROLLBACK] Container is healthy" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." + done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo -e "${RED}ERROR: Container failed to become healthy within 60 seconds.${NC}" + display_cleanup_instructions + exit 1 + fi + + # Step 5: Validate backup directory structure + echo "" + echo "[INFO] [ROLLBACK] Step 5: Validating backup directory structure..." + if ! validate_backup_directory "$selected_backup"; then + echo -e "${RED}ERROR: Backup validation failed.${NC}" + display_cleanup_instructions + exit 1 + fi + + # Step 6: Restore files from backup + echo "" + echo "[INFO] [ROLLBACK] Step 6: Restoring files from backup..." + if ! restore_from_backup "$selected_backup"; then + echo -e "${RED}ERROR: Failed to restore from backup.${NC}" + display_cleanup_instructions + exit 1 + fi + + # Step 7: Verify container version + echo "" + echo "[INFO] [ROLLBACK] Step 7: Verifying container version..." 
+ local verify_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') + + if [ "$verify_version" != "$selected_version" ]; then + echo -e "${RED}ERROR: Version verification failed. Expected: $selected_version, Found: $verify_version${NC}" + display_cleanup_instructions + exit 1 + fi + + # Audit log end + local rollback_end=$(date -Iseconds) + echo "[AUDIT] Rollback operation completed at: $rollback_end" + echo "[AUDIT] Rolled back from version $current_version to $selected_version" + + echo "" + echo -e "${GREEN}================================================================================${NC}" + echo -e "${GREEN} ROLLBACK COMPLETED SUCCESSFULLY${NC}" + echo -e "${GREEN}================================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Omnia core has been rolled back to version $selected_version${NC}" + echo -e "${GREEN}✓ Container is running and healthy${NC}" + echo -e "${GREEN}✓ Configuration restored from backup${NC}" + echo "" + + # Clean up lock file before starting long-running ssh session + rm -f "$lock_file" >/dev/null 2>&1 || true + echo "[INFO] Rollback lock file removed before starting container session" + + # Clear upgrade guard lock if it exists (shared path visible to container and host) + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [ROLLBACK] Cleared upgrade guard lock: $upgrade_guard_lock_path" + + # Initialize SSH config and start container session + init_ssh_config + start_container_session +} + # Main function to check if omnia_core container is already running. # If yes, ask the user if they want to enter the container or reinstall. # If no, set it up. 
@@ -1150,6 +2338,12 @@ main() { --uninstall|-u) cleanup_omnia_core ;; + --upgrade) + upgrade_omnia_core + ;; + --rollback) + rollback_omnia_core + ;; --version|-v) display_version ;; @@ -1165,4 +2359,4 @@ main() { } # Call the main function -main "$1" +main "$1" \ No newline at end of file diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index 49bead531f..f5ea607994 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local @@ -63,6 +66,11 @@ name: deploy_containers/common tasks_from: add_known_hosts.yml + - name: Download aarch64 prerequisites # noqa:role-name[path] + ansible.builtin.include_role: + name: deploy_containers/common + tasks_from: aarch64_prereq.yml + - name: OpenLDAP Pre_req generate ssha password hosts: localhost connection: local @@ -97,6 +105,17 @@ name: deploy_containers/openchami # noqa:role-name[path] tasks_from: verify_openchami.yml +- name: OpenCHAMI deployment prereq + hosts: oim + connection: ssh + gather_facts: false + tags: openchami + tasks: + - name: Pull OpenCHAMI images + ansible.builtin.include_role: + name: deploy_containers/openchami # noqa:role-name[path] + tasks_from: deployment_prereq.yml + - name: Deploy the openchami container hosts: localhost connection: local @@ -145,6 +164,11 @@ name: deploy_containers/common tasks_from: omnia_service.yml + - name: Install required packages # noqa:role-name[path] + ansible.builtin.include_role: + name: deploy_containers/common + tasks_from: package_installation.yml + - name: Prepare oim completion hosts: localhost connection: local diff --git a/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml b/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml new file mode 100644 index 
0000000000..f5eae768bb --- /dev/null +++ b/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml @@ -0,0 +1,26 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Create openchami aarch64 directory if not exists + ansible.builtin.file: + path: "{{ ochami_aarch64_dir }}" + state: directory + mode: "{{ dir_permissions_755 }}" + +- name: Download regctl binary (aarch64) + ansible.builtin.get_url: + url: "{{ regctl_aarch64_url }}" + dest: "{{ ochami_aarch64_dir }}/regctl" + mode: "{{ dir_permissions_755 }}" diff --git a/prepare_oim/roles/deploy_containers/common/tasks/main.yml b/prepare_oim/roles/deploy_containers/common/tasks/main.yml index 78c28e98ba..00287c628c 100644 --- a/prepare_oim/roles/deploy_containers/common/tasks/main.yml +++ b/prepare_oim/roles/deploy_containers/common/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml b/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml new file mode 100644 index 0000000000..1d84877307 --- /dev/null +++ b/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml @@ -0,0 +1,29 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Install required packages + block: + - name: Install required packages + ansible.builtin.package: + name: "{{ item }}" + state: present + loop: "{{ oim_packages }}" + register: oim_pkg_result + rescue: + - name: Fail if required package installation fails + ansible.builtin.fail: + msg: >- + {{ prepare_oim_pkg_fail_msg.splitlines() | join(' ') }} + Failed package(s): {{ oim_pkg_result.results | selectattr('failed', 'defined') | selectattr('failed') | map(attribute='item') | list | join(', ') }} + Error: {{ (oim_pkg_result.results | selectattr('failed', 'defined') | selectattr('failed') | map(attribute='msg') | list | first) | default('') }} diff --git a/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml b/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml index 7c86cfaf6b..52e4009219 100644 --- a/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml +++ b/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml @@ -1,4 +1,4 @@ -# 
Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,6 +32,24 @@ mode: "{{ file_permissions }}" when: not bmc_group_data_status.stat.exists +- name: Clone iDRAC Telemetry Scripting repository + block: + - name: Checkout iDRAC Telemetry GitHub repo + ansible.builtin.git: + repo: "{{ idrac_telemetry_scripting_repo }}" + dest: "{{ idrac_telemetry_scripting_clone_dest }}" + version: "{{ idrac_telemetry_scripting_stable_commit }}" + update: false + register: clone_idrac_script + until: clone_idrac_script is succeeded + retries: "{{ max_retries }}" + delay: "{{ delay_count }}" + rescue: + - name: Fail if iDRAC telemetry Git clone fails + ansible.builtin.fail: + msg: "{{ idrac_script_git_clone_fail_msg.splitlines() | join(' ') }}" + when: clone_idrac_script is failed + - name: Prepare oim completion ansible.builtin.debug: msg: "{{ prepare_oim_completion_msg.splitlines() | join(' ') }}" diff --git a/prepare_oim/roles/deploy_containers/common/vars/main.yml b/prepare_oim/roles/deploy_containers/common/vars/main.yml index 30bb7b8125..855e7350b1 100644 --- a/prepare_oim/roles/deploy_containers/common/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/common/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,12 +28,34 @@ internal_nfs_services: ntp_firewall_service: ntp +# Packages required on OIM +oim_packages: + - nfs-utils + - nfs4-acl-tools + - git + - make +prepare_oim_pkg_fail_msg: | + Failed to install required packages. 
Please ensure the repository is + configured on OIM and rerun the playbook. + # Usage: prepare_oim_completion.yml telemetry_dir: "/opt/omnia/telemetry" dir_permissions_755: "0755" bmc_group_data_filename: "{{ telemetry_dir }}/bmc_group_data.csv" bmc_group_data_template: "bmc_group_data.j2" file_permissions: "0644" +idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git" +idrac_telemetry_scripting_stable_commit: "f6999f5" +idrac_telemetry_scripting_clone_dest: "{{ telemetry_dir }}/iDRAC-Telemetry-Scripting" +max_retries: 10 +delay_count: 5 +git_install_timeout: 300 +git_install_fail_msg: | + Failed to install git. Please ensure the OS repository is configured on OIM. + Configure the repository and rerun the playbook. +idrac_script_git_clone_fail_msg: | + Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_scripting_repo }} + to {{ idrac_telemetry_scripting_clone_dest }}. Please check network connectivity and rerun the playbook. prepare_oim_completion_msg: | The playbook prepare_oim.yml has completed successfully. To create the offline repositories and registry for the cluster nodes, please execute the playbook local_repo/local_repo.yml as the next step. @@ -58,3 +80,7 @@ network_services: # Usage: configure_chrony.yml chrony_conf_path: "/etc/chrony.conf" chrony_no_sources_msg: "No chrony sources are reachable. Please give a valid NTP server configuration in network_spec.yml and re-run prepare_oim playbook." 
+ +# Usage: aarch64_prereq.yml +ochami_aarch64_dir: "/opt/omnia/openchami/aarch64" +regctl_aarch64_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml new file mode 100644 index 0000000000..1558152a50 --- /dev/null +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml @@ -0,0 +1,41 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if OpenCHAMI images already exist + ansible.builtin.command: + cmd: "podman image exists {{ item }}" + loop: "{{ openchami_images }}" + register: openchami_image_exists + changed_when: false + failed_when: false + +- name: Pull OpenCHAMI images using Podman when missing + ansible.builtin.command: + cmd: "podman pull {{ item.item }}" + loop: "{{ openchami_image_exists.results }}" + loop_control: + label: "{{ item.item }}" + register: pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: pull_result.rc == 0 + changed_when: false + when: item.rc != 0 + +- name: Fail if any OpenCHAMI image pull failed + ansible.builtin.fail: + msg: "Failed to pull OpenCHAMI image: {{ item.item }}. 
Error: {{ item.stderr }}" + loop: "{{ pull_result.results | default([]) }}" + when: item.rc is defined and item.rc != 0 diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 6d0848e0af..2d7db2ca85 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -36,5 +36,41 @@ data_oci_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/oci" data_s3_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/s3" s3_work_dir: "{{ oim_shared_path }}/omnia/openchami/s3" +# Usage: deploy_openchami.yml - pull openchami images +pull_image_retries: 5 +pull_image_delay: 10 + +# OpenCHAMI image tags +openchami_local_ca_tag: "v0.2.2" +openchami_opaal_tag: "v0.3.10" +openchami_smd_tag: "v2.18.0" +openchami_bss_tag: "v1.32.0" +openchami_cloud_init_tag: "v1.2.3" +openchami_coredhcp_tag: "v0.3.0" +# Third-party image tags for OpenCHAMI +minio_tag: "latest" +postgres_tag: "11.5-alpine" +hydra_tag: "v2.3" +haproxy_tag: "latest" +registry_tag: "latest" +curl_tag: "latest" +acme_tag: "3.1.1" + +# OpenCHAMI images list for podman pull on OIM +openchami_images: + - "ghcr.io/openchami/local-ca:{{ openchami_local_ca_tag }}" + - "ghcr.io/openchami/opaal:{{ openchami_opaal_tag }}" + - "ghcr.io/openchami/smd:{{ openchami_smd_tag }}" + - "ghcr.io/openchami/bss:{{ openchami_bss_tag }}" + - "ghcr.io/openchami/cloud-init:{{ openchami_cloud_init_tag }}" + - "ghcr.io/openchami/coredhcp:{{ openchami_coredhcp_tag }}" + - "docker.io/minio/minio:{{ minio_tag }}" + - "docker.io/library/postgres:{{ postgres_tag }}" + - "docker.io/oryd/hydra:{{ hydra_tag }}" + - "cgr.dev/chainguard/haproxy:{{ haproxy_tag }}" + - "docker.io/library/registry:{{ registry_tag }}" + - "cgr.dev/chainguard/curl:{{ curl_tag }}" + - "docker.io/neilpang/acme.sh:{{ acme_tag }}" + # Usage: verify_openchami.yml cluster_env_key: "{{ oim_node_name | upper }}_ACCESS_TOKEN" diff 
--git a/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml b/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml index 4ae77823a0..dc143b03c5 100644 --- a/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml +++ b/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml @@ -38,6 +38,31 @@ when: hostname_enabled no_log: true +- name: Check if Pulp image already exists + ansible.builtin.command: + cmd: "podman image exists {{ pulp_image }}" + register: pulp_image_exists + changed_when: false + failed_when: false + +- name: Pull Pulp image using Podman when missing + ansible.builtin.command: + cmd: "podman pull {{ pulp_image }}" + register: pulp_pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: pulp_pull_result is not failed + changed_when: false + when: pulp_image_exists.rc != 0 + +- name: Fail if Pulp image pull failed + ansible.builtin.fail: + msg: "Failed to pull Pulp image: {{ pulp_image }}. 
Error: {{ pulp_pull_result.stderr }}" + when: + - pulp_image_exists.rc != 0 + - pulp_pull_result.rc is defined + - pulp_pull_result.rc != 0 + - name: Invoke Pulp Container Deployment Tasks for HTTP ansible.builtin.include_tasks: deploy_pulp_container_http.yml when: not pulp_protocol_https diff --git a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml index 5613c13055..26dbec2dae 100644 --- a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml @@ -27,6 +27,10 @@ pulp_protocol_https: true # Tag is fixed for the Pulp container image as of 10-06-2025 pulp_image: "docker.io/pulp/pulp:3.80" +# Usage: deployment_prereq.yml - pull image retries +pull_image_retries: 5 +pull_image_delay: 10 + arg_list: - "-e PULP_WORKERS=10" - "-e PULP_API_WORKERS=10" diff --git a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml index 8615897205..7078a2f056 100644 --- a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml +++ b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -87,6 +87,9 @@ ansible.builtin.debug: msg: "Filtered BMC IPs: {{ filtered_bmc_ip_list }}" +- name: Remove deleted nodes from telemetry (nodes not in bmc_data.csv) + ansible.builtin.include_tasks: remove_deleted_nodes.yml + - name: Convert filtered_bmc_ip_list to a dictionary with bmc_ip ansible.builtin.set_fact: filtered_bmc_ip_dict_list: "{{ filtered_bmc_ip_list | map('community.general.dict_kv', 'bmc_ip') | list }}" diff --git a/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml b/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml new file mode 100644 index 0000000000..4c82abf9e1 --- /dev/null +++ b/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml @@ -0,0 +1,101 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Identify iDRAC IPs to remove (present in DB but not in bmc_data.csv) + ansible.builtin.set_fact: + ips_to_remove: "{{ db_idrac_ip_list | difference(bmc_ips) }}" + +- name: Show iDRAC IPs to be removed + ansible.builtin.debug: + msg: "iDRAC IPs to be removed: {{ ips_to_remove }}" + when: ips_to_remove | length > 0 + +- name: Skip removal if no IPs to remove + ansible.builtin.debug: + msg: "{{ no_idracips_to_remove_msg }}" + when: ips_to_remove | length == 0 + +- name: Disable telemetry on iDRAC nodes before removal + when: ips_to_remove | length > 0 + block: + - name: Disable telemetry service on iDRAC nodes + disable_idrac_telemetry: + idrac_ips: "{{ ips_to_remove }}" + username: "{{ hostvars['localhost']['bmc_username'] }}" + password: "{{ hostvars['localhost']['bmc_password'] }}" + timeout: "{{ redfish_timeout }}" + register: disable_telemetry_result + ignore_errors: true + + - name: Show successfully disabled telemetry IPs + ansible.builtin.debug: + msg: "Successfully disabled telemetry on: {{ disable_telemetry_result.disabled_ips | default([]) }}" + when: + - disable_telemetry_result.disabled_ips is defined + - disable_telemetry_result.disabled_ips | length > 0 + + - name: Show failed to disable telemetry IPs + ansible.builtin.debug: + msg: "Failed to disable telemetry on: {{ disable_telemetry_result.failed_ips | default([]) }}" + when: + - disable_telemetry_result.failed_ips is defined + - disable_telemetry_result.failed_ips | length > 0 + +- name: Remove iDRAC IPs from MySQL database + when: ips_to_remove | length > 0 + block: + - name: Delete iDRAC IPs from mysqldb + delete_idracips_from_mysqldb: + telemetry_namespace: "{{ telemetry_namespace }}" + idrac_podnames: "{{ idrac_podname_idracips.idrac_podname_ips.keys() | list }}" + mysqldb_k8s_name: "{{ mysqldb_k8s_name }}" + mysqldb_name: "{{ mysqldb_name }}" + mysqldb_user: "{{ hostvars['localhost']['mysqldb_user'] }}" + mysqldb_password: "{{ hostvars['localhost']['mysqldb_password'] }}" + 
ips_to_delete: "{{ ips_to_remove }}" + pod_to_db_idrac_ips: "{{ existing_pod_to_db_idrac_ips }}" + db_retries: "{{ db_retries }}" + db_delay: "{{ db_delay }}" + register: delete_idrac_result + rescue: + - name: Failed to delete iDRAC IPs from mysqldb + ansible.builtin.fail: + msg: "{{ mysqldb_delete_fail_msg }}" + +- name: Show deleted iDRAC IPs + ansible.builtin.debug: + msg: "Successfully deleted iDRAC IPs from mysqldb: {{ delete_idrac_result.deleted_ips | default([]) }}" + when: + - ips_to_remove | length > 0 + - delete_idrac_result.deleted_ips is defined + - delete_idrac_result.deleted_ips | length > 0 + +- name: Show failed to delete iDRAC IPs + ansible.builtin.debug: + msg: "Failed to delete iDRAC IPs from mysqldb: {{ delete_idrac_result.failed_ips | default([]) }}" + when: + - ips_to_remove | length > 0 + - delete_idrac_result.failed_ips is defined + - delete_idrac_result.failed_ips | length > 0 + +- name: Update telemetry report variables with deletion info + ansible.builtin.set_fact: + deleted_idrac_count: "{{ delete_idrac_result.deleted_ips | default([]) | length }}" + deleted_idrac_ips: "{{ delete_idrac_result.deleted_ips | default([]) }}" + failed_delete_count: "{{ delete_idrac_result.failed_ips | default([]) | length }}" + failed_delete_ips: "{{ delete_idrac_result.failed_ips | default([]) }}" + disabled_telemetry_count: "{{ disable_telemetry_result.disabled_ips | default([]) | length }}" + disabled_telemetry_ips: "{{ disable_telemetry_result.disabled_ips | default([]) }}" + when: ips_to_remove | length > 0 diff --git a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 index 4d8554cab3..54986f418f 100644 --- a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 +++ b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 @@ -2,9 +2,9 @@ ----- Telemetry Report for Cluster ----- -Total IP count with Telemetry activated: {{ (db_idrac_ip_list | length) + 
(telemetry_idrac | length) }} +Total IP count with Telemetry activated: {{ ((db_idrac_ip_list + telemetry_idrac) | difference(deleted_idrac_ips | default([]))) | length }} Telemetry activated IPs List: -{% for item in db_idrac_ip_list + telemetry_idrac %} +{% for item in (db_idrac_ip_list + telemetry_idrac) | difference(deleted_idrac_ips | default([])) %} - {{ item }} {% endfor %} @@ -14,5 +14,23 @@ Telemetry not supported IPs List: - {{ item }} {% endfor %} +{% if deleted_idrac_count is defined and deleted_idrac_count | int > 0 %} +----- Node Deletion Report ----- + +Total IP count removed from telemetry (not in bmc_data.csv): {{ deleted_idrac_count | int }} +Removed IPs List: +{% for item in deleted_idrac_ips %} + - {{ item }} +{% endfor %} + +{% if disabled_telemetry_count is defined and disabled_telemetry_count | int > 0 %} +IPs with telemetry disabled via Redfish: {{ disabled_telemetry_count | int }} +Disabled telemetry IPs List: +{% for item in disabled_telemetry_ips %} + - {{ item }} +{% endfor %} +{% endif %} +{% endif %} + ===== Telemetry Report End ===== diff --git a/telemetry/roles/idrac_telemetry/vars/main.yml b/telemetry/roles/idrac_telemetry/vars/main.yml index d2696f4ac8..7fe6730789 100644 --- a/telemetry/roles/idrac_telemetry/vars/main.yml +++ b/telemetry/roles/idrac_telemetry/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -67,6 +67,13 @@ idrac_telemetry_statefulset_restart_failure_msg: | Failed to restart the {{ idrac_telemetry_k8s_name }} StatefulSet. Please check the logs using the command kubectl logs -n {{ telemetry_namespace }} {{ idrac_telemetry_k8s_name }}- and try again. 
+# Usage: remove_deleted_nodes.yml +redfish_timeout: 30 +mysqldb_delete_fail_msg: | + Failed to delete iDRAC IPs from the mysql database. + This could be due to the tables in the mysqldb not being accessible at the moment. Please try running the playbook again after some time. +no_idracips_to_remove_msg: "No iDRAC IPs to remove. All DB entries are present in bmc_data.csv." + # Usage: create_telemetry_report.yml telemetry_report_path: "/opt/omnia/telemetry/idrac_telemetry_report.yml" telemetry_report_template: "telemetry_report.j2" @@ -75,6 +82,9 @@ telemetry_report: | IP count with Telemetry not supported: {{ failed_idrac_count | int + invalid_idrac_count | int }} IP count with Telemetry activated in current execution: {{ telemetry_idrac_count | int }} + {% if deleted_idrac_count is defined %} + IP count removed from telemetry (not in bmc_data.csv): {{ deleted_idrac_count | int }} + {% endif %} {% if (failed_idrac_count | int + invalid_idrac_count | int) > 0 %} Potential reasons for telemetry not being initiated include Redfish connectivity problems, timeout issues, @@ -105,3 +115,15 @@ telemetry_report: | - {{ item }} {% endfor %} {% endif %} + {% if deleted_idrac_ips is defined and deleted_idrac_ips | length > 0 %} + IPs removed from telemetry database (not present in bmc_data.csv): + {% for item in deleted_idrac_ips %} + - {{ item }} + {% endfor %} + {% endif %} + {% if disabled_telemetry_ips is defined and disabled_telemetry_ips | length > 0 %} + IPs with telemetry disabled via Redfish: + {% for item in disabled_telemetry_ips %} + - {{ item }} + {% endfor %} + {% endif %} diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml new file mode 100644 index 0000000000..444869291b --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -0,0 +1,49 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Display collected warnings + ansible.builtin.debug: + msg: | + ================================= + UPGRADE WARNINGS SUMMARY + ================================= + + {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected. + You will now be shown the detailed list. + when: + - upgrade_warnings is defined + - upgrade_warnings | length > 0 + + +- name: Pause for user to review warnings + ansible.builtin.pause: + seconds: 30 + prompt: | + ╔════════════════════════════════════════════╗ + ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ + ╚════════════════════════════════════════════╝ + + {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected: + + {% for warning in upgrade_warnings %} + {{ loop.index }}. {{ warning }} + {% endfor %} + + Please review these warnings carefully. + Press ENTER to continue or CTRL+C to abort. + Continuing automatically in 30 seconds... + when: + - upgrade_warnings is defined + - upgrade_warnings | length > 0 diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml index f4c5b1b7cb..2aacba7451 100644 --- a/upgrade/roles/import_input_parameters/tasks/main.yml +++ b/upgrade/roles/import_input_parameters/tasks/main.yml @@ -12,3 +12,42 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- + +- name: Set backup location based on oim_metadata.yml + ansible.builtin.include_tasks: set_backup_location.yml + +- name: Validate backup location for upgrade input processing + ansible.builtin.include_tasks: precheck_backup_location.yml + +- name: Transform network_spec.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_network_spec.yml + +- name: Transform high_availability_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_high_availability_config.yml + +- name: Transform local_repo_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_local_repo_config.yml + +- name: Transform provision_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_provision_config.yml + +- name: Transform storage_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_storage_config.yml + +- name: Transform omnia_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_omnia_config.yml + +- name: Transform telemetry_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_telemetry_config.yml + +- name: Restore input files from backup + ansible.builtin.include_tasks: restore_input_files.yml + +- name: Restore user_registry_credential.yml from backup + ansible.builtin.include_tasks: restore_user_registry_credential.yml + +- name: Restore omnia_config_credentials.yml from backup + ansible.builtin.include_tasks: restore_omnia_config_credentials.yml + +- name: Display upgrade warnings summary + ansible.builtin.include_tasks: display_warnings.yml diff --git a/upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml new file mode 100644 index 0000000000..fe058f83a9 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml @@ -0,0 +1,25 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Validate backup_location is provided + ansible.builtin.fail: + msg: "{{ msg_backup_location_missing }}" + when: backup_location is not defined or (backup_location | string | trim) == "" + +- name: Ensure backup directory exists + ansible.builtin.file: + path: "{{ backup_location }}" + state: directory + mode: "{{ backup_dir_mode }}" diff --git a/upgrade/roles/import_input_parameters/tasks/restore_input_files.yml b/upgrade/roles/import_input_parameters/tasks/restore_input_files.yml new file mode 100644 index 0000000000..3dd6d45206 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_input_files.yml @@ -0,0 +1,25 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Validate restore_input_files is defined + ansible.builtin.set_fact: + restore_input_files_effective: "{{ restore_input_files | default([]) }}" + +- name: Restore input files from backup (overwrite target) + ansible.builtin.include_tasks: restore_single_input_file.yml + loop: "{{ restore_input_files_effective }}" + loop_control: + loop_var: restore_item + when: (restore_input_files_effective | length) > 0 diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml new file mode 100644 index 0000000000..6a20f371f8 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -0,0 +1,177 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+---
+
+# Restores omnia_config_credentials.yml and its vault key from the upgrade
+# backup. When both the key and an encrypted file are present, the file is
+# decrypted, re-rendered through omnia_config_credentials.yml.j2 and
+# re-encrypted with the same key. Other combinations add a warning or fail.
+
+- name: Check if backup omnia_config_credentials.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/omnia_config_credentials.yml"
+  register: backup_omnia_config_credentials_stat
+
+- name: Check if backup omnia_config_credentials_key exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/.omnia_config_credentials_key"
+  register: backup_omnia_config_credentials_key_stat
+
+- name: Add warning for missing omnia_config_credentials.yml to list
+  ansible.builtin.set_fact:
+    upgrade_warnings: >-
+      {{ upgrade_warnings + [msg_omnia_config_credentials_missing] }}
+  when:
+    - not backup_omnia_config_credentials_stat.stat.exists
+    # Skip when the same warning is already in the list.
+    - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))"
+
+# The content is only inspected for the vault header; failures are tolerated
+# and simply leave `stdout` without the header.
+- name: Check if backup file is encrypted
+  ansible.builtin.command:
+    cmd: cat "{{ backup_location }}/omnia_config_credentials.yml"
+  register: backup_omnia_config_credentials_content
+  changed_when: false
+  failed_when: false
+  no_log: true
+  when: backup_omnia_config_credentials_stat.stat.exists
+
+- name: Process omnia_config_credentials.yml when present in backup
+  when:
+    - backup_omnia_config_credentials_key_stat.stat.exists
+    - backup_omnia_config_credentials_content.stdout is defined
+    - "'$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout"
+  block:
+    - name: "Case 1: Key present and file encrypted - Process and update"
+      block:
+        - name: Copy encrypted omnia_config_credentials.yml from backup to temp location
+          ansible.builtin.copy:
+            src: "{{ backup_location }}/omnia_config_credentials.yml"
+            dest: "{{ input_project_dir }}/omnia_config_credentials.yml.tmp"
+            mode: '0600'
+            remote_src: true
+
+        - name: Copy omnia_config_credentials_key from backup
+          ansible.builtin.copy:
+            src: "{{ backup_location }}/.omnia_config_credentials_key"
+            dest: "{{ input_project_dir }}/.omnia_config_credentials_key"
+            mode: '0600'
+            remote_src: true
+
+        # argv form avoids a shell; a non-zero rc fails the task and
+        # triggers the rescue section below.
+        - name: Decrypt omnia_config_credentials.yml using the key
+          ansible.builtin.command:
+            argv:
+              - ansible-vault
+              - decrypt
+              - "{{ input_project_dir }}/omnia_config_credentials.yml.tmp"
+              - --vault-password-file
+              - "{{ input_project_dir }}/.omnia_config_credentials_key"
+              - --output
+              - "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted"
+          no_log: true
+          register: vault_decrypt_result
+          changed_when: false
+
+        - name: Read decrypted content
+          ansible.builtin.slurp:
+            src: "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted"
+          register: decrypted_content
+          no_log: true
+
+        - name: Parse YAML content and extract credentials
+          ansible.builtin.set_fact:
+            credentials_dict: >-
+              {{ decrypted_content.content | b64decode | from_yaml }}
+          no_log: true
+
+      rescue:
+        - name: Fail with decryption error message
+          ansible.builtin.fail:
+            msg: "{{ msg_omnia_config_decrypt_error }}"
+
+    # Runs under the same condition as Case 1 (inherited from the outer block).
+    - name: "Case 1.1: Apply template and encrypt"
+      block:
+        - name: Set template variables from credentials
+          ansible.builtin.set_fact:
+            provision_password: "{{ credentials_dict.provision_password | default('') }}"
+            bmc_username: "{{ credentials_dict.bmc_username | default('') }}"
+            bmc_password: "{{ credentials_dict.bmc_password | default('') }}"
+            minio_s3_password: "{{ credentials_dict.minio_s3_password | default('') }}"
+            pulp_password: "{{ credentials_dict.pulp_password | default('') }}"
+            docker_username: "{{ credentials_dict.docker_username | default('') }}"
+            docker_password: "{{ credentials_dict.docker_password | default('') }}"
+            slurm_db_password: "{{ credentials_dict.slurm_db_password | default('') }}"
+            openldap_db_username: "{{ credentials_dict.openldap_db_username | default('') }}"
+            openldap_db_password: "{{ credentials_dict.openldap_db_password | default('') }}"
+            mysqldb_user: "{{ credentials_dict.mysqldb_user | default('') }}"
+            mysqldb_password: "{{ credentials_dict.mysqldb_password | default('') }}"
+            mysqldb_root_password: "{{ credentials_dict.mysqldb_root_password | default('') }}"
+            csi_username: "{{ credentials_dict.csi_username | default('') }}"
+            csi_password: "{{ credentials_dict.csi_password | default('') }}"
+            ldms_sampler_password: "{{ credentials_dict.ldms_sampler_password | default('') }}"
+          no_log: true
+
+        - name: Write updated content using template
+          ansible.builtin.template:
+            src: omnia_config_credentials.yml.j2
+            dest: "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted"
+            mode: '0600'
+          no_log: true
+
+        - name: Encrypt updated file using the same key
+          ansible.builtin.command:
+            argv:
+              - ansible-vault
+              - encrypt
+              - "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted"
+              - --vault-password-file
+              - "{{ input_project_dir }}/.omnia_config_credentials_key"
+              - --output
+              - "{{ input_project_dir }}/omnia_config_credentials.yml"
+          no_log: true
+          register: vault_encrypt_result
+          changed_when: false
+
+        - name: Display success message
+          ansible.builtin.debug:
+            msg: "{{ msg_omnia_config_credentials_success }}"
+
+      rescue:
+        - name: Fail with template/encryption error message
+          ansible.builtin.fail:
+            msg: "{{ msg_omnia_config_template_error }}\n{{ msg_omnia_config_encrypt_error }}"
+
+  # Runs on success and on failure so decrypted credentials never stay on disk.
+  always:
+    - name: Clean up temporary files
+      ansible.builtin.file:
+        path: "{{ item }}"
+        state: absent
+      loop:
+        - "{{ input_project_dir }}/omnia_config_credentials.yml.tmp"
+        - "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted"
+
+# NOTE(review): Case 2 and Case 3 are kept outside the block above because
+# their conditions contradict its guard; inside it they could never run.
+- name: "Case 2: Both key and file missing - Add info warning"
+  ansible.builtin.set_fact:
+    upgrade_warnings: >-
+      {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }}
+  when:
+    - not backup_omnia_config_credentials_key_stat.stat.exists
+    - >-
+      backup_omnia_config_credentials_content.stdout is not defined
+      or '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout
+    # Evaluated as an expression (not a quoted literal) so the INFO entry is added once.
+    - "'INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key' not in (upgrade_warnings | join(' '))"
+
+- name: "Case 3: Error - Mismatched state"
+  ansible.builtin.fail:
+    msg: "{{ msg_omnia_config_credentials_error }}"
+  when: >-
+    (not backup_omnia_config_credentials_key_stat.stat.exists and
+    backup_omnia_config_credentials_content.stdout is defined and
+    '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout) or
+    (backup_omnia_config_credentials_key_stat.stat.exists and
+    backup_omnia_config_credentials_content.stdout is defined and
+    '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout)
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml b/upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml
new file mode 100644
index 0000000000..f55d14bd3e
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml
@@ -0,0 +1,54 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Restores one file described by `restore_item` (name, optional mode,
+# optional validate_cmd) from backup_location into input_project_dir.
+
+- name: Validate restore item fields
+  ansible.builtin.fail:
+    msg: "{{ msg_restore_item_name_missing }}"
+  when: restore_item.name is not defined or (restore_item.name | string | trim) == ""
+
+- name: Check if backup file exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/{{ restore_item.name }}"
+  register: restore_backup_stat
+
+- name: Fail if backup file is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_file_missing }}"
+  when: not restore_backup_stat.stat.exists
+
+- name: Overwrite input file from backup
+  ansible.builtin.copy:
+    src: "{{ backup_location }}/{{ restore_item.name }}"
+    dest: "{{ input_project_dir }}/{{ restore_item.name }}"
+    mode: "{{ restore_item.mode | default(default_file_mode) }}"
+    remote_src: true
+
+- name: Validate restored file (optional)
+  ansible.builtin.command:
+    cmd: "{{ restore_item.validate_cmd }}"
+  register: restore_validate
+  changed_when: false
+  # Do not fail here: the next task reports a non-zero rc with
+  # msg_validation_failed, which would otherwise never be reached.
+  failed_when: false
+  when: restore_item.validate_cmd is defined and (restore_item.validate_cmd | string | trim) != ""
+
+- name: Fail if restored file validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_validation_failed }}"
+  when:
+    # First clause guards the second: restore_validate has no rc when skipped.
+    - restore_item.validate_cmd is defined and (restore_item.validate_cmd | string | trim) != ""
+    - restore_validate.rc != 0
+
+- name: Display restore summary
+  ansible.builtin.debug:
+    msg: "{{ msg_restore_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
new file mode 100644
index 0000000000..158b029ed3
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml
@@ -0,0 +1,135 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Restores user_registry_credential.yml and .local_repo_credentials_key from
+# the upgrade backup. The encrypted file is copied as-is once the backup key
+# has been shown to decrypt it; other combinations add a warning or fail.
+
+- name: Check if backup user_registry_credential.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/user_registry_credential.yml"
+  register: backup_user_registry_credential_stat
+
+- name: Check if user_registry_credential.yml exists in current directory
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/user_registry_credential.yml"
+  register: user_registry_credential_stat
+
+- name: Check if backup local_repo_credentials_key exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/.local_repo_credentials_key"
+  register: backup_local_repo_credentials_key_stat
+
+- name: Add warning for missing user_registry_credential.yml to list
+  ansible.builtin.set_fact:
+    upgrade_warnings: >-
+      {{ upgrade_warnings + [msg_user_registry_credential_missing] }}
+  when:
+    - not backup_user_registry_credential_stat.stat.exists
+    # Skip when the same warning is already in the list.
+    - "'WARNING: user_registry_credential.yml not found in backup at' not in (upgrade_warnings | join(' '))"
+
+# The content is only inspected for the vault header; failures are tolerated.
+- name: Check if backup file is encrypted
+  ansible.builtin.command:
+    cmd: cat "{{ backup_location }}/user_registry_credential.yml"
+  register: backup_user_registry_content
+  changed_when: false
+  failed_when: false
+  no_log: true
+  when: backup_user_registry_credential_stat.stat.exists
+
+- name: Process user_registry_credential.yml when present in backup
+  when:
+    - backup_local_repo_credentials_key_stat.stat.exists
+    - backup_user_registry_content.stdout is defined
+    - "'$ANSIBLE_VAULT;' in backup_user_registry_content.stdout"
+  block:
+
+    - name: "Case 1: Key present and file encrypted - Copy both"
+      block:
+        # Check the key against the backup copy before anything is written.
+        # The previous decrypt step ran before the copy tasks on a .tmp file
+        # that nothing creates, so it always failed into the rescue section.
+        # `view` only prints to stdout, so no plaintext file is left behind.
+        - name: Verify backup user_registry_credential.yml decrypts with the backup key
+          ansible.builtin.command:
+            argv:
+              - ansible-vault
+              - view
+              - "{{ backup_location }}/user_registry_credential.yml"
+              - --vault-password-file
+              - "{{ backup_location }}/.local_repo_credentials_key"
+          no_log: true
+          register: vault_decrypt_result
+          changed_when: false
+
+        - name: Copy encrypted user_registry_credential.yml from backup
+          ansible.builtin.copy:
+            src: "{{ backup_location }}/user_registry_credential.yml"
+            dest: "{{ input_project_dir }}/user_registry_credential.yml"
+            mode: '0600'
+            remote_src: true
+
+        - name: Copy local_repo_credentials_key from backup
+          ansible.builtin.copy:
+            src: "{{ backup_location }}/.local_repo_credentials_key"
+            dest: "{{ input_project_dir }}/.local_repo_credentials_key"
+            mode: '0600'
+            remote_src: true
+
+        - name: Display success message for encrypted file restoration
+          ansible.builtin.debug:
+            msg: |
+              user_registry_credential.yml restored from backup.
+              Backup: {{ backup_location }}/user_registry_credential.yml
+              Target: {{ input_project_dir }}/user_registry_credential.yml
+              Status: Encrypted (key file also restored)
+      rescue:
+        - name: Fail with decryption error message
+          ansible.builtin.fail:
+            msg: "{{ msg_user_registry_decrypt_error }}"
+
+# NOTE(review): Case 2 and Case 3 are kept outside the block above because
+# their conditions contradict its guard; inside it they could never run.
+- name: "Case 2: Both key and file missing - Add info warning"
+  when:
+    - not backup_local_repo_credentials_key_stat.stat.exists
+    - >-
+      backup_user_registry_content.stdout is not defined
+      or '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout
+    # Evaluated as an expression (not a quoted literal) so the INFO entry is added once.
+    - "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))"
+  ansible.builtin.set_fact:
+    upgrade_warnings: >-
+      {{ upgrade_warnings + [
+        "INFO: Both user_registry_credential.yml and .local_repo_credentials_key " +
+        "are not present in backup. This is expected if registry credentials " +
+        "were not configured in the source installation."
+      ] }}
+
+- name: "Case 3: Error - Mismatched state"
+  when: >-
+    (not backup_local_repo_credentials_key_stat.stat.exists and
+    backup_user_registry_content.stdout is defined and
+    '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or
+    (backup_local_repo_credentials_key_stat.stat.exists and
+    backup_user_registry_content.stdout is defined and
+    '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout)
+  ansible.builtin.fail:
+    msg: |
+      ERROR: Inconsistent state detected for user_registry_credential.yml:
+      {% if not backup_local_repo_credentials_key_stat.stat.exists and
+            backup_user_registry_content.stdout is defined and
+            '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %}
+      - File is encrypted but key file (.local_repo_credentials_key) is missing
+      {% elif backup_local_repo_credentials_key_stat.stat.exists and
+              backup_user_registry_content.stdout is defined and
+              '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %}
+      - Key file exists but file is not encrypted
+      {% endif %}
+      Please check the backup integrity and ensure both files are present
+      in consistent states.
diff --git a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml
new file mode 100644
index 0000000000..94156606e5
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml
@@ -0,0 +1,33 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Derives backup_location from the upgrade_backup_dir entry of the file at
+# oim_metadata_path; fails when that entry is not defined.
+
+- name: Read oim_metadata.yml to get upgrade_backup_dir
+  register: oim_metadata_slurp
+  ansible.builtin.slurp:
+    src: "{{ oim_metadata_path }}"
+
+- name: Parse oim_metadata.yml
+  ansible.builtin.set_fact:
+    # slurp returns base64 content; decode it before parsing as YAML.
+    oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}"
+
+- name: Set backup_location from metadata
+  when:
+    - oim_metadata.upgrade_backup_dir is defined
+  ansible.builtin.set_fact:
+    backup_location: "{{ oim_metadata.upgrade_backup_dir }}/input/project_default"
+
+- name: Fail if upgrade_backup_dir is not defined in metadata
+  when:
+    - oim_metadata.upgrade_backup_dir is not defined
+  ansible.builtin.fail:
+    msg: "{{ msg_upgrade_backup_dir_missing }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml
new file mode 100644
index 0000000000..494dfda41a
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml
@@ -0,0 +1,114 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Rewrites high_availability_config.yml in input_project_dir from the backup
+# copy: normalises service_k8s_cluster_ha to a list, requires a non-empty
+# virtual_ip_address on every entry, renders high_availability_config.j2 and
+# checks that the result is loadable YAML.
+
+- name: Check if backup high_availability_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/high_availability_config.yml"
+  register: backup_ha_config_stat
+
+- name: Fail if backup high_availability_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_ha_config_missing }}"
+  when: not backup_ha_config_stat.stat.exists
+
+- name: Check if high_availability_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/high_availability_config.yml"
+  register: ha_config_stat
+
+- name: Fail if high_availability_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_ha_config_missing }}"
+  when: not ha_config_stat.stat.exists
+
+- name: Read backup high_availability_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/high_availability_config.yml"
+  register: backup_ha_config_slurp
+
+- name: Parse backup high_availability_config.yml
+  ansible.builtin.set_fact:
+    backup_ha_config: "{{ backup_ha_config_slurp.content | b64decode | from_yaml }}"
+
+# A single mapping is wrapped into a one-element list; a missing key becomes [].
+- name: Normalize service_k8s_cluster_ha to a list
+  ansible.builtin.set_fact:
+    ha_service_k8s_cluster_ha: >-
+      {{
+        (
+          [backup_ha_config.service_k8s_cluster_ha]
+          if (backup_ha_config.service_k8s_cluster_ha is mapping)
+          else (backup_ha_config.service_k8s_cluster_ha | default([]))
+        )
+      }}
+
+- name: Collect HA entries missing virtual_ip_address
+  ansible.builtin.set_fact:
+    ha_entries_missing_vip: >-
+      {{
+        (ha_service_k8s_cluster_ha | default([]))
+        | select('mapping')
+        | selectattr('virtual_ip_address', 'undefined')
+        | map(attribute='cluster_name')
+        | list
+      }}
+
+- name: Collect HA entries with empty virtual_ip_address
+  ansible.builtin.set_fact:
+    ha_entries_empty_vip: >-
+      {{
+        (ha_service_k8s_cluster_ha | default([]))
+        | select('mapping')
+        | selectattr('virtual_ip_address', 'defined')
+        | selectattr('virtual_ip_address', 'match', '^\\s*$')
+        | map(attribute='cluster_name')
+        | list
+      }}
+
+- name: Fail if virtual_ip_address is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_ha_virtual_ip_missing }}"
+  when: >-
+    (ha_service_k8s_cluster_ha | default([]) | length) == 0
+    or ((ha_entries_missing_vip | default([]) | length) > 0)
+    or ((ha_entries_empty_vip | default([]) | length) > 0)
+
+# The template reads ha_service_k8s_cluster_ha from the fact set above; the
+# self-referential task-level `vars` entry was redundant and is dropped.
+- name: Write high_availability_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: high_availability_config.j2
+    dest: "{{ input_project_dir }}/high_availability_config.yml"
+    mode: "{{ default_file_mode }}"
+
+- name: Validate YAML syntax of transformed high_availability_config.yml
+  ansible.builtin.command:
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/high_availability_config.yml','r'))"
+  register: ha_yaml_validation
+  changed_when: false
+  # Do not fail here: the next task reports a non-zero rc with
+  # msg_yaml_validation_failed, which would otherwise never be reached.
+  failed_when: false
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when:
+    - ha_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_ha_config }}"
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_ha_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
new file mode 100644
index 0000000000..4b8ac8e3ec
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml
@@ -0,0 +1,141 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Rewrites local_repo_config.yml in input_project_dir from the backup copy:
+# maps legacy keys onto the per-architecture keys, renders
+# local_repo_config.j2 and checks that the result is loadable YAML.
+
+- name: Check if backup local_repo_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/local_repo_config.yml"
+  register: backup_local_repo_config_stat
+
+- name: Fail if backup local_repo_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_local_repo_config_missing }}"
+  when: not backup_local_repo_config_stat.stat.exists
+
+- name: Check if local_repo_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/local_repo_config.yml"
+  register: local_repo_config_stat
+
+- name: Fail if local_repo_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_local_repo_config_missing }}"
+  when: not local_repo_config_stat.stat.exists
+
+- name: Read backup local_repo_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/local_repo_config.yml"
+  register: backup_local_repo_config_slurp
+
+- name: Parse backup local_repo_config.yml
+  ansible.builtin.set_fact:
+    backup_local_repo_config: "{{ backup_local_repo_config_slurp.content | b64decode | from_yaml }}"
+
+# Uses user_registry when defined; otherwise each string in omnia_registry is
+# turned into a {host, cert_path, key_path} mapping with empty paths.
+- name: Normalize user_registry
+  ansible.builtin.set_fact:
+    local_repo_user_registry: >-
+      {{
+        (
+          backup_local_repo_config.user_registry
+          if (backup_local_repo_config.user_registry is defined)
+          else
+          (
+            (
+              (backup_local_repo_config.omnia_registry | default([]))
+              | select('string')
+              | map('regex_replace', '^(.*)$', '{"host": "\\1", "cert_path": "", "key_path": ""}')
+              | map('from_json')
+              | list
+            )
+          )
+        )
+      }}
+
+# Each x86_64 key falls back to its un-suffixed legacy key, then to [].
+# omnia_repo_url_rhel_aarch64 also falls back to the legacy omnia_repo_url_rhel.
+- name: Normalize repo url keys to 2.1 schema
+  ansible.builtin.set_fact:
+    local_repo_user_repo_url_x86_64: >-
+      {{
+        backup_local_repo_config.user_repo_url_x86_64
+        | default(backup_local_repo_config.user_repo | default([]))
+      }}
+    local_repo_user_repo_url_aarch64: "{{ backup_local_repo_config.user_repo_url_aarch64 | default([]) }}"
+    local_repo_rhel_os_url_x86_64: >-
+      {{
+        backup_local_repo_config.rhel_os_url_x86_64
+        | default(backup_local_repo_config.rhel_os_url | default([]))
+      }}
+    local_repo_rhel_os_url_aarch64: "{{ backup_local_repo_config.rhel_os_url_aarch64 | default([]) }}"
+    local_repo_omnia_repo_url_rhel_x86_64: >-
+      {{
+        backup_local_repo_config.omnia_repo_url_rhel_x86_64
+        | default(backup_local_repo_config.omnia_repo_url_rhel | default([]))
+      }}
+    local_repo_omnia_repo_url_rhel_aarch64: >-
+      {{
+        backup_local_repo_config.omnia_repo_url_rhel_aarch64
+        | default(backup_local_repo_config.omnia_repo_url_rhel | default([]))
+      }}
+    local_repo_additional_repos_x86_64: >-
+      {{
+        backup_local_repo_config.additional_repos_x86_64
+        | default(backup_local_repo_config.additional_repos | default([]))
+      }}
+    local_repo_additional_repos_aarch64: "{{ backup_local_repo_config.additional_repos_aarch64 | default([]) }}"
+
+- name: Fail if omnia_repo_url_rhel_x86_64 is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_omnia_repo_url_rhel_x86_64_missing }}"
+  when: (local_repo_omnia_repo_url_rhel_x86_64 | default([]) | length) == 0
+
+- name: Fail if omnia_repo_url_rhel_aarch64 is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_omnia_repo_url_rhel_aarch64_missing }}"
+  when: (local_repo_omnia_repo_url_rhel_aarch64 | default([]) | length) == 0
+
+# The template reads the local_repo_* facts set above; the self-referential
+# task-level `vars` entries were redundant and are dropped.
+- name: Write local_repo_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: local_repo_config.j2
+    dest: "{{ input_project_dir }}/local_repo_config.yml"
+    mode: "{{ default_file_mode }}"
+
+- name: Validate YAML syntax of transformed local_repo_config.yml
+  ansible.builtin.command:
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/local_repo_config.yml','r'))"
+  register: local_repo_yaml_validation
+  changed_when: false
+  # Do not fail here: the next task reports a non-zero rc with
+  # msg_yaml_validation_failed, which would otherwise never be reached.
+  failed_when: false
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when:
+    - local_repo_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_local_repo_config }}"
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_local_repo_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml
new file mode 100644
index 0000000000..17e742d22f
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml
@@ -0,0 +1,157 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Rewrites network_spec.yml in input_project_dir from the backup copy:
+# extracts admin_network and ib_network, renders network_spec.j2, checks that
+# the result is loadable YAML, then validates the ib_network entries.
+
+- name: Check if backup network_spec.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/network_spec.yml"
+  register: backup_network_spec_stat
+
+- name: Fail if backup network_spec.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_network_spec_missing }}"
+  when: not backup_network_spec_stat.stat.exists
+
+- name: Check if network_spec.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/network_spec.yml"
+  register: network_spec_stat
+
+- name: Fail if network_spec.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_network_spec_missing }}"
+  when: not network_spec_stat.stat.exists
+
+- name: Read backup network_spec.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/network_spec.yml"
+  register: backup_network_spec_slurp
+
+- name: Parse backup network_spec.yml
+  ansible.builtin.set_fact:
+    backup_network_spec: "{{ backup_network_spec_slurp.content | b64decode | from_yaml }}"
+
+# Each network is taken from a top-level key when present, otherwise from the
+# first entry of the Networks list that defines it, otherwise {}.
+- name: Extract admin_network and ib_network from backup file
+  ansible.builtin.set_fact:
+    admin_network: >-
+      {{
+        (backup_network_spec.admin_network
+         if (backup_network_spec is mapping and backup_network_spec.admin_network is defined)
+         else
+         (
+           (backup_network_spec.Networks | default([])
+            | select('mapping')
+            | selectattr('admin_network', 'defined')
+            | map(attribute='admin_network')
+            | first
+           ) | default({})
+         )
+        )
+      }}
+    ib_network: >-
+      {{
+        (backup_network_spec.ib_network
+         if (backup_network_spec is mapping and backup_network_spec.ib_network is defined)
+         else
+         (
+           (backup_network_spec.Networks | default([])
+            | select('mapping')
+            | selectattr('ib_network', 'defined')
+            | map(attribute='ib_network')
+            | first
+           ) | default({})
+         )
+        )
+      }}
+
+- name: Render network_spec.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: network_spec.j2
+    dest: "{{ input_project_dir }}/network_spec.yml"
+    mode: "{{ default_file_mode }}"
+  vars:
+    admin_network_netmask_bits: "{{ admin_network.netmask_bits | default('24') }}"
+
+# The syntax check now runs before the file is parsed with from_yaml, so a
+# bad render is reported with msg_yaml_validation_failed rather than as a
+# templating error in the parse task.
+- name: Validate YAML syntax of transformed network_spec.yml
+  ansible.builtin.command:
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/network_spec.yml','r'))"
+  register: network_spec_yaml_validation
+  changed_when: false
+  # Do not fail here: the next task reports a non-zero rc.
+  failed_when: false
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when:
+    - network_spec_yaml_validation.rc != 0
+
+- name: Read transformed network_spec.yml
+  ansible.builtin.slurp:
+    src: "{{ input_project_dir }}/network_spec.yml"
+  register: network_spec_21_slurp
+
+- name: Parse transformed network_spec.yml
+  ansible.builtin.set_fact:
+    network_spec_21: "{{ network_spec_21_slurp.content | b64decode | from_yaml }}"
+
+# A missing ib_network.netmask_bits falls back to the admin value, so only an
+# explicit, different value fails here.
+- name: Ensure ib_network.netmask_bits matches admin_network.netmask_bits
+  ansible.builtin.fail:
+    msg: "{{ msg_ib_netmask_mismatch }}"
+  when:
+    - >-
+      (ib_network.netmask_bits | default(admin_network.netmask_bits | default('24')) | string)
+      != (admin_network.netmask_bits | default('24') | string)
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_network_spec }}"
+
+- name: Validate mandatory ib_network is present in transformed output
+  ansible.builtin.fail:
+    msg: "{{ msg_ib_network_missing }}"
+  when:
+    - >-
+      (network_spec_21.Networks is not defined)
+      or ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) == 0)
+
+- name: Extract ib_network subnet from transformed output
+  ansible.builtin.set_fact:
+    ib_network_subnet: >-
+      {{
+        (
+          network_spec_21.Networks
+          | select('mapping')
+          | selectattr('ib_network', 'defined')
+          | map(attribute='ib_network')
+          | first
+          | default({})
+        ).subnet | default('')
+      }}
+
+- name: Validate mandatory ib_network.subnet is present in transformed output
+  ansible.builtin.fail:
+    msg: "{{ msg_ib_subnet_missing }}"
+  when:
+    - >-
+      (ib_network_subnet | string | trim) == ''
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_network_spec_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml
new file mode 100644
index 0000000000..ab62c3ff28
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml
@@ -0,0 +1,103 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Rewrites omnia_config.yml in input_project_dir from the backup copy:
+# normalises slurm_cluster and service_k8s_cluster to lists, requires both to
+# be non-empty, renders omnia_config.j2 and checks the result is loadable YAML.
+
+- name: Check if backup omnia_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/omnia_config.yml"
+  register: backup_omnia_config_stat
+
+- name: Fail if backup omnia_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_omnia_config_missing }}"
+  when: not backup_omnia_config_stat.stat.exists
+
+- name: Check if omnia_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/omnia_config.yml"
+  register: omnia_config_stat
+
+- name: Fail if omnia_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_omnia_config_missing }}"
+  when: not omnia_config_stat.stat.exists
+
+- name: Read backup omnia_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/omnia_config.yml"
+  register: backup_omnia_config_slurp
+
+- name: Parse backup omnia_config.yml
+  ansible.builtin.set_fact:
+    backup_omnia_config: "{{ backup_omnia_config_slurp.content | b64decode | from_yaml }}"
+
+- name: Normalize omnia_config.yml values
+  ansible.builtin.set_fact:
+    omnia_slurm_cluster_raw: "{{ backup_omnia_config.slurm_cluster | default([]) }}"
+    omnia_service_k8s_cluster_raw: "{{ backup_omnia_config.service_k8s_cluster | default([]) }}"
+
+# A single mapping is wrapped into a one-element list; lists pass through.
+- name: Ensure slurm_cluster and service_k8s_cluster are lists
+  ansible.builtin.set_fact:
+    omnia_slurm_cluster: >-
+      {{
+        [omnia_slurm_cluster_raw]
+        if (omnia_slurm_cluster_raw is mapping)
+        else omnia_slurm_cluster_raw
+      }}
+    omnia_service_k8s_cluster: >-
+      {{
+        [omnia_service_k8s_cluster_raw]
+        if (omnia_service_k8s_cluster_raw is mapping)
+        else omnia_service_k8s_cluster_raw
+      }}
+
+- name: Fail if slurm_cluster is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_slurm_cluster_missing }}"
+  when: (omnia_slurm_cluster | default([]) | length) == 0
+
+- name: Fail if service_k8s_cluster is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_service_k8s_cluster_missing }}"
+  when: (omnia_service_k8s_cluster | default([]) | length) == 0
+
+# The template reads omnia_slurm_cluster and omnia_service_k8s_cluster from
+# the facts set above; the self-referential task-level `vars` entries were
+# redundant and are dropped.
+- name: Write omnia_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: omnia_config.j2
+    dest: "{{ input_project_dir }}/omnia_config.yml"
+    mode: "{{ default_file_mode }}"
+
+- name: Validate YAML syntax of transformed omnia_config.yml
+  ansible.builtin.command:
+    cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/omnia_config.yml','r'))"
+  register: omnia_yaml_validation
+  changed_when: false
+  # Do not fail here: the next task reports a non-zero rc with
+  # msg_yaml_validation_failed, which would otherwise never be reached.
+  failed_when: false
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when:
+    - omnia_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_omnia_config }}"
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_omnia_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml
new file mode 100644
index 0000000000..42598d59bc
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml
@@ -0,0 +1,85 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Rebuilds provision_config.yml in the Omnia 2.1 layout.
+# Values are read from the backup copy under backup_location; the file under
+# input_project_dir is overwritten by the provision_config.j2 template.
+
+- name: Check if backup provision_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/provision_config.yml"
+  register: backup_provision_config_stat
+
+- name: Fail if backup provision_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_provision_config_missing }}"
+  when: not backup_provision_config_stat.stat.exists
+
+- name: Check if provision_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/provision_config.yml"
+  register: provision_config_stat
+
+- name: Fail if provision_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_provision_config_missing }}"
+  when: not provision_config_stat.stat.exists
+
+- name: Read backup provision_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/provision_config.yml"
+  register: backup_provision_config_slurp
+
+- name: Parse backup provision_config.yml
+  ansible.builtin.set_fact:
+    backup_provision_config: "{{ backup_provision_config_slurp.content | b64decode | from_yaml }}"
+
+# Keys absent from the backup fall back to the literal defaults below.
+- name: Normalize provision_config.yml values
+  ansible.builtin.set_fact:
+    provision_pxe_mapping_file_path: "{{ backup_provision_config.pxe_mapping_file_path | default('pxe_mapping_file.csv') }}"
+    provision_language: "{{ backup_provision_config.language | default('en_US.UTF-8') }}"
+    provision_default_lease_time: "{{ backup_provision_config.default_lease_time | default('86400') }}"
+
+# Only reachable when the backup defines the key with a blank value; an
+# undefined key has already been replaced by the default above.
+- name: Fail if pxe_mapping_file_path is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_pxe_mapping_file_path_missing }}"
+  when: (provision_pxe_mapping_file_path | string | trim) == ''
+
+# The template reads the provision_* facts set above directly; no task-level
+# vars are needed (a self-referencing `x: "{{ x }}"` var would only recurse if
+# the fact were ever absent).
+- name: Write provision_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: provision_config.j2
+    dest: "{{ input_project_dir }}/provision_config.yml"
+    mode: "{{ default_file_mode }}"
+
+# The path is passed as a separate argv element so quotes or spaces in it
+# cannot break the Python snippet. failed_when is disabled so that a parse
+# error reaches the next task, which reports msg_yaml_validation_failed.
+- name: Validate YAML syntax of transformed provision_config.yml
+  ansible.builtin.command:
+    argv:
+      - python3
+      - "-c"
+      - "import sys, yaml; yaml.safe_load(open(sys.argv[1], 'r'))"
+      - "{{ input_project_dir }}/provision_config.yml"
+  register: provision_yaml_validation
+  changed_when: false
+  failed_when: false
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when:
+    - provision_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_provision_config }}"
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_provision_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
new file mode 100644
index 0000000000..8a167df6fb
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml
@@ -0,0 +1,90 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Rebuilds storage_config.yml in the Omnia 2.1 layout.
+# Values are read from the backup copy under backup_location; the file under
+# input_project_dir is overwritten by the storage_config.j2 template.
+
+- name: Check if backup storage_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/storage_config.yml"
+  register: backup_storage_config_stat
+
+- name: Fail if backup storage_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_storage_config_missing }}"
+  when: not backup_storage_config_stat.stat.exists
+
+- name: Check if storage_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/storage_config.yml"
+  register: storage_config_stat
+
+- name: Fail if storage_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_storage_config_missing }}"
+  when: not storage_config_stat.stat.exists
+
+- name: Read backup storage_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/storage_config.yml"
+  register: backup_storage_config_slurp
+
+- name: Parse backup storage_config.yml
+  ansible.builtin.set_fact:
+    backup_storage_config: "{{ backup_storage_config_slurp.content | b64decode | from_yaml }}"
+
+# default([], true) also replaces a key that exists but is empty (null), so the
+# "missing" check below reports it instead of crashing on `length`.
+- name: Normalize storage_config.yml values
+  ansible.builtin.set_fact:
+    storage_nfs_client_params: "{{ backup_storage_config.nfs_client_params | default([], true) }}"
+
+- name: Fail if nfs_client_params is missing
+  ansible.builtin.fail:
+    msg: "{{ msg_nfs_client_params_missing }}"
+  when: (storage_nfs_client_params | default([]) | length) == 0
+
+# Every entry must define all four keys; one undefined key in any entry fails.
+- name: Fail if any NFS client entry is missing required keys
+  ansible.builtin.fail:
+    msg: "{{ msg_nfs_client_param_entry_missing_keys }}"
+  when: >-
+    (storage_nfs_client_params | selectattr('server_ip', 'undefined') | list | length) > 0 or
+    (storage_nfs_client_params | selectattr('server_share_path', 'undefined') | list | length) > 0 or
+    (storage_nfs_client_params | selectattr('client_share_path', 'undefined') | list | length) > 0 or
+    (storage_nfs_client_params | selectattr('client_mount_options', 'undefined') | list | length) > 0
+
+# The template reads storage_nfs_client_params straight from the fact set
+# above; no task-level vars are needed (a self-referencing `x: "{{ x }}"` var
+# would only recurse if the fact were ever absent).
+- name: Write storage_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: storage_config.j2
+    dest: "{{ input_project_dir }}/storage_config.yml"
+    mode: "{{ default_file_mode }}"
+
+# The path is passed as a separate argv element so quotes or spaces in it
+# cannot break the Python snippet. failed_when is disabled so that a parse
+# error reaches the next task, which reports msg_yaml_validation_failed.
+- name: Validate YAML syntax of transformed storage_config.yml
+  ansible.builtin.command:
+    argv:
+      - python3
+      - "-c"
+      - "import sys, yaml; yaml.safe_load(open(sys.argv[1], 'r'))"
+      - "{{ input_project_dir }}/storage_config.yml"
+  register: storage_yaml_validation
+  changed_when: false
+  failed_when: false
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when:
+    - storage_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_storage_config }}"
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_storage_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml
new file mode 100644
index 0000000000..1aa095e66b
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml
@@ -0,0 +1,148 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Rebuilds telemetry_config.yml in the Omnia 2.1 layout.
+# Values are read from the backup copy under backup_location; the file under
+# input_project_dir is overwritten by the telemetry_config.j2 template.
+
+- name: Check if backup telemetry_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/telemetry_config.yml"
+  register: backup_telemetry_config_stat
+
+- name: Fail if backup telemetry_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_backup_telemetry_config_missing }}"
+  when: not backup_telemetry_config_stat.stat.exists
+
+- name: Check if telemetry_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/telemetry_config.yml"
+  register: telemetry_config_stat
+
+- name: Fail if telemetry_config.yml is not present
+  ansible.builtin.fail:
+    msg: "{{ msg_telemetry_config_missing }}"
+  when: not telemetry_config_stat.stat.exists
+
+- name: Read backup telemetry_config.yml (source of truth)
+  ansible.builtin.slurp:
+    src: "{{ backup_location }}/telemetry_config.yml"
+  register: backup_telemetry_config_slurp
+
+- name: Parse backup telemetry_config.yml
+  ansible.builtin.set_fact:
+    backup_telemetry_config: "{{ backup_telemetry_config_slurp.content | b64decode | from_yaml }}"
+
+# The nested sections are pulled out first so the per-key lookups below can
+# fall back to their defaults when a whole section is absent.
+- name: Normalize nested backup telemetry sections
+  ansible.builtin.set_fact:
+    backup_telemetry_victoria_config: "{{ backup_telemetry_config.victoria_configurations | default({}) }}"
+    backup_telemetry_kafka_config: "{{ backup_telemetry_config.kafka_configurations | default({}) }}"
+
+# Each value is taken from the backup; the literal after default() is used
+# only when the backup does not define that key.
+- name: Normalize telemetry_config.yml values
+  ansible.builtin.set_fact:
+    telemetry_idrac_telemetry_support: "{{ backup_telemetry_config.idrac_telemetry_support | default(true) }}"
+    telemetry_idrac_telemetry_collection_type: >-
+      {{
+        backup_telemetry_config.idrac_telemetry_collection_type
+        | default('victoria,kafka')
+      }}
+    telemetry_victoria_deployment_mode: "{{ backup_telemetry_victoria_config.deployment_mode | default('cluster') }}"
+    telemetry_victoria_persistence_size: "{{ backup_telemetry_victoria_config.persistence_size | default('8Gi') }}"
+    telemetry_victoria_retention_period: "{{ backup_telemetry_victoria_config.retention_period | default(168) }}"
+    telemetry_kafka_persistence_size: "{{ backup_telemetry_kafka_config.persistence_size | default('8Gi') }}"
+    telemetry_kafka_log_retention_hours: "{{ backup_telemetry_kafka_config.log_retention_hours | default(168) }}"
+    telemetry_kafka_log_retention_bytes: "{{ backup_telemetry_kafka_config.log_retention_bytes | default(-1) }}"
+    telemetry_kafka_log_segment_bytes: "{{ backup_telemetry_kafka_config.log_segment_bytes | default(1073741824) }}"
+    telemetry_kafka_topic_partitions: >-
+      {{
+        backup_telemetry_kafka_config.topic_partitions
+        | default([
+          {'name': 'idrac', 'partitions': 1},
+          {'name': 'ldms', 'partitions': 2}
+        ])
+      }}
+    telemetry_ldms_agg_port: "{{ backup_telemetry_config.ldms_agg_port | default(6001) }}"
+    telemetry_ldms_store_port: "{{ backup_telemetry_config.ldms_store_port | default(6001) }}"
+    telemetry_ldms_sampler_port: "{{ backup_telemetry_config.ldms_sampler_port | default(10001) }}"
+    telemetry_ldms_sampler_configurations: >-
+      {{
+        backup_telemetry_config.ldms_sampler_configurations
+        | default([
+          {
+            'plugin_name': 'meminfo',
+            'config_parameters': '',
+            'activation_parameters': 'interval=1000000'
+          },
+          {
+            'plugin_name': 'procstat2',
+            'config_parameters': '',
+            'activation_parameters': 'interval=1000000'
+          },
+          {
+            'plugin_name': 'vmstat',
+            'config_parameters': '',
+            'activation_parameters': 'interval=1000000'
+          },
+          {
+            'plugin_name': 'loadavg',
+            'config_parameters': '',
+            'activation_parameters': 'interval=1000000'
+          },
+          {
+            'plugin_name': 'procnetdev2',
+            'config_parameters': '',
+            'activation_parameters': 'interval=1000000 offset=0'
+          }
+        ])
+      }}
+
+# The template reads the telemetry_* facts set above directly; no task-level
+# vars are needed (a self-referencing `x: "{{ x }}"` var would only recurse if
+# the fact were ever absent).
+- name: Write telemetry_config.yml in Omnia 2.1 format
+  ansible.builtin.template:
+    src: telemetry_config.j2
+    dest: "{{ input_project_dir }}/telemetry_config.yml"
+    mode: "{{ default_file_mode }}"
+
+# The path is passed as a separate argv element so quotes or spaces in it
+# cannot break the Python snippet. failed_when is disabled so that a parse
+# error reaches the next task, which reports msg_yaml_validation_failed.
+- name: Validate YAML syntax of transformed telemetry_config.yml
+  ansible.builtin.command:
+    argv:
+      - python3
+      - "-c"
+      - "import sys, yaml; yaml.safe_load(open(sys.argv[1], 'r'))"
+      - "{{ input_project_dir }}/telemetry_config.yml"
+  register: telemetry_yaml_validation
+  changed_when: false
+  failed_when: false
+
+- name: Fail if YAML validation fails
+  ansible.builtin.fail:
+    msg: "{{ msg_yaml_validation_failed }}"
+  when:
+    - telemetry_yaml_validation.rc != 0
+
+- name: Display backup path (no-op when skipped)
+  ansible.builtin.debug:
+    msg: "{{ msg_using_backup_telemetry_config }}"
+
+- name: Display transformation summary
+  ansible.builtin.debug:
+    msg: "{{ msg_telemetry_config_transform_summary }}"
diff --git a/upgrade/roles/import_input_parameters/templates/high_availability_config.j2 b/upgrade/roles/import_input_parameters/templates/high_availability_config.j2
new file mode 100644
index 0000000000..b116d962fe
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/high_availability_config.j2
@@ -0,0 +1,27 @@
+# Copyright 2026 Dell Inc.
or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + +# *********************************************************************** +# High Availability (HA) Configuration for Kubernetes (K8s) Service Node(List) +# - cluster_name: Required field. It must match one of the cluster names defined in omnia_config.yml where deployment is set to true. +# - enable_k8s_ha: Indicates whether to enable HA for the Kubernetes (K8s) service node. Set to 'true' to enable, 'false' to disable. +# - virtual_ip_address: The virtual IP address for the K8s service node setup. +# *********************************************************************** + +{{ {'service_k8s_cluster_ha': ha_service_k8s_cluster_ha} | to_nice_yaml(indent=2) }} diff --git a/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 b/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 new file mode 100644 index 0000000000..dbe38d70ad --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 @@ -0,0 +1,199 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + +# ================================ +# VARIABLE DETAILS +# ================================ +# 1. user_registry +#-------------------------- +# Configuration for user registry to configure additional images in Pulp +# Fields: +# host : Registry IP and port in format "IP:port" +# cert_path : Path to SSL certificate file (.crt) - Required only if host is using HTTPS +# key_path : Path to SSL private key file (.key) - Required only if host is using HTTPS +# Notes: +# - If host is HTTPS, cert_path and key_path are required +# - If host is HTTP, cert_path and key_path can be left empty +# - cert_path should point to .crt files only +# - key_path should point to .key files only +# - cert and key paths are accessed from within the omnia_core container +# 2. user_repo_url_x86_64 +#-------------------------- +# Optional list of user-defined repository URLs for x86_64 architecture. +# Each entry can include: url, gpgkey, sslcacert, sslclientkey, sslclientcert, name, policy. +# Used for custom cluster packages like _slurm_custom. 
+# Fields: +# url : Base URL of the repository +# gpgkey : GPG key URL (leave empty to disable gpgcheck; Omnia will trust this repo and user is responsible for its security) +# name : Name of the repository +# sslcacert : Path to SSL CA certificate (if using SSL) +# sslclientkey: Path to SSL client key (if using SSL) +# sslclientcert: Path to SSL client certificate (if using SSL) +# policy : Repository policy (always, partial) +# Notes: +# - Do not use Jinja variables in this configuration. +# - Omit SSL fields entirely if SSL is not in use. +# - It is a mandatory field in case of slurm_custom, with name as '_slurm_custom' +# +# 3. user_repo_url_aarch64 +#--------------------------- +# Same as above but for aarch64 architecture. +# +# 4. rhel_os_url_x86_64 +#----------------------------- +# Mandatory when RHEL subscription is not registered. +# Contains repository URLs for codeready-builder, baseos, and appstream for x86_64. +# Fields: +# url : Base URL of the repository +# gpgkey : GPG key URL (leave empty to disable gpgcheck; Omnia will trust this repo and user is responsible for its security) +# sslcacert : Path to SSL CA certificate (if using SSL) +# sslclientkey: Path to SSL client key (if using SSL) +# sslclientcert: Path to SSL client certificate (if using SSL) +# policy : Repository policy; allowed values are always and partial. If not mentioned, the value from software_config.json is used +# name : Name of the repository [ Allowed repo names _codeready-builder, _appstream, _baseos ] +# Notes: +# - Do not use Jinja variables in this configuration. +# - Omit SSL fields entirely if SSL is not in use. +# - If RHEL subscription is not registered, all 3 repository entries [ _codeready-builder, _appstream, _baseos ] +# are mandatory. +# +# 5. rhel_os_url_aarch64 +#---------------------------- +# Same as above but for aarch64 architecture. +# +#### ADVANCED CONFIGURATIONS FOR LOCAL REPO ### +# 6. 
omnia_repo_url_rhel_x86_64 +#------------------------------- +# Mandatory repository URLs for downloading RPMS for Omnia features on RHEL x86_64. +# Each entry includes url, gpgkey, and name. +# +# This variable defines all the repo urls from where rpms will be downloaded for omnia features when cluster_os_type is rhel and arch x86_64 +# Making incorrect changes to this variable can cause omnia failure. Please edit cautiously. +# Fields: +# url : Base URL of the repository. +# gpgkey : URL of the GPG key for the repository. +# If left empty, gpgcheck=0 for that repository. +# name : A unique identifier for the repository or registry. +# +# 7. omnia_repo_url_rhel_aarch64 +#-------------------------------- +# Same as above but for RHEL aarch64. +# +# 8. additional_repos_x86_64 +#---------------------------- +# Optional list of additional repository URLs for x86_64 architecture. +# These repos are aggregated into a single Pulp repository, allowing dynamic +# addition/removal without changing compute node configurations. +# Fields: +# url : Base URL of the repository (required) +# gpgkey : GPG key URL (required, can be empty - disables gpgcheck) +# name : Unique name for the repository (required) +# sslcacert : Path to SSL CA certificate (optional) +# sslclientkey : Path to SSL client key (optional) +# sslclientcert : Path to SSL client certificate (optional) +# Notes: +# - All repos are synced into a single aggregated Pulp repository +# - Compute nodes are configured once with a fixed URL that never changes +# - Policy is controlled globally via repo_config in software_config.json (per-entry policy not supported) +# - Name must be unique within this list and must not conflict with names in other repo keys +# - Packages from these repos can only be used via additional_packages.json +# +# 9. additional_repos_aarch64 +#----------------------------- +# Same as above but for aarch64 architecture. 
+
+# ================================
+# VARIABLES
+# ================================
+{# Rendering notes (Jinja comment; presumably not emitted into the generated file - confirm trim_blocks is left at the template module default):
+   - Every list below is read from a local_repo_* variable and falls back to [] when that variable is undefined.
+   - An empty list renders only the bare key with no items.
+   - Every field value is passed through to_json, and a key absent from an entry is rendered as "".
+   NOTE(review): user_repo_url_* entries are rendered without 'policy' although the header above lists it as a field - confirm this is intended.
+   NOTE(review): rhel_os_url_* entries always render 'policy' (as "" when unset) - confirm the consumer accepts an empty policy.
+#}
+# user_registry:
+#   - { host: "172.16.107.254:4000", cert_path: "/opt/omnia/domain.crt", key_path: "/opt/omnia/domain.key" }
+user_registry:
+{% set _user_registry = local_repo_user_registry | default([]) %}
+{% if (_user_registry | length) > 0 %}
+{% for _reg in _user_registry %}
+  - { host: {{ (_reg.host | default('')) | to_json }}, cert_path: {{ (_reg.cert_path | default('')) | to_json }}, key_path: {{ (_reg.key_path | default('')) | to_json }} }
+{% endfor %}
+{% endif %}
+# user_repo_url_x86_64:
+#   - { url: "", gpgkey: "", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_slurm_custom" }
+user_repo_url_x86_64:
+{% set _user_repo_url_x86_64 = local_repo_user_repo_url_x86_64 | default([]) %}
+{% if (_user_repo_url_x86_64 | length) > 0 %}
+{% for _repo in _user_repo_url_x86_64 %}
+  - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }} }
+{% endfor %}
+{% endif %}
+user_repo_url_aarch64:
+{% set _user_repo_url_aarch64 = local_repo_user_repo_url_aarch64 | default([]) %}
+{% if (_user_repo_url_aarch64 | length) > 0 %}
+{% for _repo in _user_repo_url_aarch64 %}
+  - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }} }
+{% endfor %}
+{% endif %}
+#Example:
+# rhel_os_url_x86_64:
+#   - { url: "http://crb.com/CRB/x86_64/os/", gpgkey: "http://crb.com/CRB/x86_64/os/RPM-GPG-KEY", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_codeready-builder"}
+#   - { url: "http://BaseOS.com/BaseOS/x86_64/os/", gpgkey: "http://BaseOS.com/BaseOS/x86_64/os/RPM-GPG-KEY", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_baseos"}
+#   - { url: "http://AppStream.com/AppStream/x86_64/os/", gpgkey: "http://AppStream.com/AppStream/x86_64/os/RPM-GPG-KEY", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_appstream" }
+rhel_os_url_x86_64:
+{% set _rhel_os_url_x86_64 = local_repo_rhel_os_url_x86_64 | default([]) %}
+{% if (_rhel_os_url_x86_64 | length) > 0 %}
+{% for _repo in _rhel_os_url_x86_64 %}
+  - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }}, policy: {{ (_repo.policy | default('')) | to_json }} }
+{% endfor %}
+{% endif %}
+rhel_os_url_aarch64:
+{% set _rhel_os_url_aarch64 = local_repo_rhel_os_url_aarch64 | default([]) %}
+{% if (_rhel_os_url_aarch64 | length) > 0 %}
+{% for _repo in _rhel_os_url_aarch64 %}
+  - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }}, policy: {{ (_repo.policy | default('')) | to_json }} }
+{% endfor %}
+{% endif %}
+# Making incorrect changes to this variable can cause omnia failure. Please edit cautiously.
+omnia_repo_url_rhel_x86_64:
+{% set _omnia_repo_url_rhel_x86_64 = local_repo_omnia_repo_url_rhel_x86_64 | default([]) %}
+{% if (_omnia_repo_url_rhel_x86_64 | length) > 0 %}
+{% for _repo in _omnia_repo_url_rhel_x86_64 %}
+  - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }} }
+{% endfor %}
+{% endif %}
+omnia_repo_url_rhel_aarch64:
+{% set _omnia_repo_url_rhel_aarch64 = local_repo_omnia_repo_url_rhel_aarch64 | default([]) %}
+{% if (_omnia_repo_url_rhel_aarch64 | length) > 0 %}
+{% for _repo in _omnia_repo_url_rhel_aarch64 %}
+  - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }} }
+{% endfor %}
+{% endif %}
+# Example:
+# additional_repos_x86_64:
+#   - { url: "https://rpm.grafana.com/", gpgkey: "", name: "grafana" }
+#   - { url: "https://repo.example.com/x86_64/", gpgkey: "", name: "custom-repo", sslcacert: "/path/ca.crt", sslclientkey: "/path/client.key", sslclientcert: "/path/client.crt" }
+additional_repos_x86_64:
+{% set _additional_repos_x86_64 = local_repo_additional_repos_x86_64 | default([]) %}
+{% if (_additional_repos_x86_64 | length) > 0 %}
+{% for _repo in _additional_repos_x86_64 %}
+  - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }} }
+{% endfor %}
+{% endif %}
+additional_repos_aarch64:
+{% set _additional_repos_aarch64 = local_repo_additional_repos_aarch64 | default([]) %}
+{% if (_additional_repos_aarch64 | length) > 0 %}
+{% for _repo in _additional_repos_aarch64 %}
+  - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey |
default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }} } +{% endfor %} +{% endif %} diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2 index 98d3073c0f..d9e41ba469 100644 --- a/upgrade/roles/import_input_parameters/templates/network_spec.j2 +++ b/upgrade/roles/import_input_parameters/templates/network_spec.j2 @@ -1,14 +1,55 @@ # Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# This file is used to specify the network configuration. +# +# 'admin_network' is a mandatory field, essential for PXE boot and host communication." 
+# +# The 'admin_network' section contains the following variables: +# - 'oim_nic_name': The name of the interface on the OIM server associated with the admin network. +# - 'netmask_bits': The number of bits in the subnet mask. +# - 'primary_oim_admin_ip': The admin IP address of the OIM server which is configured. +# - 'primary_oim_bmc_ip': The iDRAC IP address of the OIM server, +# Mandatory only if idrac_telemetry is set to true and telemetry data needs to be collected from the OIM server. +# Optional — can be omitted if iDRAC telemetry for the OIM server is not required. +# - 'dynamic_range': The range of dynamic IP addresses available on the admin network. +# - 'dns': The list of external DNS server IP address for the admin network. +# - 'ntp_servers': The list of NTP servers for the admin network. Each NTP server entry should include: +# - 'address': The IP address or hostname of the NTP server. +# - 'type': The type of NTP entry, either 'server' or 'pool'. +# Example: +# ntp_servers: +# - { address: "172.16.10.80", type: "server" } + +# 'ib_network' is a mandatory field, essential for IB network configuration. +# The 'ib_network' section contains the following variables: +# - 'subnet': The subnet of the IB network. +# - 'netmask_bits': The number of bits in the subnet mask. This value must be same as the admin_network netmask_bits. 
+ +Networks: +- admin_network: + oim_nic_name: "{{ admin_network.oim_nic_name | default('') }}" + netmask_bits: "{{ admin_network.netmask_bits | default('24') }}" + primary_oim_admin_ip: "{{ admin_network.primary_oim_admin_ip | default('') }}" +{% if (admin_network.primary_oim_bmc_ip is defined) and ((admin_network.primary_oim_bmc_ip | string | trim) != '') %} + primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip }}" +{% endif %} + dynamic_range: "{{ admin_network.dynamic_range | default('') }}" + dns: {{ admin_network.dns | default([]) | to_json }} + ntp_servers: {{ admin_network.ntp_servers | default([]) | to_json }} + +- ib_network: + subnet: "{{ ib_network.subnet | default('192.168.0.0') }}" + netmask_bits: "{{ ib_network.netmask_bits | default(admin_network_netmask_bits | default(admin_network.netmask_bits | default('24'))) }}" diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 new file mode 100644 index 0000000000..aec7a05ab7 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 @@ -0,0 +1,160 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. 
+# *********************************************************************** + +# -----------------------------SLURM------------------------------------------------ +# slurm_cluster +# List of slurm clusters +# cluster_name is a required field + +# nfs_storage_name +# Storage name corresponding to the NFS share to be used by slurm cluster +# This should match exactly with an entry in storage_config.yml + +# config_sources +# defines how the Slurm configuration files are provided to the cluster. +# : +# or +# Supply the configuration values directly as a key–value map +# Supply the absolute path to a custom configuration file +# The conf files supported by slurm are +# slurm +# cgroup +# slurmdbd +# gres +# These files will be written into the slurm_config directory with .conf suffix + +slurm_cluster: +{% set _slurm_cluster = omnia_slurm_cluster | default([]) %} +{% if (_slurm_cluster | length) > 0 %} +{% for _cluster in _slurm_cluster %} + - cluster_name: "{{ _cluster.cluster_name | default('') }}" + nfs_storage_name: "{{ _cluster.nfs_storage_name | default('') }}" +{% if _cluster.config_sources is defined and (_cluster.config_sources | length > 0) %} + config_sources: +{% set _supported = ['slurm', 'cgroup', 'slurmdbd', 'gres'] %} +{% for _conf_name, _conf_val in _cluster.config_sources.items() %} +{% if _conf_name in _supported %} +{% if _conf_name == 'cgroup' and (_conf_val is mapping) %} + cgroup: + CgroupPlugin: {{ _conf_val.CgroupPlugin | default('autodetect') }} +{% for _k, _v in _conf_val.items() %} +{% if _k not in ['AllowedRAMSpace', 'CgroupPlugin', 'ConstrainCores', 'ConstrainDevices', 'ConstrainRAMSpace', 'ConstrainSwapSpace'] %} + {{ _k }}: {{ _v }} +{% endif %} +{% endfor %} + ConstrainCores: {{ _conf_val.ConstrainCores | default(true) }} + ConstrainDevices: {{ _conf_val.ConstrainDevices | default(true) }} + ConstrainRAMSpace: {{ _conf_val.ConstrainRAMSpace | default(true) }} + ConstrainSwapSpace: {{ _conf_val.ConstrainSwapSpace | default(true) }} +{% if 
_conf_val.AllowedRAMSpace is defined %} + ### AllowedRAMSpace: {{ _conf_val.AllowedRAMSpace }} This is not supported in 2.1, just attached for reference +{% endif %} +{% elif _conf_val is mapping %} + {{ _conf_name }}: +{% for _k, _v in _conf_val.items() %} + {{ _k }}: {{ _v }} +{% endfor %} +{% else %} + {{ _conf_name }}: {{ _conf_val }} +{% endif %} +{% endif %} +{% endfor %} + # OR + + # config_sources: + # slurm: /path/to/custom_slurm.conf + # cgroup: /path/to/custom_cgroup.conf + # slurmdbd: /path/to/custom_slurmdbd.conf + # gres: /path/to/custom_gres.conf +{% else %} + # config_sources: + # slurm: + # SlurmctldTimeout: 60 + # SlurmdTimeout: 150 + # cgroup: + # CgroupPlugin: autodetect + # ConstrainCores: True + # ConstrainDevices: True + # ConstrainRAMSpace: True + # ConstrainSwapSpace: True + + # OR + + # config_sources: + # slurm: /path/to/custom_slurm.conf + # cgroup: /path/to/custom_cgroup.conf + # slurmdbd: /path/to/custom_slurmdbd.conf + # gres: /path/to/custom_gres.conf +{% endif %} +{% endfor %} +{% endif %} + +# ----------------------------SERVICE K8S------------------------------------------------------ +# For service k8s cluster below parameters are required,(List) +# - cluster_name is required field + +# - deployment: Exactly one entry in both the service_k8s_cluster lists must have deployment set to true to indicate where Kubernetes should be deployed. +# Please ensure corresponding cluster entry is added to high_availability_config.yml if deployment is set to true. + +# - Kubernetes SDN network.K8s_cni (Mandatory) - It can either be "calico" or "flannel".Default value assigned is "calico". +# While setting up Kubernetes plugin for RoCE NIC, ensure that this value is set to "flannel" + +# - pod_external_ip_range: (Mandatory) These addresses will be used by Loadbalancer for assigning External IPs to K8s services +# Make sure the IP range is not assigned to any node in the cluster. 
+# Acceptable formats: "10.11.0.100-10.11.0.150" , "10.11.0.0/16" + +# - k8s_service_addresses: Kubernetes internal network for services.This network must be unused in your network infrastructure. +# Default value is "10.233.0.0/18" + +# - k8s_pod_network_cidr: Kubernetes pod network CIDR for internal network. When used, it will assign IP addresses from this range to individual pods. +# This network must be unused in your network infrastructure. +# Default value is "10.233.64.0/18" + +# nfs_storage_name : The nfs name should be same as one of the nfs name defined in storage_config.yml to configure the server. +# ----------------------------CSI Driver------------------------------------------------------ +# Following csi powerscale driver input variables are mandatory only if csi_driver_powerscale entry is present in software_config.json +# csi_powerscale_driver_secret_file_path: Absolute file path for the secret.yaml file. +# User need to download secret.yaml file and fill required data in secret file. Provided the path of the secret file here. +# File path for the values.yml file which will contain the Powerscale driver configuration parameters. +# csi_powerscale_driver_values_file_path: User need to download values.yaml file and fill required data in values.yaml file. +# Provided the path of the values.yaml file here. mention configurable values + +# - k8s_crio_storage_size: Specifies the disk size allocated for CRI-O container storage. +# This storage is used to store container images, writable layers, and runtime data. 
+# Acceptable formats: "10G", "15G", "50G" (Only positive values in Gigabytes are allowed) +# Default value is "20G" + + +service_k8s_cluster: +{% set _service_k8s_cluster = omnia_service_k8s_cluster | default([]) %} +{% if (_service_k8s_cluster | length) > 0 %} +{% for _cluster in _service_k8s_cluster %} + - cluster_name: "{{ _cluster.cluster_name | default('') }}" + deployment: {{ _cluster.deployment | default(false) | bool | ternary('true', 'false') }} + k8s_cni: "{{ _cluster.k8s_cni | default('calico') }}" + pod_external_ip_range: "{{ _cluster.pod_external_ip_range | default('') }}" + k8s_service_addresses: "{{ _cluster.k8s_service_addresses | default('') }}" + k8s_pod_network_cidr: "{{ _cluster.k8s_pod_network_cidr | default('') }}" + nfs_storage_name: "{{ _cluster.nfs_storage_name | default('') }}" + csi_powerscale_driver_secret_file_path: "{{ _cluster.csi_powerscale_driver_secret_file_path | default('') }}" + csi_powerscale_driver_values_file_path: "{{ _cluster.csi_powerscale_driver_values_file_path | default('') }}" + k8s_crio_storage_size: "{{ _cluster.k8s_crio_storage_size | default('20G') }}" +{% endfor %} +{% endif %} diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 new file mode 100644 index 0000000000..4b3b63d8c7 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 @@ -0,0 +1,48 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Provision credentials +provision_password: "{{ provision_password | default('') }}" +bmc_username: "{{ bmc_username | default('') }}" +bmc_password: "{{ bmc_password | default('') }}" + +# Prepare_oim credentials +minio_s3_password: "{{ minio_s3_password | default('') }}" +pulp_password: "{{ pulp_password | default('') }}" +docker_username: "{{ docker_username | default('') }}" +docker_password: "{{ docker_password | default('') }}" + +# Omnia credentials +slurm_db_password: "{{ slurm_db_password | default('') }}" + +# Security credentials +openldap_db_username: "{{ openldap_db_username | default('') }}" +openldap_db_password: "{{ openldap_db_password | default('') }}" + +# iDrac Telemetry credentials +mysqldb_user: "{{ mysqldb_user | default('') }}" +mysqldb_password: "{{ mysqldb_password | default('') }}" +mysqldb_root_password: "{{ mysqldb_root_password | default('') }}" + +# csi powerscale credentials +csi_username: "{{ csi_username | default('') }}" +csi_password: "{{ csi_password | default('') }}" + +# LDMS sampler +ldms_sampler_password: "{{ ldms_sampler_password | default('') }}" + +# postgres credentials +postgres_user: "{{ postgres_user | default('') }}" +postgres_password: "{{ postgres_password | default('') }}" diff --git a/upgrade/roles/import_input_parameters/templates/provision_config.j2 b/upgrade/roles/import_input_parameters/templates/provision_config.j2 new file mode 100644 index 0000000000..01fd84b2cf --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/provision_config.j2 @@ -0,0 +1,40 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + +#### Mandatory +# This depicts the path where the user has kept the PXE mapping file. +# The mapping file consists of the Service tag, Admin MAC, Hostname and its respective admin IP address and/or BMC IP. +# Ensure that the admin IPs given in the mapping file are within the network defined in network_spec.yml +# A template for the mapping file exists in omnia/examples, namely, pxe_mapping_file.csv +# Format of csv: FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP +pxe_mapping_file_path: "{{ provision_pxe_mapping_file_path }}" + +#### Mandatory +# Language that needs to be set during OS provisioning. +# The only supported language is "en_US.UTF-8" +language: "{{ provision_language }}" + +#### Mandatory +# Default lease time to be used by DHCP +# Unit: seconds +# Min: 21600 +# Default: 86400 +# Max: 31536000 +default_lease_time: "{{ provision_default_lease_time }}" diff --git a/upgrade/roles/import_input_parameters/templates/storage_config.j2 b/upgrade/roles/import_input_parameters/templates/storage_config.j2 new file mode 100644 index 0000000000..f6be3642c4 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/storage_config.j2 @@ -0,0 +1,78 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + +# -----------------------------Powervault------------------------------------------- +# powervault_config +# ip: ipv4 +# A list of PowerVault controller IP addresses used for iSCSI target discovery and login. +# In this configuration, a single controller portal is provided. + +# port: +# Defines the TCP port for the iSCSI target service. +# Port 3260 is the standard port for iSCSI communication. + +# isci_initiators: +# Specifies the InitiatorName used by the host when connecting to the iSCSI target. +# This IQN uniquely identifies the host to the storage array. + +# volume_id: +# This is the unique WWN/identifier for the +# specific volume that should be used for persistent storage. 
+# The script uses this value during multipath scanning to select the correct mapped device + +#powervault_config: +# ip: +# - 172.1.2.3 +# port: 3260 +# isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7 +# volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 + +# -----------------------------NFS------------------------------------------------ + +# This variable is used for mounting NFS share on slurm_control_node, slurm_node, login_node +# This takes a list of dicts with possible keys server_ip, server_share_path, client_share_path, client_mount_options +# In both the cases, the USER must manually update 'server_ip' and 'server_share_path' below with the correct values. +# If mount_option values are empty, NFS client will be mounted with these values "nosuid,rw,sync,hard,intr" +# Its mandatory to provide atleast one entry in nfs_client_params +# Example for single mount file system: +# nfs_client_params: +# nfs_name : str ,Name of the NFS storage resource. The default is "nfs_storage_default". +# The user can assign any custom string to specify a different NFS storage resource. 
+# - { server_ip: 10.5.0.101, server_share_path: "/mnt/share", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard"} +# Example for supporting multiple mount points: +# nfs_client_params: +# - { server_ip: 198.168.0.1, server_share_path: "/mnt/share1", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard"} +# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard"} +# Example for multiple mount file system: +# nfs_client_params: +# - { server_ip: 198.168.0.1, server_share_path: "/mnt/share1", client_share_path: "/mnt/mount1", client_mount_options: "nosuid,rw,sync,hard"} +# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard"} + +nfs_client_params: +{% set _nfs = storage_nfs_client_params | default([]) %} +{% for _entry in _nfs %} + - server_ip: "{{ _entry.server_ip | default('') }}" # Provide the IP of the NFS server + server_share_path: "{{ _entry.server_share_path | default('') }}" # Provide server share path of the NFS Server + client_share_path: "{{ _entry.client_share_path | default('') }}" + client_mount_options: "{{ _entry.client_mount_options | default('nosuid,rw,sync,hard,intr') }}" +{% if _entry.nfs_name is defined %} + nfs_name: "{{ _entry.nfs_name }}" +{% endif %} + +{% endfor %} diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 new file mode 100644 index 0000000000..cb89944e1c --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 @@ -0,0 +1,242 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + +# ============================================================================ +# TELEMETRY CONFIGURATION OVERVIEW +# ============================================================================ +# This file configures telemetry data collection and storage for Dell Omnia. +# +# SECTIONS: +# 1. iDRAC Telemetry : Hardware metrics from Dell PowerEdge servers +# 2. VictoriaMetrics : Time-series database for metric storage +# 3. Kafka : Distributed streaming platform for telemetry data +# 4. 
LDMS : Lightweight Distributed Metric Service for compute nodes +# +# ============================================================================ +# STORAGE REQUIREMENTS SUMMARY +# ============================================================================ +# +# VICTORIAMETRICS STORAGE: +# ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ +# │ Deployment Mode │ Per-Pod Storage │ Number of Pods │ Total Storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ Single-node │ persistence_size │ 1 pod │ 1× storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ Cluster │ persistence_size │ 3 vmstorage │ 3× storage │ +# └─────────────────┴──────────────────┴─────────────────┴──────────────────┘ +# Example: 8Gi per pod → Single-node: 8Gi total, Cluster: 24Gi total +# +# KAFKA STORAGE: +# ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ +# │ Component │ Per-Pod Storage │ Number of Pods │ Total Storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ Kafka Broker │ persistence_size │ 3 pods │ 3× storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ Kafka Controller│ persistence_size │ 3 pods │ 3× storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ TOTAL KAFKA │ persistence_size │ 6 pods │ 6× storage │ +# └─────────────────┴──────────────────┴─────────────────┴──────────────────┘ +# Example: 8Gi per pod → 48Gi total Kafka storage +# +# COMBINED STORAGE EXAMPLES: +# Default (8Gi each): VictoriaMetrics Cluster (24Gi) + Kafka (48Gi) = 72Gi total +# Single-node mode: VictoriaMetrics Single (8Gi) + Kafka (48Gi) = 56Gi total +# +# STORAGE OPTIONS: +# - VictoriaMetrics: Store iDRAC telemetry in time-series database +# - Kafka: Stream iDRAC and LDMS telemetry to Kafka topics +# - Both: Store iDRAC in both Victoria and Kafka (recommended) 
+# ============================================================================ + +# ============================================================================ +# iDRAC TELEMETRY CONFIGURATION +# ============================================================================ +# iDRAC telemetry collects hardware metrics from Dell PowerEdge servers. +# Telemetry data can be stored in VictoriaMetrics, Kafka, or both. + +# Enable or disable iDRAC telemetry support +# Accepted values: true or false +# Default: true +idrac_telemetry_support: {{ telemetry_idrac_telemetry_support | default(true) | bool | ternary('true', 'false') }} + +# Specify where to store iDRAC telemetry data +# Supported values: +# - "victoria" : Store in VictoriaMetrics only +# - "kafka" : Store in Kafka only +# - "victoria,kafka" : Store in both (recommended) +# Default: "victoria,kafka" +idrac_telemetry_collection_type: {{ telemetry_idrac_telemetry_collection_type | default('victoria,kafka') | to_json }} + +# ============================================================================ +# VICTORIAMETRICS CONFIGURATION +# ============================================================================ +# VictoriaMetrics is a time-series database for storing telemetry metrics. +# Used for iDRAC telemetry when 'victoria' is enabled in idrac_telemetry_collection_type. 
+# +# DEPLOYMENT MODES: +# - single-node: Simple deployment with one pod (suitable for small deployments) +# - cluster: High-availability deployment with multiple components +# (recommended for production and large-scale deployments) +victoria_configurations: + # VictoriaMetrics deployment mode + # Supported values: + # - "single-node" : Simple deployment (1 pod, suitable for dev/test) + # - "cluster" : High-availability deployment (7 pods, recommended for production) + # Default: "cluster" + # + # Cluster Mode Benefits: + # - High availability (no single point of failure) + # - Horizontal scalability (scale components independently) + # - Better performance (4x ingestion, 2x query speed) + # - Production-ready architecture + # + # Single-Node Benefits: + # - Simple setup (fewer resources) + # - Suitable for small deployments (<10 nodes) + # - Lower resource usage (~4Gi memory vs ~10Gi for cluster) + deployment_mode: {{ telemetry_victoria_deployment_mode | default('cluster') | to_json }} + + # The amount of storage allocated for EACH VictoriaMetrics persistent volume. + # IMPORTANT: Total VictoriaMetrics storage depends on deployment mode: + # - Single-node mode: Total storage = persistence_size × 1 pod + # - Cluster mode: Total storage = persistence_size × 3 vmstorage pods + # - Example (cluster): 8Gi × 3 = 24Gi total VictoriaMetrics storage + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: 8Gi (results in 24Gi total storage for cluster mode) + persistence_size: {{ telemetry_victoria_persistence_size | default('8Gi') | to_json }} + + # Duration (in hours) to retain victoria logs before they are deleted. 
+ # Default: 168 (7 days) + retention_period: {{ telemetry_victoria_retention_period | default(168) }} + +# ============================================================================ +# KAFKA CONFIGURATION +# ============================================================================ +# Apache Kafka is a distributed streaming platform for storing telemetry data. +# Used for iDRAC telemetry when 'kafka' is enabled in idrac_telemetry_collection_type. +# Also used for LDMS telemetry when LDMS software is configured. +# +# NOTE: Kafka topics are auto-generated based on enabled features: +# - 'idrac' topic: Required when idrac_telemetry_support=true and 'kafka' is enabled +# - 'ldms' topic: Required when LDMS is configured in software_config.json +kafka_configurations: + # The amount of storage allocated for EACH Kafka persistent volume. + # IMPORTANT: Total Kafka storage = persistence_size × 6 pods + # - 3 Kafka brokers (each gets persistence_size storage) + # - 3 Kafka controllers (each gets persistence_size storage) + # - Example: 8Gi × 6 = 48Gi total Kafka storage + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: 8Gi (results in 48Gi total Kafka storage) + persistence_size: {{ telemetry_kafka_persistence_size | default('8Gi') | to_json }} + + # The number of hours to retain Kafka logs before they are deleted. + # Default: 168 (7 days) + log_retention_hours: {{ telemetry_kafka_log_retention_hours | default(168) }} + + # The maximum size of Kafka logs (in bytes) before they are deleted. + # Default: -1 (unlimited) + log_retention_bytes: {{ telemetry_kafka_log_retention_bytes | default(-1) }} + + # The maximum size of Kafka log segments (in bytes) before they are deleted. 
+ # Default: 1073741824 (1 GB) + log_segment_bytes: {{ telemetry_kafka_log_segment_bytes | default(1073741824) }} + + # Kafka Topic Partitions Configuration + # ---------------------------------------------------------------------------- + # Define the number of partitions for each Kafka topic. + # Increasing partitions can improve throughput but also increases storage/overhead. + # + # IMPORTANT: Topic names are FIXED and cannot be changed. + # - Topic names: Only 'idrac' and 'ldms' are allowed + # - Configurable: Only partition counts can be modified + # + # Topic Requirements (auto-validated): + # - 'idrac': Required when idrac_telemetry_support=true and 'kafka' is enabled + # - 'ldms': Required when LDMS software is configured in software_config.json + # + # Default partition counts: idrac=1, ldms=2 + topic_partitions: +{% for _topic in (telemetry_kafka_topic_partitions | default([])) %} + - name: {{ _topic.name | default('') | to_json }} + partitions: {{ _topic.partitions | default(1) }} +{% endfor %} + +# ============================================================================ +# LDMS (Lightweight Distributed Metric Service) CONFIGURATION +# ============================================================================ +# LDMS collects performance metrics from compute nodes (CPU, memory, network, etc.) +# and streams them to Kafka for storage and analysis. +# +# PREREQUISITE: To enable LDMS support, add the following to software_config.json: +# { +# "softwares": [ +# {"name": "ldms", "arch": ["x86_64", "aarch64"]} +# ] +# } +# +# When LDMS software is configured, the 'ldms' topic MUST be defined in +# kafka_configurations.topic_partitions above. 
+# +# LDMS Port Configurations +# Aggregator port on service k8s cluster +# Valid range: 6001-6100 +# Default: 6001 +ldms_agg_port: {{ telemetry_ldms_agg_port | default(6001) }} + +# Store daemon port on service k8s cluster +# Can be the same as ldms_agg_port +# Valid range: 6001-6100 +# Default: 6001 +ldms_store_port: {{ telemetry_ldms_store_port | default(6001) }} + +# Sampler port on compute nodes +# Valid range: 10001-10100 +# Default: 10001 +ldms_sampler_port: {{ telemetry_ldms_sampler_port | default(10001) }} + +# LDMS Sampler Plugin Configurations +# ---------------------------------------------------------------------------- +# Configure which metrics to collect from compute nodes and collection intervals. +# Each plugin collects specific system metrics. +# +# Parameters: +# - plugin_name: Name of the LDMS sampler plugin +# - config_parameters: Plugin-specific configuration (as a single string) +# - activation_parameters: Collection schedule in MICROSECONDS +# Format: "interval=MICROSECONDS offset=MICROSECONDS" (offset is optional) +# Example: "interval=1000000" (1000000 microseconds = 1 second) +# "interval=1000000 offset=0" (1000000 microseconds with no offset) +# +# Available Plugins: +# - meminfo: Memory usage statistics +# - procstat2: Process statistics +# - vmstat: Virtual memory statistics +# - loadavg: System load average +# - procnetdev2: Network interface statistics +ldms_sampler_configurations: +{% if telemetry_ldms_sampler_configurations is none %} + null +{% else %} +{% for _plugin in (telemetry_ldms_sampler_configurations | default([])) %} + - plugin_name: {{ _plugin.plugin_name | default('') | to_json }} + config_parameters: {{ _plugin.config_parameters | default('') | to_json }} + activation_parameters: {{ _plugin.activation_parameters | default('interval=1000000') | to_json }} +{% endfor %} +{% endif %} diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 
f4c5b1b7cb..ebaa33e492 100644 --- 
a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -12,3 +12,225 @@ # See the License for the specific language governing permissions and # limitations under the License. --- + +# backup_location will be set from oim_metadata.yml upgrade_backup_dir +# Format: /opt/omnia/backups/upgrade/version_2.0.0.0/input/project_default +# Set dynamically from metadata, no static variable needed + +# Path to oim_metadata.yml +oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" + +backup_dir_mode: '0755' +default_file_mode: '0644' + +# List to collect warnings during execution +upgrade_warnings: [] + +# Precheck backup location messages +msg_backup_location_missing: "backup_location must be provided" +msg_upgrade_backup_dir_missing: "upgrade_backup_dir not found in /opt/omnia/.data/oim_metadata.yml" + +# Restore input files messages +msg_restore_item_name_missing: "restore_item must define 'name'" +msg_validation_failed: "Validation failed for {{ restore_item.name }}" +msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" +msg_user_registry_credential_missing: |- + WARNING: user_registry_credential.yml not found in backup at + {{ backup_location }}/user_registry_credential.yml + This might be due to complete Omnia execution not being completed. + Skipping restoration of this file. + +# Omnia config credentials messages +msg_omnia_config_credentials_missing: |- + WARNING: omnia_config_credentials.yml not found in backup at + {{ backup_location }}/omnia_config_credentials.yml. + This might be due to complete Omnia execution not being completed. + Skipping restoration of this file. + +msg_omnia_config_credentials_info_missing: |- + INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key + are not present in backup. This is expected if credentials + were not configured in the source installation. 
+ +msg_omnia_config_credentials_success: |- + omnia_config_credentials.yml restored and updated from backup. + Backup: {{ backup_location }}/omnia_config_credentials.yml + Target: {{ input_project_dir }}/omnia_config_credentials.yml + Status: Updated with postgres credentials and re-encrypted (key file also restored) + +msg_omnia_config_credentials_error: |- + ERROR: Inconsistent state detected for omnia_config_credentials.yml: + {% if not backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %} + - File is encrypted but key file (.omnia_config_credentials_key) is missing + {% elif backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %} + - Key file exists but file is not encrypted + {% endif %} + Please check the backup integrity and ensure both files are present + in consistent states. + +# Rescue warning messages +msg_user_registry_decrypt_error: |- + ERROR: Failed to decrypt user_registry_credential.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file + matches the encrypted file. + +msg_omnia_config_decrypt_error: |- + ERROR: Failed to decrypt omnia_config_credentials.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file + matches the encrypted file. + +msg_omnia_config_template_error: |- + ERROR: Failed to generate updated omnia_config_credentials.yml. + Template processing may have failed due to invalid data format. + Please check the backup file format and ensure it contains valid YAML. + +msg_omnia_config_encrypt_error: |- + ERROR: Failed to encrypt updated omnia_config_credentials.yml. 
+ The key file may be corrupted or there may be permission issues. + Please check the key file integrity and file permissions. + +msg_decryption_failed: "Decryption failed. Check warnings for details." +msg_template_failed: "Template processing failed. Check warnings for details." +msg_encryption_failed: "Encryption failed. Check warnings for details." + +# Network spec transformation messages +msg_backup_network_spec_missing: "Backup network_spec.yml missing" +msg_network_spec_missing: "network_spec.yml missing" +msg_network_spec_already_21: "network_spec.yml already in 2.1 format - overwriting" +msg_yaml_validation_failed: "YAML validation failed" +msg_ib_netmask_mismatch: "ib_network.netmask_bits must match admin_network.netmask_bits" +msg_ib_network_missing: "ib_network is mandatory" +msg_ib_subnet_missing: "ib_network.subnet is mandatory" +msg_using_backup_network_spec: "Using backup network_spec.yml (backup not modified)" + +# High availability config transformation messages +msg_backup_ha_config_missing: "Backup high_availability_config.yml missing" +msg_ha_config_missing: "high_availability_config.yml missing" +msg_ha_config_already_21: "high_availability_config.yml already in 2.1 format - overwriting" +msg_ha_virtual_ip_missing: "service_k8s_cluster_ha.virtual_ip_address is mandatory" +msg_using_backup_ha_config: "Using backup high_availability_config.yml (backup not modified)" + +# Local repo config transformation messages +msg_backup_local_repo_config_missing: "Backup local_repo_config.yml missing" +msg_local_repo_config_missing: "local_repo_config.yml missing" +msg_using_backup_local_repo_config: "Using backup local_repo_config.yml (backup not modified)" +msg_omnia_repo_url_rhel_x86_64_missing: "omnia_repo_url_rhel_x86_64 is mandatory" +msg_omnia_repo_url_rhel_aarch64_missing: "omnia_repo_url_rhel_aarch64 is mandatory" + +# Provision config transformation messages +msg_backup_provision_config_missing: "Backup provision_config.yml missing" 
+msg_provision_config_missing: "provision_config.yml missing" +msg_using_backup_provision_config: "Using backup provision_config.yml (backup not modified)" +msg_pxe_mapping_file_path_missing: "pxe_mapping_file_path is mandatory" + +# Storage config transformation messages +msg_backup_storage_config_missing: "storage_config.yml not found in backup at {{ backup_location }}/storage_config.yml" +msg_storage_config_missing: "storage_config.yml not found at {{ input_project_dir }}/storage_config.yml" +msg_nfs_client_params_missing: "storage_config.yml must define nfs_client_params with at least one entry" +msg_nfs_client_param_entry_missing_keys: "Each nfs_client_params entry must define server_ip, server_share_path, and client_share_path" +msg_using_backup_storage_config: "Transforming storage_config.yml from backup at {{ backup_location }}/storage_config.yml" + +# Omnia config transformation messages +msg_backup_omnia_config_missing: "Backup omnia_config.yml missing" +msg_omnia_config_missing: "omnia_config.yml missing" +msg_using_backup_omnia_config: "Using backup omnia_config.yml (backup not modified)" +msg_slurm_cluster_missing: "slurm_cluster is mandatory" +msg_service_k8s_cluster_missing: "service_k8s_cluster is mandatory" + +# Telemetry config transformation messages +msg_backup_telemetry_config_missing: "Backup telemetry_config.yml missing" +msg_telemetry_config_missing: "telemetry_config.yml missing" +msg_using_backup_telemetry_config: "Using backup telemetry_config.yml (backup not modified)" + +### Restore summary messages +msg_restore_summary: | + {{ restore_item.name }} restored from backup. + Backup: {{ backup_location }}/{{ restore_item.name }} + Target: {{ input_project_dir }}/{{ restore_item.name }} + +# Restore summary message for network spec transformation +msg_network_spec_transform_summary: | + network_spec.yml upgraded to 2.1 format. 
+ Backup preserved at: {{ backup_location }}/network_spec.yml + Changes: + - Added mandatory ib_network + - Made primary_oim_bmc_ip optional + - Aligned ib_network.netmask_bits with admin_network.netmask_bits + +# Restore summary message for high availability config transformation +msg_ha_config_transform_summary: | + high_availability_config.yml upgraded to 2.1 format. + Backup preserved at: {{ backup_location }}/high_availability_config.yml + Changes: + - Ensured service_k8s_cluster_ha is a list + - Ensured virtual_ip_address is present + +# Restore summary message for local repo config transformation +msg_local_repo_config_transform_summary: | + local_repo_config.yml upgraded to 2.1 format. + Backup preserved at: {{ backup_location }}/local_repo_config.yml + Changes: + - Normalized repo URL keys to arch-specific schema + - Migrated omnia_registry to user_registry (when present) + - Ensured mandatory omnia_repo_url_rhel_* keys are present + +# Restore summary message for provision config transformation +msg_provision_config_transform_summary: | + provision_config.yml upgraded to 2.1 format. + Backup preserved at: {{ backup_location }}/provision_config.yml + Changes: + - Ensured pxe_mapping_file_path, language, and default_lease_time are present + +# Restore summary message for storage config transformation +msg_storage_config_transform_summary: | + storage_config.yml upgraded to 2.1 format. + Backup preserved at: {{ backup_location }}/storage_config.yml + Changes: + - Ensured nfs_client_params is present and entries contain required keys + +# Restore summary message for omnia config transformation +msg_omnia_config_transform_summary: | + omnia_config.yml upgraded to 2.1 format. 
+ Backup preserved at: {{ backup_location }}/omnia_config.yml + Changes: + - Ensured slurm_cluster and service_k8s_cluster are lists + - Ensured required sections are present + +# Restore summary message for telemetry config transformation +msg_telemetry_config_transform_summary: | + telemetry_config.yml upgraded to 2.1 format. + Backup preserved at: {{ backup_location }}/telemetry_config.yml + Changes: + - Rendered Omnia 2.1 telemetry template with values from 2.0 backup + - Applied schema defaults for missing fields + +# === Input files to restore from backup === +# Add input files here that should be copied from backup_location to input_project_dir +# Each entry should have: +# - name: filename (required) +# - mode: file permissions (optional, defaults to default_file_mode) +# - validate_cmd: validation command (optional, runs after restore) +# +# Examples of files to add: +# - Static configuration files that don't need transformation +# - Files that are the same format in 2.0 and 2.1 +# - Files where you want to preserve the backup values exactly +# +# DO NOT add files that require transformation (network_spec.yml, high_availability_config.yml, local_repo_config.yml, +# provision_config.yml, user_registry_credential.yml) +restore_input_files: + - name: software_config.json + mode: '0644' + validate_cmd: "python3 -m json.tool '{{ input_project_dir }}/software_config.json'" + - name: security_config.yml + mode: '0644' + validate_cmd: "python3 -c \"import yaml; yaml.safe_load(open('{{ input_project_dir }}/security_config.yml','r'))\"" + - name: pxe_mapping_file.csv + mode: '0644' + validate_cmd: "" diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 196366870b..90b25611b5 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -13,6 +13,55 @@ # limitations under the License. 
--- -- name: Include import input parameters - ansible.builtin.include_role: - name: import_input_parameters + +- name: Display cluster reprovision guidance + ansible.builtin.pause: + prompt: "{{ '\x1b[32m' }}=================================================== + CLUSTER REPROVISION REQUIRED + =========================================================== + + Cluster reprovisioning is required after upgrade to enable new features. + + Review and update new 2.1 input fields present at /opt/omnia/input/project_default/ directory before reprovisioning: + + 1. local_repo_config.yml + + - Set additional_repos_x86_64 (list of extra repo URLs or file paths for x86_64) + + - Set additional_repos_aarch64 (list of extra repo URLs or file paths for aarch64) + + 2. network_spec.yml (ib_network section) + + - Define InfiniBand fabric settings (subnet manager/BMC, IP ranges, VLAN if applicable) + + - Ensure host IB interfaces map to the IB network entries + + 3. omnia_config.yml (slurm_cluster.config_source) + + - Use the new structure: config_source: { type: , location: } + + - Populate location to point to your Slurm config bundle (local path or remote URL) + + Do NFS cleanup (if NFS share is used for k8s/slurm) + + - Clean stale mounts and ensure the NFS share is accessible before reprovision + + - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment + + + Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster: + + 1. ansible-playbook local_repo/local_repo.yml + + 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml + + 3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64: + + ansible-playbook build_image_aarch64/build_image_aarch64.yml + + 4. ansible-playbook discovery/discovery.yml + + Please follow the omnia documentation for steps in more detail. 
+ + {{ '\x1b[0m' }}" + seconds: 1
diff --git a/upgrade/rollback_omnia.yml b/upgrade/rollback_omnia.yml
new file mode 100644
index 0000000000..c0d5080c22
--- /dev/null
+++ b/upgrade/rollback_omnia.yml
@@ -0,0 +1,58 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Prints rollback instructions only; the message tells the operator to run ./omnia.sh --rollback.
+- name: Rollback Omnia guidance
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  vars:
+    oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml"
+  tasks:
+    # Best effort: a read failure is ignored and the guidance falls back to generic wording.
+    - name: Read oim_metadata.yml for backup details
+      ansible.builtin.slurp:
+        src: "{{ oim_metadata_path }}"
+      register: oim_metadata_slurp
+      ignore_errors: true
+
+    - name: Parse oim_metadata.yml
+      ansible.builtin.set_fact:
+        oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}"
+      when: oim_metadata_slurp is defined and oim_metadata_slurp.content is defined
+
+    # regex_search with a group argument yields a list of captures (nothing on no match),
+    # so default to a one-element list and take the first item to get a plain string.
+    - name: Derive backup_version from upgrade_backup_dir
+      ansible.builtin.set_fact:
+        backup_version: "{{ (oim_metadata.upgrade_backup_dir | regex_search('version_([^/]+)', '\\1'))
+          | default(['previous version'], true) | first }}"
+      when: oim_metadata is defined and oim_metadata.upgrade_backup_dir is defined
+
+    - name: Display rollback guidance (green)
+      ansible.builtin.debug:
+        msg:
+          - "================================="
+          - " OMNIA ROLLBACK"
+          - "================================="
+          - ""
+          - "[Rollback Actions]"
+          - "1. Purpose: restore Omnia core to the last backup version (includes configs and container state)."
+          - "2. Target version: {{ backup_version | default('previous version from the backup location') }}."
+          - "3. How to run:"
+          - " - Exit the Omnia core container shell if you are inside it."
+          - " - From the OIM host prompt, execute: ./omnia.sh --rollback"
+          - "4. Note: ensure the backup location is accessible on the OIM host before running rollback."
+    - name: End play
+      ansible.builtin.meta: end_play
diff --git a/upgrade/upgrade_oim.yml b/upgrade/upgrade_oim.yml index 3e91f1a479..aa6e6fb5fc 100644 --- a/upgrade/upgrade_oim.yml +++ b/upgrade/upgrade_oim.yml @@ -17,4 +17,5 @@ hosts: localhost connection: local roles: + - role: ../utils/roles/include_input_dir - role: upgrade_oim diff --git a/upgrade/upgrade_omnia.yml b/upgrade/upgrade_omnia.yml index 61050ec244..ade6b1f173 100644 --- a/upgrade/upgrade_omnia.yml +++ b/upgrade/upgrade_omnia.yml @@ -18,3 +18,13 @@ - name: Upgrade cluster tasks ansible.builtin.import_playbook: upgrade_cluster.yml + +- name: Clear upgrade guard lock + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Remove upgrade guard lock + ansible.builtin.file: + path: /opt/omnia/.data/upgrade_in_progress.lock + state: absent diff --git a/utils/credential_utility/get_config_credentials.yml b/utils/credential_utility/get_config_credentials.yml index 0e4c323b94..b77ba14b9b 100644 --- a/utils/credential_utility/get_config_credentials.yml +++ b/utils/credential_utility/get_config_credentials.yml @@ -13,6 +13,10 @@ # limitations under the License.
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../include_input_dir.yml diff --git a/utils/external_kafka_connect_details.yml b/utils/external_kafka_connect_details.yml new file mode 100644 index 0000000000..a55c54ad3b --- /dev/null +++ b/utils/external_kafka_connect_details.yml @@ -0,0 +1,64 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+---
+
+- name: Preflight - validate inventory
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+    - name: Load Kafka utility role variables
+      ansible.builtin.include_vars:
+        file: "{{ playbook_dir }}/roles/external_kafka_connect_details/vars/main.yml"
+
+    - name: Include input directory
+      ansible.builtin.include_role:
+        name: include_input_dir
+
+    - name: Set HA config path
+      ansible.builtin.set_fact:
+        k8s_ha_config_path: "{{ input_project_dir }}/high_availability_config.yml"
+
+    - name: Load High Availability config
+      ansible.builtin.include_vars:
+        file: "{{ k8s_ha_config_path }}"
+        name: ha_config
+      ignore_errors: true  # failed_when: false would reset .failed and hide the error from the next task
+      register: ha_config_load
+
+    - name: Fail when High Availability config cannot be loaded
+      ansible.builtin.fail:
+        msg: "{{ kafka_preflight_err_ha_config_missing }}"
+      when: ha_config_load is failed
+
+    - name: Set service kube control plane VIP from HA config
+      ansible.builtin.set_fact:
+        kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}"
+
+    - name: Fail when service kube control plane VIP is not available
+      ansible.builtin.fail:
+        msg: "{{ kafka_preflight_err_ha_vip_missing }}"
+      when: (kube_vip | trim | length) == 0
+
+    - name: Create service_kube_control_plane group from VIP
+      ansible.builtin.add_host:
+        name: "{{ kube_vip }}"
+        groups: service_kube_control_plane  # in-memory group targeted by the play below
+
+- name: Fetch external Kafka connection details
+  hosts: service_kube_control_plane
+  connection: ssh
+  gather_facts: false
+  roles:
+    - external_kafka_connect_details
diff --git a/utils/external_victoria_connect_details.yml b/utils/external_victoria_connect_details.yml new file mode 100644 index 0000000000..23e388baf6 --- /dev/null +++ b/utils/external_victoria_connect_details.yml @@ -0,0 +1,64 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Preflight - validate inventory
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+    - name: Load Victoria utility role variables
+      ansible.builtin.include_vars:
+        file: "{{ playbook_dir }}/roles/external_victoria_connect_details/vars/main.yml"
+
+    - name: Include input directory
+      ansible.builtin.include_role:
+        name: include_input_dir
+
+    - name: Set HA config path
+      ansible.builtin.set_fact:
+        k8s_ha_config_path: "{{ input_project_dir }}/high_availability_config.yml"
+
+    - name: Load High Availability config
+      ansible.builtin.include_vars:
+        file: "{{ k8s_ha_config_path }}"
+        name: ha_config
+      ignore_errors: true  # failed_when: false would reset .failed and hide the error from the next task
+      register: ha_config_load
+
+    - name: Fail when High Availability config cannot be loaded
+      ansible.builtin.fail:
+        msg: "{{ victoria_preflight_err_ha_config_missing }}"
+      when: ha_config_load is failed
+
+    - name: Set service kube control plane VIP from HA config
+      ansible.builtin.set_fact:
+        kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}"
+
+    - name: Fail when service kube control plane VIP is not available
+      ansible.builtin.fail:
+        msg: "{{ victoria_preflight_err_ha_vip_missing }}"
+      when: (kube_vip | trim | length) == 0
+
+    - name: Create service_kube_control_plane group from VIP
+      ansible.builtin.add_host:
+        name: "{{ kube_vip }}"
+        groups: service_kube_control_plane  # in-memory group targeted by the play below
+
+- name: Fetch external Victoria connection details
+  hosts: service_kube_control_plane
+  connection: ssh
+  gather_facts: false
+  roles:
+    - external_victoria_connect_details
diff --git
a/utils/oim_cleanup.yml b/utils/oim_cleanup.yml index edb9cfb207..4d959d5ea4 100644 --- a/utils/oim_cleanup.yml +++ b/utils/oim_cleanup.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: include_input_dir.yml diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml new file mode 100644 index 0000000000..3ee17c1c80 --- /dev/null +++ b/utils/roles/external_kafka_connect_details/tasks/main.yml @@ -0,0 +1,223 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- + +- name: Validate service k8s controller connectivity + block: + - name: Wait for service k8s controller connection + ansible.builtin.wait_for_connection: + timeout: 30 + rescue: + - name: Fail when service k8s controller is not reachable + ansible.builtin.fail: + msg: "{{ kafka_preflight_err_service_k8s_controller_unreachable }}" + +- name: Check kubectl presence + ansible.builtin.command: kubectl version --client=true + register: kubectl_check + changed_when: false + failed_when: kubectl_check.rc != 0 + +- name: Delete Kafka output directory (clean start) + ansible.builtin.file: + path: "{{ kafka_output_dir }}" + state: absent + delegate_to: localhost + connection: local + run_once: true + +- name: Get Kafka pod status + ansible.builtin.command: >- + kubectl get pods -n {{ kafka_namespace }} + -l app.kubernetes.io/name=kafka + -o wide + register: kafka_pods + changed_when: false + failed_when: false + +- name: Get Kafka pod status (json) + ansible.builtin.command: >- + kubectl get pods -n {{ kafka_namespace }} + -l app.kubernetes.io/name=kafka + -o json + register: kafka_pods_json + changed_when: false + failed_when: kafka_pods_json.rc != 0 + +- name: Parse Kafka pods + ansible.builtin.set_fact: + kafka_pods_parsed: "{{ kafka_pods_json.stdout | from_json }}" + +- name: Fail if no Kafka pods found + ansible.builtin.fail: + msg: "{{ kafka_err_no_pods_found }}" + when: (kafka_pods_parsed.get('items', []) | length) == 0 + +- name: Fail if Kafka pods are not Running + ansible.builtin.fail: + msg: "{{ kafka_err_pods_not_running }}" + when: + - (kafka_pods_parsed.get('items', []) + | selectattr('status.phase', 'ne', 'Running') + | list + | length) > 0 + +- name: Fail if Kafka pods are not Ready + ansible.builtin.fail: + msg: "{{ kafka_err_pods_not_ready }}" + when: + - (kafka_pods_parsed.get('items', []) + | selectattr('status.containerStatuses', 'defined') + | map(attribute='status.containerStatuses') + | list + | flatten + | selectattr('ready', 'equalto',
false) + | list + | length) > 0 + +- name: Get Kafka LoadBalancer IP + ansible.builtin.command: >- + kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }} + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' + register: kafka_lb_ip + changed_when: false + failed_when: kafka_lb_ip.rc != 0 + +- name: Set Kafka external endpoint + ansible.builtin.set_fact: + kafka_external_ip: "{{ kafka_lb_ip.stdout | trim }}" + kafka_external_port: "{{ kafka_bootstrap_port | string }}" + +- name: Fail when Kafka external endpoint is not available + ansible.builtin.fail: + msg: "{{ kafka_err_external_ip_missing }}" + when: kafka_external_ip | trim | length == 0 + +- name: Ensure output directory exists + ansible.builtin.file: + path: "{{ kafka_output_dir }}" + state: directory + mode: "0755" + delegate_to: localhost + connection: local + run_once: true + +- name: Read Kafka cluster CA cert from secret + ansible.builtin.command: >- + kubectl get secret {{ kafka_cluster_ca_secret }} -n {{ kafka_namespace }} + -o jsonpath='{.data.ca\.crt}' + register: kafka_ca_crt_b64 + changed_when: false + failed_when: kafka_ca_crt_b64.rc != 0 or (kafka_ca_crt_b64.stdout | trim | length == 0) + +- name: Read Kafka client cert from secret + ansible.builtin.command: >- + kubectl get secret {{ kafka_client_secret }} -n {{ kafka_namespace }} + -o jsonpath='{.data.user\.crt}' + register: kafka_user_crt_b64 + changed_when: false + failed_when: kafka_user_crt_b64.rc != 0 or (kafka_user_crt_b64.stdout | trim | length == 0)
+
+# stdout of this task is the base64 client private key; keep it out of console and log output.
+- name: Read Kafka client key from secret
+  ansible.builtin.command: >-
+    kubectl get secret {{ kafka_client_secret }} -n {{ kafka_namespace }}
+    -o jsonpath='{.data.user\.key}'
+  register: kafka_user_key_b64
+  changed_when: false
+  failed_when: kafka_user_key_b64.rc != 0 or (kafka_user_key_b64.stdout | trim | length == 0)
+  no_log: true
+
+# Each loop item carries decoded certificate/key content, which is echoed per item unless suppressed.
+- name: Write Kafka CA/cert/key files
+  ansible.builtin.copy:
+    content: "{{ item.content }}"
+    dest: "{{ item.dest }}"
+    mode: "0600"
+  loop:
+    - dest: "{{ kafka_output_dir }}/ca.crt"
+      content: "{{ kafka_ca_crt_b64.stdout | b64decode }}"
+    - dest: "{{ kafka_output_dir }}/user.crt"
+      content: "{{ kafka_user_crt_b64.stdout | b64decode }}"
+    - dest: "{{ kafka_output_dir }}/user.key"
+      content: "{{ kafka_user_key_b64.stdout | b64decode }}"
+  delegate_to: localhost
+  connection: local
+  run_once: true
+  no_log: true
+
+- name: Build Kafka connection details + ansible.builtin.set_fact: + kafka_connect_details: + kafka: + namespace: "{{ kafka_namespace }}" + loadbalancer_service: "{{ kafka_lb_service_name }}" + pod_status: "{{ kafka_pods.stdout | default('') }}" + bootstrap_server: "{{ kafka_external_ip }}:{{ kafka_external_port }}" + tls: + ca_crt: "{{ kafka_output_dir }}/ca.crt" + client_crt: "{{ kafka_output_dir }}/user.crt" + client_key: "{{ kafka_output_dir }}/user.key" + +- name: Ensure output file directory exists + ansible.builtin.file: + path: "{{ kafka_output_file | dirname }}" + state: directory + mode: "0755" + delegate_to: localhost + connection: local + run_once: true + +- name: Write Kafka connection details to file + ansible.builtin.copy: + content: "{{ kafka_connect_details | to_nice_yaml }}" + dest: "{{ kafka_output_file }}" + mode: "0644" + delegate_to: localhost + connection: local + run_once: true + +- name: Display Kafka connection details + ansible.builtin.debug: + msg: >- + {{ + [ + 'Kafka connection details written to: ' ~ kafka_output_file, + '', + '[IMPORTANT] Kafka external endpoint: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port, + '', + '[IMPORTANT] TLS files (on OIM host):', + ' CA (server certificate for OME): ' ~ kafka_output_dir ~ '/ca.crt', + ' client cert: ' ~ kafka_output_dir ~ '/user.crt', + ' client key: ' ~ kafka_output_dir ~ '/user.key', + '', + 'OME steps (mTLS):', + ' [STEP 1] Create client certificate in .pfx format (passphrase required):', + ' cd ' ~ kafka_output_dir, + ' openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt', + ' [STEP 2] ' ~
kafka_ome_cross_machine_note_line1, + ' ' ~ kafka_ome_cross_machine_note_line2, + ' [STEP 3] In the OME UI, navigate to:', + ' ' ~ kafka_ome_ui_navigation_line1, + ' [STEP 4] Click: ' ~ kafka_ome_ui_enable_label, + ' [STEP 5] Set Kafka Bootstrap Server to: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port, + ' [STEP 6] Set Authentication Mode to: ' ~ kafka_ome_auth_mode_value, + ' [STEP 7] ' ~ kafka_ome_server_cert_note, + ' [STEP 8] ' ~ kafka_ome_client_cert_note, + '' + ] + }} + delegate_to: localhost + connection: local + run_once: true diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml new file mode 100644 index 0000000000..be23cde089 --- /dev/null +++ b/utils/roles/external_kafka_connect_details/vars/main.yml @@ -0,0 +1,53 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +kafka_namespace: "telemetry" +kafka_lb_service_name: "kafka-kafka-external-bootstrap" +kafka_bootstrap_port: 9094 +kafka_cluster_ca_secret: "kafka-cluster-ca-cert" +kafka_client_secret: "kafkapump" +kafka_output_dir: "/opt/omnia/telemetry/external_kafka" +kafka_output_file: "/opt/omnia/telemetry/external_kafka_connect_details.yml" + +kafka_err_no_pods_found: "No Kafka pods found in namespace '{{ kafka_namespace }}'." +kafka_err_pods_not_running: "One or more Kafka pods are not in Running state." 
+kafka_err_pods_not_ready: "One or more Kafka pods are not Ready." + +kafka_err_external_ip_missing: >- + Failed to fetch Kafka LoadBalancer external IP. Ensure service '{{ kafka_lb_service_name }}' + exists in namespace '{{ kafka_namespace }}' and has an external IP assigned. + +kafka_preflight_err_ha_config_missing: >- + Failed to load High Availability config file: {{ k8s_ha_config_path }}. + Provide a valid HA config so the service Kubernetes VIP can be used. + +kafka_preflight_err_ha_vip_missing: >- + Failed to determine the service Kubernetes control plane VIP from High Availability config. + Ensure service_k8s_cluster_ha[0].virtual_ip_address is set in: {{ k8s_ha_config_path }}. + +kafka_preflight_err_service_k8s_controller_unreachable: >- + Service Kubernetes controller is not reachable over SSH: {{ ansible_host | default(inventory_hostname) }}. + Ensure the service Kubernetes VIP is reachable and resolvable from the OIM host. + +kafka_ome_ui_navigation_line1: "Configuration -> Remote Connectivity" +kafka_ome_ui_enable_label: "Enable Kafka Connectivity" +kafka_ome_auth_mode_value: "SSL" + +kafka_ome_server_cert_note: "Upload ca.crt as the server certificate in OME." +kafka_ome_client_cert_note: "Upload user.pfx as the client certificate in OME (mTLS)." +kafka_ome_cross_machine_note_line1: >- + If OME UI is accessed from a different system than the OIM host, +kafka_ome_cross_machine_note_line2: >- + copy ca.crt and user.pfx to that system before uploading them in the UI. diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml new file mode 100644 index 0000000000..260c8376fd --- /dev/null +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -0,0 +1,325 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Validate service k8s controller connectivity  # rescue replaces the raw connection error with the role's message
+  block:
+    - name: Wait for service k8s controller connection
+      ansible.builtin.wait_for_connection:
+        timeout: 30
+  rescue:
+    - name: Fail when service k8s controller is not reachable
+      ansible.builtin.fail:
+        msg: "{{ victoria_preflight_err_service_k8s_controller_unreachable }}"
+
+- name: Check kubectl presence
+  ansible.builtin.command: kubectl version --client=true
+  register: kubectl_check
+  changed_when: false
+  failed_when: kubectl_check.rc != 0
+
+- name: Check for Victoria cluster services
+  ansible.builtin.command: >-
+    kubectl get svc {{ item }} -n {{ victoria_namespace }} -o name
+  loop:
+    - vminsert
+    - vmselect
+  register: victoria_cluster_svcs
+  changed_when: false
+  failed_when: false  # a missing service is a probe result here; rc is inspected below
+
+- name: Check for Victoria single-node service
+  ansible.builtin.command: >-
+    kubectl get svc victoria-loadbalancer -n {{ victoria_namespace }} -o name
+  register: victoria_single_svc
+  changed_when: false
+  failed_when: false  # same probe semantics as the cluster check above
+
+- name: Set Victoria deployment mode  # 'cluster' needs both vminsert and vmselect to have answered with rc 0
+  ansible.builtin.set_fact:
+    victoria_deployment_mode: >-
+      {{
+        'cluster'
+        if (victoria_cluster_svcs.results | selectattr('rc', 'equalto', 0) | list | length) == 2
+        else ('single-node' if victoria_single_svc.rc == 0 else 'unknown')
+      }}
+
+- name: Fail if Victoria cluster mode is not deployed
+  ansible.builtin.fail:
+    msg: "{{ victoria_err_mode_not_supported }}"
+  when: victoria_deployment_mode != 'cluster'
+
+- name: Get Victoria pods status  # human-readable table, stored verbatim as pod_status in the output file
+  ansible.builtin.command: >-
+    kubectl get pods -n {{ victoria_namespace }}
+    -l "app in (vminsert,vmselect,vmstorage,victoriametrics)"
+    -o wide
+  register: victoria_pods_wide
+  changed_when: false
+  failed_when: victoria_pods_wide.rc != 0
+
+- name: Get Victoria pods status (json)  # same selector, machine-readable form for the health checks
+  ansible.builtin.command: >-
+    kubectl get pods -n {{ victoria_namespace }}
+    -l "app in (vminsert,vmselect,vmstorage,victoriametrics)"
+    -o json
+  register: victoria_pods_json
+  changed_when: false
+  failed_when: victoria_pods_json.rc != 0
+
+- name: Parse Victoria pods
+  ansible.builtin.set_fact:
+    victoria_pods_parsed: "{{ victoria_pods_json.stdout | from_json }}"
+
+- name: Fail if no Victoria pods found
+  ansible.builtin.fail:
+    msg: "{{ victoria_err_no_pods_found }}"
+  when: (victoria_pods_parsed.get('items', []) | length) == 0
+
+- name: Fail if Victoria pods are not Running
+  ansible.builtin.fail:
+    msg: "{{ victoria_err_pods_not_running }}"
+  when:
+    - (victoria_pods_parsed.get('items', [])
+      | selectattr('status.phase', 'ne', 'Running')
+      | list
+      | length) > 0
+
+- name: Fail if Victoria pods are not Ready  # any container reporting ready=false fails the play
+  ansible.builtin.fail:
+    msg: "{{ victoria_err_pods_not_ready }}"
+  when:
+    - (victoria_pods_parsed.get('items', [])
+      | selectattr('status.containerStatuses', 'defined')
+      | map(attribute='status.containerStatuses')
+      | list
+      | flatten
+      | selectattr('ready', 'equalto', false)
+      | list
+      | length) > 0
+
+- name: Get vminsert service LoadBalancer IP
+  ansible.builtin.command: >-
+    kubectl get svc vminsert -n {{ victoria_namespace }}
+    -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+  register: vminsert_lb_ip
+  changed_when: false
+  failed_when: vminsert_lb_ip.rc != 0
+
+- name: Get vminsert service LoadBalancer hostname
+  ansible.builtin.command: >-
+    kubectl get svc vminsert -n {{ victoria_namespace }}
+    -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'
+  register: vminsert_lb_hostname
+  changed_when: false
+  failed_when: vminsert_lb_hostname.rc != 0
+
+- name: Get vminsert service external port
+  ansible.builtin.command: >-
+    kubectl get svc vminsert -n {{ victoria_namespace }}
+    -o jsonpath='{.spec.ports[0].port}'
+  register: vminsert_lb_port
+  changed_when: false
+  failed_when: vminsert_lb_port.rc != 0
+
+- name: Get vmselect service LoadBalancer IP
+  ansible.builtin.command: >-
+    kubectl get svc vmselect -n {{ victoria_namespace }}
+    -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+  register: vmselect_lb_ip
+  changed_when: false
+  failed_when: vmselect_lb_ip.rc != 0
+
+- name: Get vmselect service LoadBalancer hostname
+  ansible.builtin.command: >-
+    kubectl get svc vmselect -n {{ victoria_namespace }}
+    -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'
+  register: vmselect_lb_hostname
+  changed_when: false
+  failed_when: vmselect_lb_hostname.rc != 0
+
+- name: Get vmselect service external port
+  ansible.builtin.command: >-
+    kubectl get svc vmselect -n {{ victoria_namespace }}
+    -o jsonpath='{.spec.ports[0].port}'
+  register: vmselect_lb_port
+  changed_when: false
+  failed_when: vmselect_lb_port.rc != 0
+
+- name: Set endpoint facts  # host = LoadBalancer IP when present, otherwise the LoadBalancer hostname
+  ansible.builtin.set_fact:
+    vminsert_host: >-
+      {{
+        (vminsert_lb_ip.stdout | trim)
+        if (vminsert_lb_ip.stdout | trim | length) > 0
+        else (vminsert_lb_hostname.stdout | trim)
+      }}
+    vmselect_host: >-
+      {{
+        (vmselect_lb_ip.stdout | trim)
+        if (vmselect_lb_ip.stdout | trim | length) > 0
+        else (vmselect_lb_hostname.stdout | trim)
+      }}
+    vminsert_port: "{{ (vminsert_lb_port.stdout | trim) | default('') }}"
+    vmselect_port: "{{ (vmselect_lb_port.stdout | trim) | default('') }}"
+    victoria_tls_ca: "{{ victoria_tls_cert_dir }}/ca.crt"
+
+- name: Fail when LoadBalancer IPs are not available
+  ansible.builtin.fail:
+    msg: "{{ victoria_err_lb_missing }}"
+  when:
+    - vminsert_host | trim | length == 0 or vmselect_host | trim | length == 0
+
+- name: Set Victoria external port fallbacks
+  ansible.builtin.set_fact:
+    # Fall back per service: a discovered port must not be overwritten just because the
+    # other service's lookup came back empty. Runs before the URLs below are built.
+    vminsert_port: "{{ (vminsert_port | trim) | default('8480', true) }}"
+    vmselect_port: "{{ (vmselect_port | trim) | default('8481', true) }}"
+
+- name: Build SFM hosts entry  # only an IP can go into /etc/hosts, so a hostname-only LoadBalancer yields ''
+  ansible.builtin.set_fact:
+    victoria_sfm_hosts_entry: >-
+      {{
+        'echo ' ~ (vminsert_lb_ip.stdout | trim) ~ ' vminsert.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts'
+        if (vminsert_lb_ip.stdout | trim | length) > 0
+        else ''
+      }}
+
+- name: Build SFM hosts entry for vmselect
+  ansible.builtin.set_fact:
+    victoria_sfm_hosts_entry_vmselect: >-
+      {{
+        'echo ' ~ (vmselect_lb_ip.stdout | trim) ~ ' vmselect.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts'
+        if (vmselect_lb_ip.stdout | trim | length) > 0
+        else ''
+      }}
+
+- name: Set endpoint urls and SFM note strings  # ports come from the facts so these agree with the written endpoints
+  ansible.builtin.set_fact:
+    victoria_vminsert_write_url: >-
+      https://vminsert.{{ victoria_namespace }}.svc.cluster.local:{{ vminsert_port }}/insert/0/prometheus/api/v1/write
+    victoria_vmselect_query_url: >-
+      https://vmselect.{{ victoria_namespace }}.svc.cluster.local:{{ vmselect_port }}/select/0/prometheus/api/v1/query
+    victoria_vmselect_ui_url: >-
+      https://vmselect.{{ victoria_namespace }}.svc.cluster.local:{{ vmselect_port }}/select/0/vmui
+    victoria_sfm_hosts_entry_vminsert_display: >-
+      {{
+        victoria_sfm_hosts_entry
+        if (victoria_sfm_hosts_entry | length) > 0
+        else 'vminsert LoadBalancer IP not available; cannot generate /etc/hosts entry.'
+      }}
+    victoria_sfm_hosts_entry_vmselect_display: >-
+      {{
+        victoria_sfm_hosts_entry_vmselect
+        if (victoria_sfm_hosts_entry_vmselect | length) > 0
+        else 'vmselect LoadBalancer IP not available; cannot generate /etc/hosts entry.'
+      }}
+
+- name: Build connection details
+  ansible.builtin.set_fact:
+    victoria_connect_details:  # NOTE(review): nesting of tls/notes rebuilt from whitespace-collapsed text - confirm
+      victoria:
+        namespace: "{{ victoria_namespace }}"
+        deployment_mode: "{{ victoria_deployment_mode }}"
+        pod_status: "{{ victoria_pods_wide.stdout }}"
+        base_url: "https://{{ vminsert_host }}:{{ vminsert_port }}"
+        endpoints:
+          vminsert:
+            host: "{{ vminsert_host }}"
+            port: "{{ vminsert_port | int }}"
+            write_endpoint: "https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write"
+          vmselect:
+            host: "{{ vmselect_host }}"
+            port: "{{ vmselect_port | int }}"
+            query_endpoint: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query"
+            ui_url: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui"
+        tls:
+          ca_crt: "{{ victoria_tls_ca }}"
+      notes:
+        sfm:
+          vminsert_write_url: "{{ victoria_vminsert_write_url }}"
+          hosts_entry: "{{ victoria_sfm_hosts_entry }}"
+          hosts_entry_vmselect: "{{ victoria_sfm_hosts_entry_vmselect }}"
+          ui_navigation: "{{ victoria_sfm_ui_navigation }}"
+          remote_write_target_name: "{{ victoria_sfm_remote_write_target_name }}"
+          remote_write_message_version: "{{ victoria_sfm_remote_write_message_version }}"
+          remote_write_enable_value: "{{ victoria_sfm_remote_write_enable_value }}"
+          tls_server_cert_file_name: "{{ victoria_sfm_tls_server_cert_file_name }}"
+          tls_server_cert_file_path: "{{ victoria_tls_ca }}"
+          ssh_note: "{{ victoria_sfm_ssh_note }}"
+          hosts_scope_note: "{{ victoria_sfm_hosts_scope_note }}"
+          pod_shell_command_example: "{{ victoria_sfm_pod_shell_command_example }}"
+          hosts_restart_note: "{{ victoria_sfm_hosts_restart_note }}"
+
+- name: Ensure output directory exists  # created on the Ansible controller, not on the k8s node
+  ansible.builtin.file:
+    path: "{{ victoria_output_file | dirname }}"
+    state: directory
+    mode: "0755"
+  delegate_to: localhost
+  connection: local
+  run_once: true
+
+- name: Write connection details to file
+  ansible.builtin.copy:
+    content: "{{ victoria_connect_details | to_nice_yaml }}"
+    dest: "{{ victoria_output_file }}"
+    mode: "0644"
+  delegate_to: localhost
+  connection: local
+  run_once: true
+
+- name: Display Victoria connection details
+  ansible.builtin.debug:
+    msg: >-
+      {{
+        [
+          'Victoria connection details written to: ' ~ victoria_output_file,
+          '',
+          'Mode: ' ~ victoria_deployment_mode,
+          '',
+          'Endpoints:',
+          ' [IMPORTANT] vminsert write: ' ~ victoria_vminsert_write_url,
+          ' vmselect query: ' ~ victoria_vmselect_query_url,
+          ' vmselect UI: ' ~ victoria_vmselect_ui_url,
+          '',
+          'TLS:',
+          ' ca.crt: ' ~ victoria_tls_ca,
+          '',
+          'SFM steps (TLS):',
+          ' [STEP 1] ' ~ victoria_sfm_cross_machine_tls_note_line1,
+          ' ' ~ victoria_sfm_cross_machine_tls_note_line2,
+          ' [STEP 2] In the SFM UI, update the vminsert URL:',
+          ' ' ~ victoria_sfm_ui_navigation,
+          ' Edit target: ' ~ victoria_sfm_remote_write_target_name,
+          ' Set Enable to: ' ~ victoria_sfm_remote_write_enable_value,
+          ' Set URL to: ' ~ victoria_vminsert_write_url,
+          ' Set Message Version to: ' ~ victoria_sfm_remote_write_message_version,
+          ' TLS Config: Upload ' ~ victoria_sfm_tls_server_cert_file_name,
+          ' as ' ~ victoria_sfm_tls_server_cert_file_label ~ ': ' ~ victoria_tls_ca,
+          ' [STEP 3] ' ~ victoria_sfm_ssh_note,
+          ' [STEP 4] Update /etc/hosts only inside the SFM Prometheus pod:',
+          ' ' ~ victoria_sfm_hosts_scope_note,
+          ' ' ~ victoria_sfm_pod_shell_command_example,
+          ' Add these entries inside the pod:',
+          ' ' ~ victoria_sfm_hosts_entry_vminsert_display,
+          ' ' ~ victoria_sfm_hosts_entry_vmselect_display,
+          ' [NOTE] ' ~ victoria_sfm_hosts_restart_note,
+          ''
+        ]
+      }}
+  delegate_to: localhost
+  connection: local
+  run_once: true
diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml
new file mode 100644
index
0000000000..f9a1fb72dd
--- /dev/null
+++ b/utils/roles/external_victoria_connect_details/vars/main.yml
@@ -0,0 +1,62 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+victoria_namespace: 'telemetry'  # namespace passed to every kubectl call in the role tasks
+victoria_output_file: '/opt/omnia/telemetry/external_victoria_connect_details.yml'
+victoria_tls_cert_dir: '/opt/omnia/telemetry/victoria-certs'  # ca.crt is expected directly inside
+
+victoria_err_mode_not_supported: >-
+  Victoria deployment mode detected: {{ victoria_deployment_mode }}. External integration
+  is supported only for Victoria cluster mode (vminsert/vmselect/vmstorage). Single-node
+  Victoria (victoria-loadbalancer) is not supported for external integration.
+
+victoria_err_no_pods_found: "No Victoria pods found in namespace '{{ victoria_namespace }}'."
+victoria_err_pods_not_running: 'One or more Victoria pods are not in Running state.'
+victoria_err_pods_not_ready: 'One or more Victoria pods are not Ready.'
+
+victoria_err_lb_missing: >-
+  Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and
+  'vmselect' exist in namespace '{{ victoria_namespace }}' and have external IPs assigned.
+
+victoria_preflight_err_ha_config_missing: >-
+  Failed to load High Availability config file: {{ k8s_ha_config_path }}. Provide a valid
+  HA config so the service Kubernetes VIP can be used.
+
+victoria_preflight_err_ha_vip_missing: >-
+  Failed to determine the service Kubernetes control plane VIP from High Availability
+  config. Ensure service_k8s_cluster_ha[0].virtual_ip_address is set in: {{ k8s_ha_config_path }}.
+
+victoria_preflight_err_service_k8s_controller_unreachable: >-
+  Service Kubernetes controller is not reachable over SSH:
+  {{ ansible_host | default(inventory_hostname) }}. Ensure the service Kubernetes VIP is reachable and resolvable from the OIM host.
+
+victoria_sfm_ui_navigation: 'Observability -> Settings -> Prometheus Remote Write'
+victoria_sfm_remote_write_target_name: 'victoria'
+victoria_sfm_remote_write_message_version: 'v1'
+victoria_sfm_remote_write_enable_value: 'ON'  # must stay quoted: a bare ON is a YAML 1.1 boolean
+
+victoria_sfm_ssh_note: 'SSH to the SFM IP with admin credentials.'
+victoria_sfm_hosts_scope_note: >-
+  /etc/hosts update is required only inside the SFM Prometheus pod (not on the SFM server host).
+victoria_sfm_pod_shell_command_example: >-
+  kubectl exec -it sfm-prometheus-deployment-xxxxx-xx -n sfm-1 -- /bin/sh
+victoria_sfm_hosts_restart_note: 'Repeat /etc/hosts update if the SFM pod restarts.'
+victoria_sfm_cross_machine_tls_note_line1: >-
+  If using the SFM UI from a different system than the OIM host,
+victoria_sfm_cross_machine_tls_note_line2: >-
+  copy ca.crt to that system before uploading it in the UI.
+
+victoria_sfm_tls_server_cert_file_label: "Server Certificate File"
+victoria_sfm_tls_server_cert_file_name: "ca.crt"
diff --git a/utils/roles/slurm_cleanup/defaults/main.yml b/utils/roles/slurm_cleanup/defaults/main.yml
new file mode 100644
index 0000000000..f54396449f
--- /dev/null
+++ b/utils/roles/slurm_cleanup/defaults/main.yml
@@ -0,0 +1,5 @@
+---
+
+slurm_share_dir_name: slurm
+slurm_cleanup_pre_backup_default: 'y'
+slurm_cleanup_confirm_token: 'YES'
diff --git a/utils/roles/slurm_cleanup/tasks/main.yml b/utils/roles/slurm_cleanup/tasks/main.yml
new file mode 100644
index 0000000000..7acd38e571
--- /dev/null
+++ b/utils/roles/slurm_cleanup/tasks/main.yml
@@ -0,0 +1,55 @@
+---
+
+- name: Set slurm_config_path  # the directory this role deletes at the end
+  ansible.builtin.set_fact:
+    slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}"
+  tags: slurm_cleanup
+
+- name: Prompt for pre-cleanup backup
+  ansible.builtin.pause:
+    prompt: "Before cleanup, take a config backup? (y/n)"
+  register: pre_cleanup_backup
+  tags: slurm_cleanup
+
+- name: Set pre-cleanup backup choice  # normalised: trimmed and lower-cased
+  ansible.builtin.set_fact:
+    pre_cleanup_backup_choice: "{{ pre_cleanup_backup.user_input | default('') | trim | lower }}"
+  tags: slurm_cleanup
+
+- name: Fail if pre-cleanup backup choice is empty
+  ansible.builtin.fail:
+    msg: "No input provided for pre-cleanup backup prompt. Cleanup aborted."
+  when: pre_cleanup_backup_choice | length == 0
+  tags: slurm_cleanup
+
+- name: Validate pre-cleanup backup choice
+  ansible.builtin.fail:
+    msg: "Invalid input '{{ pre_cleanup_backup.user_input | default('') }}'. Enter 'y' or 'n'."
+  when: pre_cleanup_backup_choice not in ['y', 'yes', 'n', 'no']
+  tags: slurm_cleanup
+
+- name: Run config backup before cleanup
+  ansible.builtin.include_role:
+    name: slurm_config_backup
+    apply:
+      tags: slurm_cleanup
+  when: pre_cleanup_backup_choice in ['y', 'yes']
+  tags: slurm_cleanup
+
+- name: Confirm cleanup
+  ansible.builtin.pause:
+    prompt: "This will delete {{ slurm_config_path }}. Type {{ slurm_cleanup_confirm_token }} to continue"
+  register: cleanup_confirm
+  tags: slurm_cleanup
+
+- name: Fail if cleanup not confirmed  # exact, case-sensitive match on the token
+  ansible.builtin.fail:
+    msg: "Cleanup aborted"
+  when: cleanup_confirm.user_input != slurm_cleanup_confirm_token
+  tags: slurm_cleanup
+
+- name: Delete slurm share directory
+  ansible.builtin.file:
+    path: "{{ slurm_config_path }}"
+    state: absent
+  tags: slurm_cleanup
diff --git a/utils/roles/slurm_config_backup/defaults/main.yml b/utils/roles/slurm_config_backup/defaults/main.yml
new file mode 100644
index 0000000000..b631a205d0
--- /dev/null
+++ b/utils/roles/slurm_config_backup/defaults/main.yml
@@ -0,0 +1,4 @@
+---
+
+slurm_share_dir_name: slurm
+slurm_backups_dir_name: slurm_backups
diff --git a/utils/roles/slurm_config_backup/tasks/main.yml b/utils/roles/slurm_config_backup/tasks/main.yml
new file mode 100644
index 0000000000..401a086493
--- /dev/null
+++ b/utils/roles/slurm_config_backup/tasks/main.yml
@@ -0,0 +1,84 @@
+---
+
+- name: Set slurm_config_path
+  ansible.builtin.set_fact:
+    slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}"
+
+- name: Display resolved slurm config path
+  ansible.builtin.debug:
+    msg: "Resolved slurm_config_path={{ slurm_config_path }}"
+
+- name: Prompt for backup base name
+  ansible.builtin.pause:
+    prompt: "Enter backup base name (leave empty for timestamp-only)"
+  register: backup_base_name_input
+
+- name: Set backup id  # '/' is replaced so the backup stays one directory level below the backups root
+  ansible.builtin.set_fact:
+    backup_timestamp: "{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}"
+    backup_base_name: "{{ backup_base_name_input.user_input | default('') | trim | replace('/', '_') }}"
+
+- name: Set backup name suffix
+  ansible.builtin.set_fact:
+    backup_name_suffix: "{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, backup_timestamp) }}"
+
+- name: Set backup directory
+  ansible.builtin.set_fact:
+    slurm_backups_root: "{{ share_path }}/{{ slurm_backups_dir_name }}"
+    backup_id: "{{ backup_name_suffix }}"
+    backup_dir: "{{ share_path }}/{{ slurm_backups_dir_name }}/{{ backup_name_suffix }}"
+
+- name: Ensure slurm backups root exists
+  ansible.builtin.file:
+    path: "{{ slurm_backups_root }}"
+    state: directory
+    mode: '0755'
+
+- name: Display slurm backups root
+  ansible.builtin.debug:
+    msg: "Resolved slurm_backups_root={{ slurm_backups_root }}"
+
+- name: Create backup directory
+  ansible.builtin.file:
+    path: "{{ backup_dir }}"
+    state: directory
+    mode: '0755'
+
+- name: Create backup config directories
+  ansible.builtin.file:
+    path: "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}"
+    state: directory
+    mode: '0755'
+  loop:
+    - etc/slurm
+    - etc/munge
+    - etc/my.cnf.d
+
+- name: Check controller config directories to back up
+  ansible.builtin.stat:
+    path: "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}"
+  loop:
+    - etc/slurm
+    - etc/munge
+    - etc/my.cnf.d
+  register: backup_source_stats
+
+- name: Warn about config directories missing from source
+  ansible.builtin.debug:
+    msg: "WARNING: Source directories not found, nothing backed up for: {{ backup_source_stats.results | rejectattr('stat.exists') | map(attribute='item') | list }}"
+  when: backup_source_stats.results | rejectattr('stat.exists') | list | length > 0
+
+# Only sources that exist are copied. A cp error is no longer masked, so a broken
+# backup can never be reported as created (cleanup and rollback rely on this backup).
+- name: Backup controller config directories
+  ansible.builtin.command: >-
+    cp -a "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item.item }}/." "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item.item }}/"
+  loop: "{{ backup_source_stats.results }}"
+  loop_control:
+    label: "{{ item.item }}"
+  when: item.stat.exists
+  changed_when: true
+
+- name: Display backup location
+  ansible.builtin.debug:
+    msg: "Slurm config backup created at: {{ backup_dir }}/{{ ctld_list[0] }}"
diff --git a/utils/roles/slurm_config_rollback/defaults/main.yml b/utils/roles/slurm_config_rollback/defaults/main.yml
new file mode 100644
index 0000000000..601e25cd18
--- /dev/null
+++ b/utils/roles/slurm_config_rollback/defaults/main.yml
@@ -0,0 +1,5 @@
+---
+
+slurm_share_dir_name: slurm
+slurm_backups_dir_name: slurm_backups
+slurm_rollback_backup_list_limit_default: 20
diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml
new file mode 100644
index 0000000000..6e185f2028
--- /dev/null
+++ b/utils/roles/slurm_config_rollback/tasks/main.yml
@@ -0,0 +1,355 @@
+---
+
+- name: Set slurm paths
+  ansible.builtin.set_fact:
+    slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}"
+    slurm_backups_root: "{{ share_path }}/{{ slurm_backups_dir_name }}"
+  tags: config_rollback
+
+- name: Find available backups  # direct child directories of the backups root only
+  ansible.builtin.find:
+    paths: "{{ slurm_backups_root }}"
+    file_type: directory
+    depth: 1
+  register: backup_dirs
+  tags: config_rollback
+
+- name: Fail if no backups found
+  ansible.builtin.fail:
+    msg: "No backups found in {{ slurm_backups_root }}"
+  when: backup_dirs.files | length == 0
+  tags: config_rollback
+
+- name: Set rollback backup list limit  # optional rollback_backup_list_limit overrides the role default
+  ansible.builtin.set_fact:
+    rollback_backup_list_limit_effective: "{{ lookup('vars', 'rollback_backup_list_limit', default=slurm_rollback_backup_list_limit_default) | int }}"
+  tags: config_rollback
+
+- name: Build backup choices  # newest first by mtime, truncated to the effective limit
+  ansible.builtin.set_fact:
+    backup_choices: >-
+      {{
+        (
+          backup_dirs.files
+          | sort(attribute='mtime', reverse=true)
+          | map(attribute='path')
+          | list
+        )[:(rollback_backup_list_limit_effective | int)]
+      }}
+    total_backup_count: "{{ backup_dirs.files | length }}"
+  tags: config_rollback
+
+- name: Notify if backup list is truncated
+  ansible.builtin.debug:
+    msg: "Showing latest {{ rollback_backup_list_limit_effective }} backups out of {{ total_backup_count }}. Increase rollback_backup_list_limit to show more."
+  when: (total_backup_count | int) > (rollback_backup_list_limit_effective | int)
+  tags: config_rollback
+
+- name: Display backup list order
+  ansible.builtin.debug:
+    msg: "Backup list is sorted latest first."
+  tags: config_rollback
+
+- name: Show backup choices  # menu numbers are 1-based
+  ansible.builtin.debug:
+    msg: "{{ backup_choice_index + 1 }}: {{ item | basename }}"
+  loop: "{{ backup_choices }}"
+  loop_control:
+    index_var: backup_choice_index
+  tags: config_rollback
+
+- name: Prompt user to select backup number
+  ansible.builtin.pause:
+    prompt: "Enter the backup number to rollback to"
+  register: backup_choice_input
+  tags: config_rollback
+
+- name: Set backup choice index
+  ansible.builtin.set_fact:
+    backup_choice_index: "{{ backup_choice_input.user_input | default('') | trim }}"
+  tags: config_rollback
+
+- name: Fail if backup selection is empty
+  ansible.builtin.fail:
+    msg: "No backup number selected. Rollback aborted."
+  when: backup_choice_index | length == 0
+  tags: config_rollback
+
+- name: Validate backup choice input is within range  # digits only: '2.9' used to be coerced to 2 by int
+  ansible.builtin.fail:
+    msg: "Invalid selection '{{ backup_choice_input.user_input | default('') }}'. Enter a number between 1 and {{ backup_choices | length }}."
+  when:
+    - (backup_choice_index | string) is not match('^[0-9]+$') or (backup_choice_index | int) < 1 or (backup_choice_index | int) > (backup_choices | length)
+  tags: config_rollback
+
+- name: Set selected backup
+  ansible.builtin.set_fact:
+    selected_backup_dir: "{{ backup_choices[(backup_choice_index | int) - 1] }}"
+  tags: config_rollback
+
+- name: Set selected backup controller root
+  ansible.builtin.set_fact:
+    selected_backup_ctld_root: "{{ selected_backup_dir }}/{{ ctld_list[0] }}"
+  tags: config_rollback
+
+- name: Check slurm.conf exists in selected backup
+  ansible.builtin.stat:
+    path: "{{ selected_backup_ctld_root }}/etc/slurm/slurm.conf"
+  register: slurm_conf_stat
+  tags: config_rollback
+
+- name: Fail if slurm.conf missing in backup  # slurm.conf is the one file the rollback refuses to go without
+  ansible.builtin.fail:
+    msg: "Selected backup is missing {{ ctld_list[0] }}/etc/slurm/slurm.conf"
+  when: not slurm_conf_stat.stat.exists
+  tags: config_rollback
+
+- name: Check key slurm conf files existence in selected backup
+  ansible.builtin.stat:
+    path: "{{ selected_backup_ctld_root }}/etc/slurm/{{ item }}"
+  loop:
+    - slurmdbd.conf
+    - cgroup.conf
+    - gres.conf
+  register: slurm_conf_files_stats
+  tags: config_rollback
+
+- name: Compute missing slurm conf files in selected backup
+  ansible.builtin.set_fact:
+    missing_slurm_conf_files: "{{ slurm_conf_files_stats.results | rejectattr('stat.exists') | map(attribute='item') | list }}"
+  tags: config_rollback
+
+- name: Warn if slurm conf files are missing in selected backup
+  ansible.builtin.debug:
+    msg: "WARNING: Missing files in selected backup under etc/slurm: {{ missing_slurm_conf_files }}"
+  when: missing_slurm_conf_files | length > 0
+  tags: config_rollback
+
+- name: Prompt to continue if slurm conf files are missing
+  ansible.builtin.pause:
+    prompt: "Some slurm config files are missing in the selected backup. Continue anyway? (y/N)"
+  register: continue_missing_confs
+  when: missing_slurm_conf_files | length > 0
+  tags: config_rollback
+
+- name: Fail if user does not want to continue with missing slurm conf files
+  ansible.builtin.fail:
+    msg: "Rollback aborted"
+  when:
+    - missing_slurm_conf_files | length > 0
+    - continue_missing_confs.user_input | default('N') | trim | lower not in ['y', 'yes']
+  tags: config_rollback
+
+- name: Check munge.key exists in selected backup
+  ansible.builtin.stat:
+    path: "{{ selected_backup_ctld_root }}/etc/munge/munge.key"
+  register: munge_key_stat
+  tags: config_rollback
+
+- name: Warn if munge.key is missing in selected backup
+  ansible.builtin.debug:
+    msg: "WARNING: munge.key is missing in selected backup under etc/munge."
+  when: not munge_key_stat.stat.exists
+  tags: config_rollback
+
+- name: Prompt to continue if munge.key is missing
+  ansible.builtin.pause:
+    prompt: "munge.key is missing in the selected backup. Continue anyway? (y/N)"
+  register: continue_missing_munge_key
+  when: not munge_key_stat.stat.exists
+  tags: config_rollback
+
+- name: Fail if user does not want to continue without munge.key
+  ansible.builtin.fail:
+    msg: "Rollback aborted"
+  when:
+    - not munge_key_stat.stat.exists
+    - continue_missing_munge_key.user_input | default('N') | trim | lower not in ['y', 'yes']
+  tags: config_rollback
+
+- name: Check backup directories
+  ansible.builtin.stat:
+    path: "{{ selected_backup_ctld_root }}/{{ item }}"
+  loop:
+    - etc/slurm
+    - etc/munge
+    - etc/my.cnf.d
+  register: backup_dir_stats
+  tags: config_rollback
+
+- name: Compute missing backup directories
+  ansible.builtin.set_fact:
+    missing_backup_dirs: "{{ backup_dir_stats.results | rejectattr('stat.exists') | map(attribute='item') | list }}"
+  tags: config_rollback
+
+- name: Warn if backup directories missing
+  ansible.builtin.debug:
+    msg: "WARNING: Missing directories in backup: {{ missing_backup_dirs }}"
+  when: missing_backup_dirs | length > 0
+  tags: config_rollback
+
+- name: Prompt to continue if backup directories missing
+  ansible.builtin.pause:
+    prompt: "Some directories are missing in the backup. Continue anyway? (y/N)"
+  register: continue_missing
+  when: missing_backup_dirs | length > 0
+  tags: config_rollback
+
+- name: Fail if user does not want to continue
+  ansible.builtin.fail:
+    msg: "Rollback aborted"
+  when:
+    - missing_backup_dirs | length > 0
+    - continue_missing.user_input | default('N') | trim | lower not in ['y', 'yes']
+  tags: config_rollback
+
+- name: Prompt for safety backup before rollback
+  ansible.builtin.pause:
+    prompt: "Create a safety backup of current state before rollback? (y/n)"
+  register: pre_rollback_backup
+  tags: config_rollback
+
+- name: Set pre-rollback backup choice
+  ansible.builtin.set_fact:
+    pre_rollback_backup_choice: "{{ pre_rollback_backup.user_input | default('') | trim | lower }}"
+  tags: config_rollback
+
+- name: Fail if pre-rollback backup choice is empty
+  ansible.builtin.fail:
+    msg: "No input provided for safety backup prompt. Rollback aborted."
+  when: pre_rollback_backup_choice | length == 0
+  tags: config_rollback
+
+- name: Validate pre-rollback backup choice
+  ansible.builtin.fail:
+    msg: "Invalid input '{{ pre_rollback_backup.user_input | default('') }}'. Enter 'y' or 'n'."
+  when: pre_rollback_backup_choice not in ['y', 'yes', 'n', 'no']
+  tags: config_rollback
+
+- name: Run safety backup before rollback
+  ansible.builtin.include_role:
+    name: slurm_config_backup
+    apply:
+      tags: config_rollback
+  when: pre_rollback_backup_choice in ['y', 'yes']
+  tags: config_rollback
+
+- name: Stat slurmdbd.conf before restore  # checksum is compared after the restore to decide on a restart
+  ansible.builtin.stat:
+    path: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/slurmdbd.conf"
+    checksum_algorithm: sha1
+  register: slurmdbd_before
+  tags: config_rollback
+
+- name: Restore config directories
+  ansible.builtin.copy:
+    src: "{{ selected_backup_ctld_root }}/{{ item }}/"
+    dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/"
+    remote_src: true
+    mode: preserve
+  loop:
+    - etc/slurm
+    - etc/munge
+    - etc/my.cnf.d
+  when: item not in missing_backup_dirs  # skip only what the operator accepted as missing; copy errors now fail
+  changed_when: true
+  tags: config_rollback
+
+- name: Check slurmdbd.conf permissions after restore
+  ansible.builtin.stat:
+    path: /etc/slurm/slurmdbd.conf
+  delegate_to: slurm_controller
+  register: slurmdbd_conf_perm_stat
+  tags: config_rollback
+
+- name: Fix slurmdbd.conf permissions after restore
+  ansible.builtin.file:
+    path: /etc/slurm/slurmdbd.conf
+    mode: '0600'
+  delegate_to: slurm_controller
+  when: slurmdbd_conf_perm_stat.stat.exists
+  tags: config_rollback
+
+- name: Check munge.key permissions after restore
+  ansible.builtin.stat:
+    path: /etc/munge/munge.key
+  delegate_to: slurm_controller
+  register: munge_key_perm_stat
+  tags: config_rollback
+
+- name: Fix munge.key permissions after restore
+  ansible.builtin.file:
+    path: /etc/munge/munge.key
+    mode: '0400'
+  delegate_to: slurm_controller
+  when: munge_key_perm_stat.stat.exists
+  tags: config_rollback
+
+- name: Stat slurmdbd.conf after restore
+  ansible.builtin.stat:
+    path: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/slurmdbd.conf"
+    checksum_algorithm: sha1
+  register: slurmdbd_after
+  tags: config_rollback
+
+- name: Notify slurmdbd.conf changed
+  ansible.builtin.debug:
+    msg: "Detected slurmdbd.conf change after rollback; restarting slurmdbd."
+  when:
+    - slurmdbd_before.stat.exists
+    - slurmdbd_after.stat.exists
+    - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum
+  tags: config_rollback
+
+- name: Restart slurmdbd  # same three conditions as the notification above
+  ansible.builtin.systemd:
+    name: slurmdbd
+    state: restarted
+  delegate_to: slurm_controller
+  when:
+    - slurmdbd_before.stat.exists
+    - slurmdbd_after.stat.exists
+    - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum
+  changed_when: true
+  tags: config_rollback
+
+- name: Gather service facts on controller
+  ansible.builtin.service_facts:
+  delegate_to: slurm_controller
+  tags: config_rollback
+
+- name: Set slurmctld state
+  ansible.builtin.set_fact:
+    slurmctld_state: "{{ ansible_facts.services['slurmctld.service'].state | default('unknown') }}"
+  tags: config_rollback
+
+- name: Fail if slurmctld is not active
+  ansible.builtin.fail:
+    msg: >-
+      slurmctld is not active on the controller. Rollback applied on disk, but cannot
+      reconfigure until slurmctld is running. Verify munge and slurmctld services and
+      restart slurmctld, then re-run rollback or run 'scontrol reconfigure' on the
+      controller.
+  when: slurmctld_state != 'running'
+  tags: config_rollback
+
+- name: Run scontrol reconfigure
+  tags: config_rollback
+  block:
+    - name: Execute scontrol reconfigure
+      ansible.builtin.command: scontrol reconfigure
+      delegate_to: slurm_controller
+      register: reconfigure_out
+      changed_when: true
+      failed_when: reconfigure_out.rc != 0
+  rescue:
+    - name: Display scontrol reconfigure error
+      ansible.builtin.debug:
+        msg: "scontrol reconfigure failed. stdout={{ reconfigure_out.stdout | default('') }} stderr={{ reconfigure_out.stderr | default('') }}"
+
+    - name: Fail with rollback guidance
+      ansible.builtin.fail:
+        msg: >-
+          Rollback applied on disk, but scontrol reconfigure failed. Recommended action:
+          rollback to the safety backup created before this rollback (if you chose to
+          create it).
diff --git a/utils/slurm_config_util.yml b/utils/slurm_config_util.yml
new file mode 100644
index 0000000000..fd42e4c202
--- /dev/null
+++ b/utils/slurm_config_util.yml
@@ -0,0 +1,124 @@
+---
+
+- name: Include input project directory  # skipped when project_dir_status is already truthy
+  ansible.builtin.import_playbook: include_input_dir.yml
+  when: not project_dir_status | default(false) | bool
+  vars:
+    omnia_metadata_support: true
+  tags: always
+
+- name: Create oim group
+  ansible.builtin.import_playbook: create_container_group.yml
+  vars:
+    oim_group: true
+  tags: always
+
+- name: Slurm config utilities
+  hosts: oim
+  connection: ssh
+  gather_facts: true
+  tasks:
+    - name: Include variable file omnia_config.yml
+      tags: always
+      ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml"
+
+    - name: Include storage vars
+      tags: always
+      ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml"
+
+    - name: Set facts for slurm  # only the first slurm_cluster entry is consulted
+      tags: always
+      ansible.builtin.set_fact:
+        nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}"
+
+    - name: Read the slurm mount point  # client_share_path of the NFS entry whose nfs_name matches
+      tags: always
+      ansible.builtin.set_fact:
+        share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}"
+
+    - name: Slurp remote YAML file
+      tags: always
+      ansible.builtin.slurp:
+        src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml"
+      register: slurped_yaml
+
+    - name: Parse YAML into vars
+      tags: always
+      ansible.builtin.set_fact:
+        node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}"
+
+    - name: Get name and IP mapping 1  # node name -> interfaces list
+      tags: always
+      ansible.builtin.set_fact:
+        tmp_ip_name_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='interfaces') }}"
+
+    - name: Get name and IP mapping 2  # node name -> first address of the first interface
+      tags: always
+      ansible.builtin.set_fact:
+        ip_name_map: "{{ ip_name_map | default({}) | combine({item.key: item.value[0]['ip_addrs'][0]['ip_addr']}) }}"
+      loop: "{{ tmp_ip_name_map | dict2items }}"
+
+    - name: Read the node name group
+      tags: always
+      ansible.builtin.set_fact:
+        name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}"
+
+    - name: Group the functional_groups
+      tags: always
+      ansible.builtin.set_fact:
+        tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}"
+
+    - name: Re-organize the groups  # group name -> list of node names
+      tags: always
+      ansible.builtin.set_fact:
+        grouped_nodes: "{{ grouped_nodes | default({}) | combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}"
+      loop: "{{ tmp_grouped_nodes }}"
+
+    - name: Assign slurm lists  # nodes of every group whose name starts with slurm_control_node_
+      tags: always
+      ansible.builtin.set_fact:
+        ctld_list: >-
+          {{ grouped_nodes | dict2items | selectattr('key', 'match', '^slurm_control_node_')
+          | map(attribute='value') | list | flatten }}
+
+    - name: Fail if Slurm controller list is empty
+      tags: always
+      ansible.builtin.fail:
+        msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun."
+      when: ctld_list | length == 0
+
+    - name: Set slurm controller IP
+      tags: always
+      ansible.builtin.set_fact:
+        controller_ip: "{{ ip_name_map[ctld_list | first] }}"
+      when: ctld_list | length > 0
+
+    - name: Add slurm controller as dynamic host  # target of delegate_to: slurm_controller in the rollback role
+      tags: always
+      ansible.builtin.add_host:
+        name: slurm_controller
+        ansible_host: "{{ controller_ip }}"
+        ansible_user: root
+        ansible_port: 22
+      when: controller_ip is defined
+
+    - name: Run slurm config backup  # NOTE(review): without --tags all three roles below run in sequence
+      tags: config_backup
+      ansible.builtin.include_role:
+        name: slurm_config_backup
+        apply:
+          tags: config_backup
+
+    - name: Run slurm cleanup
+      tags: slurm_cleanup
+      ansible.builtin.include_role:
+        name: slurm_cleanup
+        apply:
+          tags: slurm_cleanup
+
+    - name: Run slurm config rollback
+      tags: config_rollback
+      ansible.builtin.include_role:
+        name: slurm_config_rollback
+        apply:
+          tags: config_rollback
diff --git a/utils/upgrade_checkup.yml b/utils/upgrade_checkup.yml
new file mode 100644
index 0000000000..5fb8582000
--- /dev/null
+++ b/utils/upgrade_checkup.yml
@@ -0,0 +1,33 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+- name: "Guard: block if upgrade is in progress"
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+    - name: Check upgrade lock file  # the play fails below whenever this file exists
+      ansible.builtin.stat:
+        path: /opt/omnia/.data/upgrade_in_progress.lock
+      register: upgrade_lock
+
+    - name: Block playbook while upgrade is in progress
+      when: upgrade_lock.stat.exists
+      ansible.builtin.fail:
+        msg: >-
+          Upgrade is not completed fully. Please run upgrade_omnia.yml to complete upgrade
+          before running any other playbook using the below command:
+          "ansible-playbook /omnia/upgrade/upgrade_omnia.yml"
+          If you don't require input files to be migrated, reconfigure the default input files,
+          remove the lock file using the following command "rm /opt/omnia/.data/upgrade_in_progress.lock" and then proceed.