From 5dac32794c296e6c7c4c7018de573017ae898811 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 28 Jan 2026 10:11:26 +0000 Subject: [PATCH 001/172] copy file python module --- common/library/modules/parallel_file_copy.py | 174 ++++++++++++++++++ .../slurm_config/tasks/create_slurm_dir.yml | 162 +++++++++++----- discovery/roles/slurm_config/vars/main.yml | 52 +++--- 3 files changed, 315 insertions(+), 73 deletions(-) create mode 100644 common/library/modules/parallel_file_copy.py diff --git a/common/library/modules/parallel_file_copy.py b/common/library/modules/parallel_file_copy.py new file mode 100644 index 0000000000..4f05f041c3 --- /dev/null +++ b/common/library/modules/parallel_file_copy.py @@ -0,0 +1,174 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +# pylint: disable=import-error,no-name-in-module,line-too-long + +""" +Ansible module for parallel copying of files. + +Supports copying multiple source → destination pairs in parallel, +with logging, retries, and optional cleanup. 
+""" + +import os +import shutil +import threading +from datetime import datetime +from concurrent.futures import ThreadPoolExecutor, as_completed +from ansible.module_utils.basic import AnsibleModule +from ansible.module_utils.local_repo.standard_logger import setup_standard_logger + +# ============================================================ +# Default Values +# ============================================================ + +DEFAULT_MAX_WORKERS = 4 +DEFAULT_RETRY_COUNT = 2 +DEFAULT_DELETE_EXISTING = True + +# ============================================================ +# Copy Worker Function +# ============================================================ + +def copy_single_file(src_file, dest_dir, retry_count, delete_existing, slogger, summary): + """Copy one directory pair with retry support.""" + thread_name = threading.current_thread().name + start_time = datetime.now() + + if not os.path.isfile(src_file): + slogger.info(f"NOT COPIED - Source file missing: {src_file}") + summary["skipped"].append(src_file) + return + + os.makedirs(dest_dir, exist_ok=True) + dest_file = os.path.join(dest_dir, os.path.basename(src_file)) + + for attempt in range(1, retry_count + 1): + try: + slogger.info(f"[{thread_name}] START {start_time} Copying {src_file} (Attempt {attempt})") + + if delete_existing and os.path.exists(dest_file): + os.remove(dest_file) + slogger.info(f"Deleted existing file: {dest_file}") + + shutil.copy2(src_file, dest_file) + + end_time = datetime.now() + duration = (end_time - start_time).total_seconds() + slogger.info(f"[{thread_name}] SUCCESS {end_time} Copied {src_file} -> {dest_file} (Duration={duration:.2f}s)") + + summary["copied"].append(src_file) + return + + except Exception as err: + slogger.error(f"[{thread_name}] ERROR copying {src_file} (Attempt {attempt}) Reason: {err}") + if attempt == retry_count: + summary["failed"].append(src_file) + +# ============================================================ +# Main Parallel Copy Logic +# 
============================================================ + +def execute_parallel_copy(module, copy_pairs, max_workers, retry_count, delete_existing, slogger): + """ + Executes parallel copy for all pairs. + Returns summary dict. + """ + summary = {"copied": [], "skipped": [], "failed": []} + futures = [] + + slogger.info("===== PARALLEL FILE COPY STARTED =====") + slogger.info(f"Copy pairs received: {copy_pairs}") + slogger.info(f"Max workers: {max_workers}") + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + for src_dir, dest_dir in copy_pairs: + + if not os.path.isdir(src_dir): + slogger.info(f"NOT COPIED - Source directory missing: {src_dir}") + summary["skipped"].append(src_dir) + continue + + files = [os.path.join(src_dir, f) for f in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, f))] + if not files: + slogger.info(f"NOT COPIED - No files found in directory: {src_dir}") + summary["skipped"].append(src_dir) + continue + + # ⚡ Show Ansible warning for in-progress copy + module.warn(f"Copy in progress for {src_dir} -> {dest_dir}. 
Please wait ...") + + slogger.info(f"Copying {len(files)} files from {src_dir} -> {dest_dir} ...") + + for file_path in files: + futures.append(executor.submit(copy_single_file, file_path, dest_dir, retry_count, delete_existing, slogger, summary)) + + # Wait for all copies to finish + for future in as_completed(futures): + future.result() + + slogger.info("===== PARALLEL FILE COPY FINISHED =====") + return summary + +# ============================================================ +# Ansible Module Entry Point +# ============================================================ + +def main(): + """Main Ansible module execution entrypoint.""" + module_args = dict( + copy_pairs=dict(type="list", required=True), + max_workers=dict(type="int", required=False, default=DEFAULT_MAX_WORKERS), + retry_count=dict(type="int", required=False, default=DEFAULT_RETRY_COUNT), + delete_existing=dict(type="bool", required=False, default=DEFAULT_DELETE_EXISTING), + slog_file=dict(type="str", required=False, default="/tmp/parallel_copy.log"), + ) + + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) + + copy_pairs = module.params["copy_pairs"] + max_workers = module.params["max_workers"] + retry_count = module.params["retry_count"] + delete_existing = module.params["delete_existing"] + slog_file = module.params["slog_file"] + + slogger = setup_standard_logger(slog_file) + + result = dict(changed=False, copied=[], skipped=[], failed=[]) + + try: + summary = execute_parallel_copy(module, copy_pairs, max_workers, retry_count, delete_existing, slogger) + + result["copied"] = summary["copied"] + result["skipped"] = summary["skipped"] + result["failed"] = summary["failed"] + if summary["copied"]: + result["changed"] = True + + overall_status = "SUCCESS" + if summary["failed"] and summary["copied"]: + overall_status = "PARTIAL" + elif summary["failed"] and not summary["copied"]: + overall_status = "FAILURE" + + result["overall_status"] = overall_status + 
module.exit_json(**result) + + except Exception as err: + slogger.error(f"Parallel copy execution failed: {err}") + module.fail_json(msg=str(err), **result) + +if __name__ == "__main__": + main() diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 662802274b..18ee917fb7 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -1,4 +1,4 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,34 +18,6 @@ - name: Include storage vars ansible.builtin.include_vars: "{{ input_project_dir }}/storage_config.yml" -- name: Load slurm_custom.json for x86_64 - ansible.builtin.include_vars: - file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" - name: slurm_custom_x86_64 - failed_when: false - -- name: Load slurm_custom.json for aarch64 - ansible.builtin.include_vars: - file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" - name: slurm_custom_aarch64 - failed_when: false - -- name: Extract CUDA runfile name for x86_64 from slurm_custom.json - ansible.builtin.set_fact: - cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: - - slurm_custom_x86_64 is defined - - slurm_custom_x86_64.slurm_node is defined - - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - -- name: Extract CUDA runfile name for aarch64 from slurm_custom.json - ansible.builtin.set_fact: - cuda_runfile_aarch64: "{{ 
(slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: - - slurm_custom_aarch64 is defined - - slurm_custom_aarch64.slurm_node is defined - - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - - name: Set facts for slurm ansible.builtin.set_fact: nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" @@ -63,6 +35,10 @@ slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" controller_trackfile_path: "{{ share_path }}/ctld_track" +- name: Build parallel copy list for HPC tools + ansible.builtin.set_fact: + parallel_copy_pairs: [] + - name: Configure openldap if supported ansible.builtin.include_tasks: openldap_config.yml when: hostvars['localhost']['openldap_support'] @@ -131,8 +107,49 @@ mode: "{{ file_mode }}" become: true -- name: Create hpc tools dirs - ansible.builtin.include_tasks: hpc_tools.yml +- name: Create HPC tools directories on share + ansible.builtin.file: + path: "{{ slurm_config_path }}/hpc_tools/{{ item }}" + state: directory + owner: root + group: root + mode: "{{ common_mode }}" + loop: + - cuda + - runfile + - scripts + - container_images + - nvidia_sdk + - benchmarks + +- name: Set NFS info fact + ansible.builtin.set_fact: + oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" + +- name: Initialize parallel copy pairs + ansible.builtin.set_fact: + parallel_copy_pairs: [] + +- name: Check which parallel copy source directories exist + ansible.builtin.stat: + path: "{{ item.src }}" + loop: "{{ parallel_copy_candidates }}" + register: copy_source_checks + failed_when: false + +- name: Add only valid copy pairs (source exists) + ansible.builtin.set_fact: + parallel_copy_pairs: >- + {{ parallel_copy_pairs + + [[ item.item.src, item.item.dest ]] }} + loop: "{{ copy_source_checks.results }}" + when: item.stat.exists + +- name: Parallel copy HPC tool files + parallel_file_copy: + copy_pairs: "{{ 
parallel_copy_pairs }}" + max_workers: "{{ parallel_copy_max_workers }}" + when: parallel_copy_pairs | length > 0 - name: Check if munge key exists top level ansible.builtin.stat: @@ -156,8 +173,71 @@ (compiler_login_list | default([])) + (login_list | default([])) }}" -- name: Conf merge and write using slurm_conf module - ansible.builtin.include_tasks: confs.yml +- name: Slurm path ops + ansible.builtin.set_fact: + conf_path_items: "{{ conf_path_items | default({}) | combine({item.key: item.value}) }}" + when: item.value is string + loop: "{{ configs_input | dict2items }}" + +- name: Slurm dict ops + ansible.builtin.set_fact: + conf_dict_items: "{{ conf_dict_items | default({}) | combine({item.key: item.value}) }}" + when: item.value is mapping + loop: "{{ configs_input | dict2items }}" + +- name: Slurm dict ops + ansible.builtin.set_fact: + apply_config: >- + {{ apply_config | default({}) + | combine({ + item: ( + (__default_config[item] | default({})) + | combine(conf_dict_items[item] | default({})) + ) + }) + }} + loop: "{{ conf_files }}" + +- name: Read NodeName parameters + ansible.builtin.include_tasks: read_node_idrac.yml + when: cmpt_list + loop: "{{ cmpt_list }}" + +- name: Copy conf file if provided + ansible.builtin.copy: + src: "{{ conf_path_items.get(item.1) }}" + dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf" + mode: "{{ conf_file_mode }}" + remote_src: "{{ copy_from_oim }}" + when: ctld_list + loop: "{{ ctld_list | product(conf_path_items.keys() | default([])) }}" + +- name: Add gpu parameters to slurm conf + ansible.builtin.set_fact: + apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" + when: gpu_params is defined and gpu_params + +- name: Verify slurm conf keys only + ansible.builtin.assert: + that: + - (apply_config[item].keys() | list) | difference(__conf_keys[item]) | length == 0 + fail_msg: "The following {{ item }} config keys are invalid: {{ 
apply_config[item].keys() | list | difference(__conf_keys[item]) | join(', ') }}" + when: apply_config[item] and __conf_keys[item] + loop: "{{ conf_files }}" + +- name: Slurm dict ops + ansible.builtin.set_fact: + slurm_conf_dict: "{{ apply_config['slurm'] }}" + +- name: Create all .conf for ctld only + ansible.builtin.template: + src: "{{ item.1 }}.conf.j2" + dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf" + owner: "{{ root_user }}" + group: "{{ root_group }}" + mode: "{{ conf_file_mode }}" + when: ctld_list + loop: "{{ ctld_list | product(conf_files | difference(conf_path_items.keys() | default([]))) }}" - name: Create mariadb cnf ansible.builtin.template: @@ -215,19 +295,3 @@ ansible.builtin.set_fact: cloud_init_slurm_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}" client_mount_path: "{{ share_path }}" - -- name: Ensure SSH key directory exists on Slurm share - ansible.builtin.file: - path: "{{ slurm_config_path }}/ssh" - state: directory - owner: root - group: root - mode: '0700' - -- name: Copy OIM private key to Slurm share for node-to-node SSH - ansible.builtin.copy: - src: "{{ ssh_private_key_path }}" - dest: "{{ slurm_config_path }}/ssh/oim_rsa" - owner: root - group: root - mode: '0600' diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 63bb52fb41..a4717bd662 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -103,27 +103,31 @@ auth_tls_certs_path: "/opt/omnia/auth/tls_certs/ldapserver.crt" slurm_installation_type: configless pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" controller_empty_msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun discovery.yml." -download_container_image_path: "{{ slurm_config_path }}/hpc_tools/scripts/download_container_image.sh" -container_image_list_path: "{{ slurm_config_path }}/hpc_tools/scripts/container_image.list" -pulp_mirror: "{{ hostvars['localhost']['admin_nic_ip'] }}:2225" -packages_base_dir_x86_64: "{{ slurm_config_path }}/packages/x86_64" -packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64" -offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" -offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" -packages_layout_x86_64: - - doca-ofed - - cuda -packages_layout_aarch64: - - doca-ofed - - cuda -print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" -offline_path_x86_64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" -offline_path_aarch64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" +# nvidia sdk vars +nvhpc_package_name: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0" +nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_package_name }}/{{ nvhpc_package_name }}.tar.gz" +nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk" -ssh_private_key_path: /root/.ssh/oim_rsa +# parallel file copy +parallel_copy_max_workers: 4 + +# ------------------------------------------------------------ +# Parallel Copy Candidates (Only path existence matters) +# 
------------------------------------------------------------ + +parallel_copy_candidates: + + # CUDA Runfile (aarch64 repo path) + - name: cuda_runfile_aarch64 + src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" + dest: "{{ slurm_config_path }}/hpc_tools/runfile/" + + # CUDA Runfile (x86_64 repo path) + - name: cuda_runfile_x86_64 + src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" + dest: "{{ slurm_config_path }}/hpc_tools/runfile/" + + # NVIDIA HPC SDK (x86_64 tarball extracted dir) + - name: nvhpc_sdk_x86_64 + src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" + dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" From c4a95fa2a42a03a593eeddb7d1864f796f7bad77 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 28 Jan 2026 12:57:20 +0000 Subject: [PATCH 002/172] log path change Signed-off-by: sakshi-singla-1735 --- common/library/modules/parallel_file_copy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/library/modules/parallel_file_copy.py b/common/library/modules/parallel_file_copy.py index 4f05f041c3..8f46f5a881 100644 --- a/common/library/modules/parallel_file_copy.py +++ b/common/library/modules/parallel_file_copy.py @@ -37,6 +37,7 @@ DEFAULT_MAX_WORKERS = 4 DEFAULT_RETRY_COUNT = 2 DEFAULT_DELETE_EXISTING = True +PARALLEL_FILE_COPY_LOG = '/opt/omnia/log/core/playbooks/parallel_file_copy.log/' # ============================================================ # Copy Worker Function @@ -133,7 +134,7 @@ def main(): max_workers=dict(type="int", required=False, default=DEFAULT_MAX_WORKERS), retry_count=dict(type="int", required=False, default=DEFAULT_RETRY_COUNT), delete_existing=dict(type="bool", required=False, default=DEFAULT_DELETE_EXISTING), - slog_file=dict(type="str", required=False, default="/tmp/parallel_copy.log"), + slog_file=dict(type="str", required=False, default=PARALLEL_FILE_COPY_LOG), ) module = 
AnsibleModule(argument_spec=module_args, supports_check_mode=True) From af25939cd7a68d1b94896c1c4c6d31177403590a Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 28 Jan 2026 13:01:39 +0000 Subject: [PATCH 003/172] missed code Signed-off-by: sakshi-singla-1735 --- .../slurm_config/tasks/create_slurm_dir.yml | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 18ee917fb7..45aa87d2d2 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -18,6 +18,34 @@ - name: Include storage vars ansible.builtin.include_vars: "{{ input_project_dir }}/storage_config.yml" +- name: Load slurm_custom.json for x86_64 + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" + name: slurm_custom_x86_64 + failed_when: false + +- name: Load slurm_custom.json for aarch64 + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" + name: slurm_custom_aarch64 + failed_when: false + +- name: Extract CUDA runfile name for x86_64 from slurm_custom.json + ansible.builtin.set_fact: + cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" + when: + - slurm_custom_x86_64 is defined + - slurm_custom_x86_64.slurm_node is defined + - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 + +- name: Extract CUDA runfile name for aarch64 from slurm_custom.json + ansible.builtin.set_fact: + cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" + when: + - slurm_custom_aarch64 
is defined + - slurm_custom_aarch64.slurm_node is defined + - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 + - name: Set facts for slurm ansible.builtin.set_fact: nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" From 05ef165790335024e2c14b1ebc79378e75b1c9e6 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 28 Jan 2026 13:27:13 +0000 Subject: [PATCH 004/172] adding code to hpc_tools file Signed-off-by: sakshi-singla-1735 --- .../slurm_config/tasks/create_slurm_dir.yml | 382 +++++------------- .../roles/slurm_config/tasks/hpc_tools.yml | 49 +-- discovery/roles/slurm_config/vars/main.yml | 30 +- 3 files changed, 152 insertions(+), 309 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 45aa87d2d2..c8bdb5d335 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,314 +12,144 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -- name: Include variable file omnia_config.yml - ansible.builtin.include_vars: "{{ input_project_dir }}/omnia_config.yml" -- name: Include storage vars - ansible.builtin.include_vars: "{{ input_project_dir }}/storage_config.yml" - -- name: Load slurm_custom.json for x86_64 - ansible.builtin.include_vars: - file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" - name: slurm_custom_x86_64 - failed_when: false - -- name: Load slurm_custom.json for aarch64 - ansible.builtin.include_vars: - file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" - name: slurm_custom_aarch64 - failed_when: false - -- name: Extract CUDA runfile name for x86_64 from slurm_custom.json - ansible.builtin.set_fact: - cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: - - slurm_custom_x86_64 is defined - - slurm_custom_x86_64.slurm_node is defined - - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - -- name: Extract CUDA runfile name for aarch64 from slurm_custom.json - ansible.builtin.set_fact: - cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: - - slurm_custom_aarch64 is defined - - slurm_custom_aarch64.slurm_node is defined - - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - -- name: Set facts for slurm - ansible.builtin.set_fact: - nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" - -- name: Read the slurm mount point - ansible.builtin.set_fact: - share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}" - nfs_server_ip: "{{ (nfs_client_params | selectattr('nfs_name', 
'equalto', nfs_storage_name) | first).server_ip }}" - nfs_server_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_share_path }}" - -- name: Set facts for slurm - ansible.builtin.set_fact: - cluster_name: "{{ slurm_cluster[0].cluster_name }}" - configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}" - slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" - controller_trackfile_path: "{{ share_path }}/ctld_track" - -- name: Build parallel copy list for HPC tools - ansible.builtin.set_fact: - parallel_copy_pairs: [] - -- name: Configure openldap if supported - ansible.builtin.include_tasks: openldap_config.yml - when: hostvars['localhost']['openldap_support'] - -- name: Set facts for slurm - ansible.builtin.set_fact: - share_prefix: "{{ slurm_config_path }}" - when: conf_in_nfs - -- name: Clear the share directory - ansible.builtin.file: - path: "{{ slurm_config_path }}" - state: absent - when: clear_slurm_files - -- name: Create the slurm directory in share +- name: Create HPC tools directories on share ansible.builtin.file: - path: "{{ slurm_config_path }}" + path: "{{ slurm_config_path }}/hpc_tools/{{ item }}" state: directory owner: root group: root mode: "{{ common_mode }}" + loop: + - cuda + - runfile + - scripts + - container_images -# This directory is created to store the controller track file in NFS -# The track file is generated only after the Slurm controller has been fully configured in a fresh deployment -- name: Create directory for controller init track file in share - ansible.builtin.file: - path: "{{ controller_trackfile_path }}" - state: directory - owner: root - group: root - mode: "{{ common_mode }}" +- name: Deploy download_container_image.sh to NFS share + ansible.builtin.template: + src: "download_container_image.sh.j2" + dest: "{{ download_container_image_path }}" + owner: "{{ root_user }}" + group: "{{ root_group 
}}" + mode: "0755" + +- name: Deploy container_image.list to NFS share + ansible.builtin.template: + src: "container_image.list.j2" + dest: "{{ container_image_list_path }}" + owner: "{{ root_user }}" + group: "{{ root_group }}" + mode: "0644" + +- name: Set fact for pulp mirror + ansible.builtin.set_fact: + pulp_mirror: "{{ hostvars['localhost']['admin_nic_ip'] }}:2225" -- name: Create the slurm ctld directory on share +- name: Create x86_64 package base directory ansible.builtin.file: - path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}" + path: "{{ packages_base_dir_x86_64 }}" state: directory - owner: root - group: root - mode: "{{ common_mode }}" - when: ctld_list - loop: "{{ ctld_list | product(ctld_dir) }}" + mode: '{{ common_mode }}' -- name: Create the slurm cmpt directory on share +- name: Create aarch64 package base directory ansible.builtin.file: - path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}" + path: "{{ packages_base_dir_aarch64 }}" state: directory - owner: root - group: root - mode: "{{ common_mode }}" - when: cmpt_list or login_list or compiler_login_list - loop: "{{ (cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}" + mode: '{{ common_mode }}' -- name: Create the cert directory on share +- name: Create x86_64 package layout directories ansible.builtin.file: - path: "{{ slurm_config_path }}/cert" + path: "{{ packages_base_dir_x86_64 }}/{{ item }}" state: directory - owner: root - group: root - mode: "{{ common_mode }}" - -- name: Copy pulp webserver certificate to client_share_path - ansible.builtin.copy: - src: "{{ pulp_webserver_cert_path }}" - dest: "{{ slurm_config_path }}/cert" - mode: "{{ file_mode }}" - become: true + mode: '{{ common_mode }}' + loop: "{{ packages_layout_x86_64 }}" -- name: Create HPC tools directories on share +- name: Create aarch64 package layout directories ansible.builtin.file: - path: "{{ slurm_config_path }}/hpc_tools/{{ item }}" + path: "{{ packages_base_dir_aarch64 }}/{{ item 
}}" state: directory - owner: root - group: root - mode: "{{ common_mode }}" - loop: - - cuda - - runfile - - scripts - - container_images - - nvidia_sdk - - benchmarks + mode: '{{ common_mode }}' + loop: "{{ packages_layout_aarch64 }}" -- name: Set NFS info fact - ansible.builtin.set_fact: - oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" +- name: Print copy paths for x86_64 + ansible.builtin.debug: + msg: "{{ print_copy_msg }}" + loop: "{{ offline_path_x86_64 | default([]) }}" -- name: Initialize parallel copy pairs - ansible.builtin.set_fact: - parallel_copy_pairs: [] +- name: Print copy paths for aarch64 + ansible.builtin.debug: + msg: "{{ print_copy_msg }}" + loop: "{{ offline_path_aarch64 | default([]) }}" -- name: Check which parallel copy source directories exist +- name: Check x86_64 offline package sources ansible.builtin.stat: - path: "{{ item.src }}" - loop: "{{ parallel_copy_candidates }}" - register: copy_source_checks - failed_when: false + path: "{{ item.source_path }}" + loop: "{{ offline_path_x86_64 | default([]) }}" + register: x86_64_offline_pkg_sources -- name: Add only valid copy pairs (source exists) - ansible.builtin.set_fact: - parallel_copy_pairs: >- - {{ parallel_copy_pairs + - [[ item.item.src, item.item.dest ]] }} - loop: "{{ copy_source_checks.results }}" - when: item.stat.exists - -- name: Parallel copy HPC tool files - parallel_file_copy: - copy_pairs: "{{ parallel_copy_pairs }}" - max_workers: "{{ parallel_copy_max_workers }}" - when: parallel_copy_pairs | length > 0 - -- name: Check if munge key exists top level +- name: Check aarch64 offline package sources ansible.builtin.stat: - path: "{{ slurm_config_path }}/munge.key" - register: munge_present - -- name: Ensure munge key is generated - ansible.builtin.shell: "{{ munge_key_cmd }} > {{ slurm_config_path }}/munge.key" - when: not munge_present.stat.exists - register: munge_gen - changed_when: munge_gen.rc == 0 + path: "{{ item.source_path }}" + loop: "{{ 
offline_path_aarch64 | default([]) }}" + register: aarch64_offline_pkg_sources -- name: Distribute the munge key +- name: Copy x86_64 offline packages ansible.builtin.copy: - src: "{{ slurm_config_path }}/munge.key" - dest: "{{ slurm_config_path }}/{{ item }}/etc/munge/munge.key" - mode: "{{ common_mode }}" + src: "{{ item.item.source_path }}/" + dest: "{{ item.item.dest_path }}/" remote_src: true - loop: "{{ (ctld_list | default([])) + - (cmpt_list | default([])) + - (compiler_login_list | default([])) + - (login_list | default([])) }}" - -- name: Slurm path ops - ansible.builtin.set_fact: - conf_path_items: "{{ conf_path_items | default({}) | combine({item.key: item.value}) }}" - when: item.value is string - loop: "{{ configs_input | dict2items }}" - -- name: Slurm dict ops - ansible.builtin.set_fact: - conf_dict_items: "{{ conf_dict_items | default({}) | combine({item.key: item.value}) }}" - when: item.value is mapping - loop: "{{ configs_input | dict2items }}" - -- name: Slurm dict ops - ansible.builtin.set_fact: - apply_config: >- - {{ apply_config | default({}) - | combine({ - item: ( - (__default_config[item] | default({})) - | combine(conf_dict_items[item] | default({})) - ) - }) - }} - loop: "{{ conf_files }}" - -- name: Read NodeName parameters - ansible.builtin.include_tasks: read_node_idrac.yml - when: cmpt_list - loop: "{{ cmpt_list }}" + mode: preserve + loop: "{{ x86_64_offline_pkg_sources.results | default([]) }}" + when: + - item.stat.exists + - item.item.source_path | length > 0 + - item.item.dest_path | length > 0 -- name: Copy conf file if provided +- name: Copy aarch64 offline packages ansible.builtin.copy: - src: "{{ conf_path_items.get(item.1) }}" - dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf" - mode: "{{ conf_file_mode }}" - remote_src: "{{ copy_from_oim }}" - when: ctld_list - loop: "{{ ctld_list | product(conf_path_items.keys() | default([])) }}" - -- name: Add gpu parameters to slurm conf - 
ansible.builtin.set_fact: - apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" - when: gpu_params is defined and gpu_params - -- name: Verify slurm conf keys only - ansible.builtin.assert: - that: - - (apply_config[item].keys() | list) | difference(__conf_keys[item]) | length == 0 - fail_msg: "The following {{ item }} config keys are invalid: {{ apply_config[item].keys() | list | difference(__conf_keys[item]) | join(', ') }}" - when: apply_config[item] and __conf_keys[item] - loop: "{{ conf_files }}" - -- name: Slurm dict ops - ansible.builtin.set_fact: - slurm_conf_dict: "{{ apply_config['slurm'] }}" - -- name: Create all .conf for ctld only - ansible.builtin.template: - src: "{{ item.1 }}.conf.j2" - dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf" - owner: "{{ root_user }}" - group: "{{ root_group }}" - mode: "{{ conf_file_mode }}" - when: ctld_list - loop: "{{ ctld_list | product(conf_files | difference(conf_path_items.keys() | default([]))) }}" - -- name: Create mariadb cnf - ansible.builtin.template: - src: "mariadb-server.cnf.j2" - dest: "{{ slurm_config_path }}/{{ item }}/etc/my.cnf.d/mariadb-server.cnf" - owner: "{{ root_user }}" - group: "{{ root_group }}" - mode: "{{ conf_file_mode }}" - when: ctld_list - loop: "{{ ctld_list }}" + src: "{{ item.item.source_path }}/" + dest: "{{ item.item.dest_path }}/" + remote_src: true + mode: preserve + loop: "{{ aarch64_offline_pkg_sources.results | default([]) }}" + when: + - item.stat.exists + - item.item.source_path | length > 0 + - item.item.dest_path | length > 0 -- name: Generate slurmd opts for Configless +- name: Set NFS info fact ansible.builtin.set_fact: - conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (apply_config['slurm']['SlurmctldPort'] | string)) | join(',') }}" - -- name: Create epilog.sh and slurmd.service - ansible.builtin.template: - src: "{{ item.1 }}.j2" - dest: "{{ 
slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}" - owner: "{{ root_user }}" - group: "{{ root_group }}" - mode: "{{ conf_file_mode }}" - when: cmpt_list - loop: "{{ cmpt_list | product(['logout_user.sh', 'slurmd.service']) }}" - -- name: Create slurmd.service in login and login_compiler - ansible.builtin.template: - src: "{{ item.1 }}.j2" - dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}" - owner: "{{ root_user }}" - group: "{{ root_group }}" - mode: "{{ conf_file_mode }}" - when: login_list or compiler_login_list - loop: "{{ (login_list + compiler_login_list) | product(['slurmd.service']) }}" - -- name: Get the slurm NFS path - ansible.builtin.debug: - msg: "The slurm NFS path is {{ share_path }}/slurm" + oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" -- name: NFS path for cloud init - ansible.builtin.set_fact: - cloud_init_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}/slurm" +- name: Check if source directory exists + ansible.builtin.stat: + path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" + register: src_dir_check_x86_64 -- name: NFS path for controller trackfile - ansible.builtin.set_fact: - trackfile_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}/ctld_track" +- name: Check if source directory exists + ansible.builtin.stat: + path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" + register: src_dir_check_aarch64 -- name: NFS path for cloud init - ansible.builtin.set_fact: - cloud_init_nfs_path_openldap: "{{ nfs_server_ip }}:{{ nfs_server_path }}/openldap" - when: hostvars['localhost']['openldap_support'] +- name: Copy cuda run file using copy module for aarch64 + ansible.builtin.copy: + src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" + dest: "{{ slurm_config_path }}/hpc_tools/runfile/" + mode: '0755' + owner: root + group: root + directory_mode: '0755' + remote_src: true + 
when: src_dir_check_aarch64.stat.exists and src_dir_check_aarch64.stat.isdir -# This will be mounted for ucx, openmpi and ldms configurations on slurm nodes -- name: NFS path for ucx, openmpi and ldms cloud init - ansible.builtin.set_fact: - cloud_init_slurm_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}" - client_mount_path: "{{ share_path }}" +- name: Copy cuda run file using copy module for x86_64 + ansible.builtin.copy: + src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" + dest: "{{ slurm_config_path }}/hpc_tools/runfile/" + mode: '0755' + owner: root + group: root + directory_mode: '0755' + remote_src: true + when: src_dir_check_x86_64.stat.exists and src_dir_check_x86_64.stat.isdir diff --git a/discovery/roles/slurm_config/tasks/hpc_tools.yml b/discovery/roles/slurm_config/tasks/hpc_tools.yml index c8bdb5d335..4eb511f80c 100644 --- a/discovery/roles/slurm_config/tasks/hpc_tools.yml +++ b/discovery/roles/slurm_config/tasks/hpc_tools.yml @@ -122,34 +122,27 @@ ansible.builtin.set_fact: oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" -- name: Check if source directory exists - ansible.builtin.stat: - path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" - register: src_dir_check_x86_64 +- name: Build parallel copy list for HPC tools + ansible.builtin.set_fact: + parallel_copy_pairs: [] -- name: Check if source directory exists +- name: Check which parallel copy source directories exist ansible.builtin.stat: - path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" - register: src_dir_check_aarch64 - -- name: Copy cuda run file using copy module for aarch64 - ansible.builtin.copy: - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - mode: '0755' - owner: root - group: root - directory_mode: '0755' - remote_src: true - when: 
src_dir_check_aarch64.stat.exists and src_dir_check_aarch64.stat.isdir + path: "{{ item.src }}" + loop: "{{ parallel_copy_candidates }}" + register: copy_source_checks + failed_when: false -- name: Copy cuda run file using copy module for x86_64 - ansible.builtin.copy: - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - mode: '0755' - owner: root - group: root - directory_mode: '0755' - remote_src: true - when: src_dir_check_x86_64.stat.exists and src_dir_check_x86_64.stat.isdir +- name: Add only valid copy pairs (source exists) + ansible.builtin.set_fact: + parallel_copy_pairs: >- + {{ parallel_copy_pairs + + [[ item.item.src, item.item.dest ]] }} + loop: "{{ copy_source_checks.results }}" + when: item.stat.exists + +- name: Parallel copy HPC tool files + parallel_file_copy: + copy_pairs: "{{ parallel_copy_pairs }}" + max_workers: "{{ parallel_copy_max_workers }}" + when: parallel_copy_pairs | length > 0 diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index a4717bd662..9b2ea90c89 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -103,10 +103,30 @@ auth_tls_certs_path: "/opt/omnia/auth/tls_certs/ldapserver.crt" slurm_installation_type: configless pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" controller_empty_msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun discovery.yml." 
-# nvidia sdk vars -nvhpc_package_name: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0" -nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_package_name }}/{{ nvhpc_package_name }}.tar.gz" -nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk" +download_container_image_path: "{{ slurm_config_path }}/hpc_tools/scripts/download_container_image.sh" +container_image_list_path: "{{ slurm_config_path }}/hpc_tools/scripts/container_image.list" +pulp_mirror: "{{ hostvars['localhost']['admin_nic_ip'] }}:2225" +packages_base_dir_x86_64: "{{ slurm_config_path }}/packages/x86_64" +packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64" +offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" +offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" +packages_layout_x86_64: + - doca-ofed + - cuda +packages_layout_aarch64: + - doca-ofed + - cuda +print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" +offline_path_x86_64: + - name: doca-ofed + source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" + dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" +offline_path_aarch64: + - name: doca-ofed + source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" + dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" + +ssh_private_key_path: /root/.ssh/oim_rsa # parallel file copy parallel_copy_max_workers: 4 From 166cf2990555c6f092dad25f33722d796bf186bf Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 28 Jan 2026 15:26:30 +0000 Subject: [PATCH 005/172] updating vars Signed-off-by: sakshi-singla-1735 --- .../slurm_config/tasks/create_slurm_dir.yml | 284 +++++++++++------- discovery/roles/slurm_config/vars/main.yml | 6 +- 2 files changed, 184 insertions(+), 106 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 
c8bdb5d335..662802274b 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -12,144 +12,222 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Include variable file omnia_config.yml + ansible.builtin.include_vars: "{{ input_project_dir }}/omnia_config.yml" -- name: Create HPC tools directories on share +- name: Include storage vars + ansible.builtin.include_vars: "{{ input_project_dir }}/storage_config.yml" + +- name: Load slurm_custom.json for x86_64 + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" + name: slurm_custom_x86_64 + failed_when: false + +- name: Load slurm_custom.json for aarch64 + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" + name: slurm_custom_aarch64 + failed_when: false + +- name: Extract CUDA runfile name for x86_64 from slurm_custom.json + ansible.builtin.set_fact: + cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" + when: + - slurm_custom_x86_64 is defined + - slurm_custom_x86_64.slurm_node is defined + - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 + +- name: Extract CUDA runfile name for aarch64 from slurm_custom.json + ansible.builtin.set_fact: + cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" + when: + - slurm_custom_aarch64 is defined + - slurm_custom_aarch64.slurm_node is defined + - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 + +- name: Set facts for slurm + 
ansible.builtin.set_fact: + nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" + +- name: Read the slurm mount point + ansible.builtin.set_fact: + share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}" + nfs_server_ip: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_ip }}" + nfs_server_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_share_path }}" + +- name: Set facts for slurm + ansible.builtin.set_fact: + cluster_name: "{{ slurm_cluster[0].cluster_name }}" + configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}" + slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" + controller_trackfile_path: "{{ share_path }}/ctld_track" + +- name: Configure openldap if supported + ansible.builtin.include_tasks: openldap_config.yml + when: hostvars['localhost']['openldap_support'] + +- name: Set facts for slurm + ansible.builtin.set_fact: + share_prefix: "{{ slurm_config_path }}" + when: conf_in_nfs + +- name: Clear the share directory + ansible.builtin.file: + path: "{{ slurm_config_path }}" + state: absent + when: clear_slurm_files + +- name: Create the slurm directory in share ansible.builtin.file: - path: "{{ slurm_config_path }}/hpc_tools/{{ item }}" + path: "{{ slurm_config_path }}" state: directory owner: root group: root mode: "{{ common_mode }}" - loop: - - cuda - - runfile - - scripts - - container_images -- name: Deploy download_container_image.sh to NFS share - ansible.builtin.template: - src: "download_container_image.sh.j2" - dest: "{{ download_container_image_path }}" - owner: "{{ root_user }}" - group: "{{ root_group }}" - mode: "0755" - -- name: Deploy container_image.list to NFS share - ansible.builtin.template: - src: "container_image.list.j2" - dest: "{{ container_image_list_path }}" - owner: "{{ root_user 
}}" - group: "{{ root_group }}" - mode: "0644" - -- name: Set fact for pulp mirror - ansible.builtin.set_fact: - pulp_mirror: "{{ hostvars['localhost']['admin_nic_ip'] }}:2225" - -- name: Create x86_64 package base directory +# This directory is created to store the controller track file in NFS +# The track file is generated only after the Slurm controller has been fully configured in a fresh deployment +- name: Create directory for controller init track file in share ansible.builtin.file: - path: "{{ packages_base_dir_x86_64 }}" + path: "{{ controller_trackfile_path }}" state: directory - mode: '{{ common_mode }}' + owner: root + group: root + mode: "{{ common_mode }}" -- name: Create aarch64 package base directory +- name: Create the slurm ctld directory on share ansible.builtin.file: - path: "{{ packages_base_dir_aarch64 }}" + path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}" state: directory - mode: '{{ common_mode }}' + owner: root + group: root + mode: "{{ common_mode }}" + when: ctld_list + loop: "{{ ctld_list | product(ctld_dir) }}" -- name: Create x86_64 package layout directories +- name: Create the slurm cmpt directory on share ansible.builtin.file: - path: "{{ packages_base_dir_x86_64 }}/{{ item }}" + path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}" state: directory - mode: '{{ common_mode }}' - loop: "{{ packages_layout_x86_64 }}" + owner: root + group: root + mode: "{{ common_mode }}" + when: cmpt_list or login_list or compiler_login_list + loop: "{{ (cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}" -- name: Create aarch64 package layout directories +- name: Create the cert directory on share ansible.builtin.file: - path: "{{ packages_base_dir_aarch64 }}/{{ item }}" + path: "{{ slurm_config_path }}/cert" state: directory - mode: '{{ common_mode }}' - loop: "{{ packages_layout_aarch64 }}" + owner: root + group: root + mode: "{{ common_mode }}" -- name: Print copy paths for x86_64 - ansible.builtin.debug: - msg: 
"{{ print_copy_msg }}" - loop: "{{ offline_path_x86_64 | default([]) }}" +- name: Copy pulp webserver certificate to client_share_path + ansible.builtin.copy: + src: "{{ pulp_webserver_cert_path }}" + dest: "{{ slurm_config_path }}/cert" + mode: "{{ file_mode }}" + become: true -- name: Print copy paths for aarch64 - ansible.builtin.debug: - msg: "{{ print_copy_msg }}" - loop: "{{ offline_path_aarch64 | default([]) }}" +- name: Create hpc tools dirs + ansible.builtin.include_tasks: hpc_tools.yml -- name: Check x86_64 offline package sources +- name: Check if munge key exists top level ansible.builtin.stat: - path: "{{ item.source_path }}" - loop: "{{ offline_path_x86_64 | default([]) }}" - register: x86_64_offline_pkg_sources + path: "{{ slurm_config_path }}/munge.key" + register: munge_present -- name: Check aarch64 offline package sources - ansible.builtin.stat: - path: "{{ item.source_path }}" - loop: "{{ offline_path_aarch64 | default([]) }}" - register: aarch64_offline_pkg_sources +- name: Ensure munge key is generated + ansible.builtin.shell: "{{ munge_key_cmd }} > {{ slurm_config_path }}/munge.key" + when: not munge_present.stat.exists + register: munge_gen + changed_when: munge_gen.rc == 0 -- name: Copy x86_64 offline packages +- name: Distribute the munge key ansible.builtin.copy: - src: "{{ item.item.source_path }}/" - dest: "{{ item.item.dest_path }}/" + src: "{{ slurm_config_path }}/munge.key" + dest: "{{ slurm_config_path }}/{{ item }}/etc/munge/munge.key" + mode: "{{ common_mode }}" remote_src: true - mode: preserve - loop: "{{ x86_64_offline_pkg_sources.results | default([]) }}" - when: - - item.stat.exists - - item.item.source_path | length > 0 - - item.item.dest_path | length > 0 + loop: "{{ (ctld_list | default([])) + + (cmpt_list | default([])) + + (compiler_login_list | default([])) + + (login_list | default([])) }}" -- name: Copy aarch64 offline packages - ansible.builtin.copy: - src: "{{ item.item.source_path }}/" - dest: "{{ 
item.item.dest_path }}/" - remote_src: true - mode: preserve - loop: "{{ aarch64_offline_pkg_sources.results | default([]) }}" - when: - - item.stat.exists - - item.item.source_path | length > 0 - - item.item.dest_path | length > 0 +- name: Conf merge and write using slurm_conf module + ansible.builtin.include_tasks: confs.yml + +- name: Create mariadb cnf + ansible.builtin.template: + src: "mariadb-server.cnf.j2" + dest: "{{ slurm_config_path }}/{{ item }}/etc/my.cnf.d/mariadb-server.cnf" + owner: "{{ root_user }}" + group: "{{ root_group }}" + mode: "{{ conf_file_mode }}" + when: ctld_list + loop: "{{ ctld_list }}" -- name: Set NFS info fact +- name: Generate slurmd opts for Configless ansible.builtin.set_fact: - oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" + conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (apply_config['slurm']['SlurmctldPort'] | string)) | join(',') }}" -- name: Check if source directory exists - ansible.builtin.stat: - path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" - register: src_dir_check_x86_64 +- name: Create epilog.sh and slurmd.service + ansible.builtin.template: + src: "{{ item.1 }}.j2" + dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}" + owner: "{{ root_user }}" + group: "{{ root_group }}" + mode: "{{ conf_file_mode }}" + when: cmpt_list + loop: "{{ cmpt_list | product(['logout_user.sh', 'slurmd.service']) }}" -- name: Check if source directory exists - ansible.builtin.stat: - path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" - register: src_dir_check_aarch64 +- name: Create slurmd.service in login and login_compiler + ansible.builtin.template: + src: "{{ item.1 }}.j2" + dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}" + owner: "{{ root_user }}" + group: "{{ root_group }}" + mode: "{{ conf_file_mode }}" + when: login_list or compiler_login_list + loop: 
"{{ (login_list + compiler_login_list) | product(['slurmd.service']) }}" -- name: Copy cuda run file using copy module for aarch64 - ansible.builtin.copy: - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - mode: '0755' +- name: Get the slurm NFS path + ansible.builtin.debug: + msg: "The slurm NFS path is {{ share_path }}/slurm" + +- name: NFS path for cloud init + ansible.builtin.set_fact: + cloud_init_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}/slurm" + +- name: NFS path for controller trackfile + ansible.builtin.set_fact: + trackfile_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}/ctld_track" + +- name: NFS path for cloud init + ansible.builtin.set_fact: + cloud_init_nfs_path_openldap: "{{ nfs_server_ip }}:{{ nfs_server_path }}/openldap" + when: hostvars['localhost']['openldap_support'] + +# This will be mounted for ucx, openmpi and ldms configurations on slurm nodes +- name: NFS path for ucx, openmpi and ldms cloud init + ansible.builtin.set_fact: + cloud_init_slurm_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}" + client_mount_path: "{{ share_path }}" + +- name: Ensure SSH key directory exists on Slurm share + ansible.builtin.file: + path: "{{ slurm_config_path }}/ssh" + state: directory owner: root group: root - directory_mode: '0755' - remote_src: true - when: src_dir_check_aarch64.stat.exists and src_dir_check_aarch64.stat.isdir + mode: '0700' -- name: Copy cuda run file using copy module for x86_64 +- name: Copy OIM private key to Slurm share for node-to-node SSH ansible.builtin.copy: - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - mode: '0755' + src: "{{ ssh_private_key_path }}" + dest: "{{ slurm_config_path }}/ssh/oim_rsa" owner: root group: root - directory_mode: '0755' - remote_src: true - when: src_dir_check_x86_64.stat.exists and 
src_dir_check_x86_64.stat.isdir + mode: '0600' diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 9b2ea90c89..23434e6765 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -148,6 +148,6 @@ parallel_copy_candidates: dest: "{{ slurm_config_path }}/hpc_tools/runfile/" # NVIDIA HPC SDK (x86_64 tarball extracted dir) - - name: nvhpc_sdk_x86_64 - src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" - dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" + # - name: nvhpc_sdk_x86_64 + # src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" + # dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" From 87a022ab9746bb5ddcd559db83fce1a3f5ab6843 Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 29 Jan 2026 13:58:51 +0530 Subject: [PATCH 006/172] openmpi ucx template approach --- .vscode/.checkmarxIgnored | 1 + ...i-group-login_compiler_node_x86_64.yaml.j2 | 82 ++++++------------- .../ci-group-slurm_node_x86_64.yaml.j2 | 12 +++ .../hpc_tools/configure_ucx_openmpi_env.sh.j2 | 56 +++++++++++++ .../templates/hpc_tools/install_openmpi.sh.j2 | 73 +++++++++++++++++ .../templates/hpc_tools/install_ucx.sh.j2 | 55 +++++++++++++ 6 files changed, 222 insertions(+), 57 deletions(-) create mode 100644 .vscode/.checkmarxIgnored create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2 diff --git a/.vscode/.checkmarxIgnored b/.vscode/.checkmarxIgnored new file mode 100644 index 0000000000..9e26dfeeb6 --- /dev/null +++ b/.vscode/.checkmarxIgnored @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 3195fad9e3..cc8586193f 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -190,6 +190,18 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} + - path: /usr/local/bin/install_openmpi.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_openmpi.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_ucx.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} + - path: /etc/hosts append: true content: | @@ -299,66 +311,22 @@ {% endif %} {% if hostvars['localhost']['ucx_support'] %} - # UCX build and install - - | - UCX_BIN={{ client_mount_path }}/benchmarks/ucx - mkdir -p {{ client_mount_path }}/compile/ucx - mkdir -p {{ client_mount_path }}/benchmarks/ucx - cd {{ client_mount_path }}/compile/ucx - wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz -O ucx.tar.gz - tar xzf ucx.tar.gz - cd ucx-* - mkdir -p build - cd build - ../contrib/configure-release --prefix={{ client_mount_path }}/benchmarks/ucx - make -j 8 - make install + - echo "===== UCX Setup =====" + - echo "UCX support is enabled." + - /usr/local/bin/install_ucx.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_ucx.sh" + # - echo "NFS must be mounted at {{ client_mount_path }} before running." 
{% endif %} {% if hostvars['localhost']['openmpi_support'] %} - # OpenMPI build and install with UCX + Slurm detection - - | - OPENMPI_INSTALL_PREFIX="{{ client_mount_path }}/benchmarks/openmpi" - OPENMPI_SRC="{{ client_mount_path }}/compile/openmpi" - mkdir -p $OPENMPI_SRC - mkdir -p $OPENMPI_INSTALL_PREFIX - - cd $OPENMPI_SRC - wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz -O openmpi.tar.gz - - tar xzf openmpi.tar.gz - cd openmpi-* - mkdir -p build - - # Check Slurm - if sinfo >/dev/null 2>&1; then - SLURM_FLAG="--with-slurm=yes --with-munge=/usr" - else - SLURM_FLAG="--with-slurm=no" - fi - - # Check UCX - if [ -x "{{ client_mount_path }}/benchmarks/ucx/bin/ucx_info" ]; then - {{ client_mount_path }}/benchmarks/ucx/bin/ucx_info -v - if [ $? -eq 0 ]; then - UCX_FLAG="--with-ucx={{ client_mount_path }}/benchmarks/ucx" - else - echo "ucx_info failed, disabling UCX" - UCX_FLAG="" - fi - else - echo "ucx_info not found, disabling UCX" - UCX_FLAG="" - fi - - cd build - ../configure --prefix=$OPENMPI_INSTALL_PREFIX \ - --enable-mpi1-compatibility \ - --enable-prte-prefix-by-default \ - $SLURM_FLAG $UCX_FLAG 2>&1 | tee config.out - - make -j 8 - make install + - echo "===== OpenMPI Setup =====" + - echo "OpenMPI support is enabled." + - /usr/local/bin/install_openmpi.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_openmpi.sh" + # - echo "Run UCX installation first if UCX support is enabled." + # - echo "NFS must be mounted at {{ client_mount_path }} before running." 
{% endif %} {% if hostvars['localhost']['ldms_support'] %} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 80347f6854..27eb60456e 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -455,6 +455,18 @@ - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab - mount -a + - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." + - echo "Shared NFS mount is available at: {{ client_mount_path }}" + - /usr/local/bin/configure_ucx_openmpi_env.sh + # - echo "" + # - echo "IMPORTANT:" + # - echo "1. Install UCX and/or OpenMPI on the LOGIN / COMPILER node first." + # - echo "2. Ensure they are installed under the shared mount:" + # - echo " {{ client_mount_path }}/hpc_tools/benchmarks/" + # - echo "3. On this node, run the environment setup script when ready:" + # - echo "" + # - echo "This step is intentionally NOT run automatically." 
+ - echo "==================================================" {% endif %} {% if hostvars['localhost']['ldms_support'] %} diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 new file mode 100644 index 0000000000..4064eddbb1 --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 @@ -0,0 +1,56 @@ +#!/bin/bash +LOGFILE="/var/log/configure_ucx_openmpi_env.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== Configuring UCX / OpenMPI environment (Slurm node) =====" + +CLIENT_MOUNT="{{ client_mount_path }}" +UCX_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx" +OPENMPI_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi" + +PROFILE_DIR="/etc/profile.d" + +# Ensure client mount exists and is mounted +if ! mountpoint -q "$CLIENT_MOUNT"; then + echo "[WARN] $CLIENT_MOUNT is not mounted. Skipping UCX/OpenMPI env setup." + exit 0 +fi + +# ---------------- UCX ---------------- +if [ -d "$UCX_PREFIX/bin" ]; then + echo "[INFO] UCX detected at $UCX_PREFIX" + + cat > "$PROFILE_DIR/ucx.sh" < "$PROFILE_DIR/openmpi.sh" <> "$OPENMPI_PREFIX/openmpi_tar_output.log" 2>&1 +else + echo "openmpi.tar.gz already exists, skipping download." 
\ + >> "$OPENMPI_PREFIX/openmpi_tar_output.log" +fi + +tar xzf openmpi.tar.gz +cd openmpi-* +mkdir -p build + +# Slurm detection +if sinfo >/dev/null 2>&1; then + SLURM_FLAG="--with-slurm=yes --with-munge=/usr" +else + SLURM_FLAG="--with-slurm=no" +fi + +# UCX detection +if [ -x "{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx/bin/ucx_info" ]; then + UCX_FLAG="--with-ucx={{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx" +else + UCX_FLAG="" +fi + +cd build +../configure --prefix="$OPENMPI_PREFIX" \ + --enable-mpi1-compatibility \ + --enable-prte-prefix-by-default \ + $SLURM_FLAG $UCX_FLAG + +make -j {{ openmpi_build_threads | default(8) }} +make install + +# Configure OpenMPI environment variables system-wide +OPENMPI_ENV_FILE="/etc/profile.d/openmpi.sh" + +cat > "$OPENMPI_ENV_FILE" <> "$UCX_PREFIX/ucx_tar_output.log" 2>&1 +else + echo "ucx.tar.gz already exists, skipping download." \ + >> "$UCX_PREFIX/ucx_tar_output.log" +fi + +tar xzf ucx.tar.gz +cd ucx-* +mkdir -p build +cd build + +../contrib/configure-release --prefix="$UCX_PREFIX" +make -j {{ ucx_build_threads | default(8) }} +make install + +# Configure UCX environment variables system-wide +UCX_ENV_FILE="/etc/profile.d/ucx.sh" + +cat > "$UCX_ENV_FILE" < Date: Thu, 29 Jan 2026 14:04:04 +0530 Subject: [PATCH 007/172] adding template call --- .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 27eb60456e..4dfc45e213 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -408,6 +408,12 @@ permissions: '0644' content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} + + - path: 
/usr/local/bin/configure_ucx_openmpi_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/configure_ucx_openmpi_env.sh.j2') | indent(12) }} runcmd: - /usr/local/bin/set-ssh.sh From 721712b598d66020127edd66d21cb1559a3d0574 Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 29 Jan 2026 14:05:56 +0530 Subject: [PATCH 008/172] deleting .vscode folder --- .vscode/.checkmarxIgnored | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .vscode/.checkmarxIgnored diff --git a/.vscode/.checkmarxIgnored b/.vscode/.checkmarxIgnored deleted file mode 100644 index 9e26dfeeb6..0000000000 --- a/.vscode/.checkmarxIgnored +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file From 2f59b34af122cc0d07573d91ff6152dffbb7b7bb Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 29 Jan 2026 14:15:03 +0530 Subject: [PATCH 009/172] copyright update --- common/library/modules/parallel_file_copy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/library/modules/parallel_file_copy.py b/common/library/modules/parallel_file_copy.py index 8f46f5a881..a697764683 100644 --- a/common/library/modules/parallel_file_copy.py +++ b/common/library/modules/parallel_file_copy.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 3e773ece56757ed90b27df51cba9369c384c5bd6 Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 29 Jan 2026 16:23:17 +0530 Subject: [PATCH 010/172] nvidia hpc sdk changes wrt template approach --- .../library/module_utils/local_repo/config.py | 2 +- ...i-group-login_compiler_node_x86_64.yaml.j2 | 16 ++++ .../ci-group-slurm_node_x86_64.yaml.j2 | 14 ++++ .../hpc_tools/configure_nvhpc_env.sh.j2 | 71 ++++++++++++++++++ .../hpc_tools/export_nvhpc_env.sh.j2 | 73 ++++++++++++++++++ .../hpc_tools/install_nvhpc_sdk.sh.j2 | 75 +++++++++++++++++++ .../templates/hpc_tools/setup_nvhpc_sdk.sh.j2 | 71 ++++++++++++++++++ .../roles/slurm_config/tasks/hpc_tools.yml | 1 + discovery/roles/slurm_config/vars/main.yml | 11 ++- .../config/x86_64/rhel/10.0/slurm_custom.json | 5 ++ 10 files changed, 335 insertions(+), 4 deletions(-) create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 create mode 100644 discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 9c9af639fb..60debc51e3 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -81,7 +81,7 @@ } CLI_FILE_PATH = "/root/.config/pulp/cli.toml" POST_TIMEOUT = 3600 -TAR_POLL_VAL = 3 +TAR_POLL_VAL = 25 FILE_POLL_VAL = 1 ISO_POLL_VAL = 15 FILE_URI = "/pulp/api/v3/content/file/files/" diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index cc8586193f..245f2a7fb4 100644 --- 
a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -219,6 +219,18 @@ permissions: '0644' content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} + + - path: /usr/local/bin/setup_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvhpc_sdk.sh.j2') | indent(12) }} + + - path: /usr/local/bin/setup_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/configure_nvhpc_env.sh.j2') | indent(12) }} runcmd: - /usr/local/bin/set-ssh.sh @@ -334,4 +346,8 @@ - /root/ldms_sampler.sh {% endif %} + + # nvidia sdk install + - /usr/local/bin/install_nvhpc_sdk.sh + - /usr/local/bin/configure_nvhpc_env.sh - echo "Cloud-Init has completed successfully." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 4dfc45e213..06c4a1d413 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -415,6 +415,18 @@ content: | {{ lookup('template', 'templates/hpc_tools/configure_ucx_openmpi_env.sh.j2') | indent(12) }} + - path: /usr/local/bin/setup_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/setup_nvhpc_sdk.sh.j2') | indent(12) }} + + - path: /usr/local/bin/export_nvhpc_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/export_nvhpc_env.sh.j2') | indent(12) }} + runcmd: - /usr/local/bin/set-ssh.sh - 
/usr/local/bin/install_nvidia_driver.sh @@ -480,4 +492,6 @@ - /root/ldms_sampler.sh {% endif %} + - /usr/local/bin/setup_nvhpc_sdk.sh + - /usr/local/bin/export_nvhpc_env.sh - echo "Cloud-Init has completed successfully." diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 new file mode 100644 index 0000000000..3c7efbc88b --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 @@ -0,0 +1,71 @@ +#!/bin/bash +set -e + +LOGFILE="/var/log/nvhpc_env_config.log" +exec >> "$LOGFILE" 2>&1 + +echo "===== Configuring NVIDIA HPC SDK environment =====" + +# Cloud-init safe defaults +export HOME=/root + +NVCOMPILERS="{{ nvhpc_local_mount | default('/opt/nvidia/nvhpc') }}" +NVARCH="$(uname -s)_$(uname -m)" +NVHPC_VERSION="{{ nvhpc_version | default('25.11') }}" + +NVHPC_BASE="$NVCOMPILERS/$NVARCH/$NVHPC_VERSION" +PROFILE_FILE="/etc/profile.d/nvhpc.sh" + +if [ ! 
-d "$NVHPC_BASE/compilers/bin" ]; then + echo "[ERROR] NVHPC compilers not found at $NVHPC_BASE" + exit 1 +fi + +echo "[INFO] NVHPC detected at $NVHPC_BASE" +echo "[INFO] Writing persistent environment to $PROFILE_FILE" + +cat << EOF > "$PROFILE_FILE" +# NVIDIA HPC SDK environment +export NVCOMPILERS=$NVCOMPILERS +export NVARCH=$NVARCH +export NVHPC_VERSION=$NVHPC_VERSION + +export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/bin:\$PATH +export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/man + +# MPI (optional but recommended) +export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/bin:\$PATH +export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/man + +# Modules support (optional) +export MODULEPATH=\$NVCOMPILERS/modulefiles:\${MODULEPATH:-} +EOF + +chmod 644 "$PROFILE_FILE" + +# Source profile for current shell and all future non-login shells +if [ -f "$PROFILE_FILE" ]; then + echo "[INFO] Sourcing NVHPC profile for current shell" + source "$PROFILE_FILE" + grep -q "nvhpc.sh" /etc/bashrc || echo "source $PROFILE_FILE" >> /etc/bashrc +fi + +# NVHPC marker file path +MARKER_TARGET="{{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}/.nvhpc_env_ready" + +if ! grep -q "{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" /etc/fstab; then + echo "[ERROR] NVHPC NFS path not found in /etc/fstab" + exit 1 +fi + +echo "[INFO] NVHPC NFS entry found in /etc/fstab" + +if [ ! 
-d "{{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}" ]; then + echo "[ERROR] Marker directory missing: {{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}" + exit 1 +fi + +touch "$MARKER_TARGET" +echo "[SUCCESS] NVHPC marker created: $MARKER_TARGET" + +echo "===== NVHPC environment configuration completed successfully =====" \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 new file mode 100644 index 0000000000..20e3bb0e5f --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 @@ -0,0 +1,73 @@ +#!/bin/bash +set -e + +CLIENT_MOUNT="{{ client_mount_path }}" + +NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" +NVARCH="$(uname -s)_$(uname -m)" +NVHPC_VERSION="25.11" + +NVHPC_BASE="$NVHPC_LOCAL_MOUNT/$NVARCH/$NVHPC_VERSION" +PROFILE_FILE="/etc/profile.d/nvhpc.sh" +LOGFILE="/var/log/export_nvhpc_env.log" + +# Log everything +exec > >(tee -a "$LOGFILE") 2>&1 + +# Check that NFS is mounted +if ! mountpoint -q "$CLIENT_MOUNT"; then + echo "[ERROR] $CLIENT_MOUNT is not mounted." + echo " Please mount the NFS path before running export_nvhpc_env.sh" + exit 1 +fi + +echo "===== NVHPC environment export started =====" + +# Validate compilers directory exists +if [ ! -d "$NVHPC_BASE/compilers/bin" ]; then + echo "[ERROR] NVHPC compilers not found at:" + echo " $NVHPC_BASE/compilers/bin" + exit 1 +fi + +echo "[INFO] Writing persistent NVHPC profile at $PROFILE_FILE" + +# Write environment file system-wide +cat > "$PROFILE_FILE" </dev/null"; then + echo "[ERROR] nvc verification failed" + exit 1 +fi + +# Verify nvfortran +if ! 
bash -lc "command -v nvfortran && nvfortran --version >/dev/null"; then + echo "[ERROR] nvfortran verification failed" + exit 1 +fi + +echo "[SUCCESS] NVHPC environment exported successfully" +echo "[INFO] Environment file configured in $PROFILE_FILE" +echo "===== NVHPC export completed =====" \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 new file mode 100644 index 0000000000..bdf0e263d7 --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 @@ -0,0 +1,75 @@ +#!/bin/bash +set -e + +LOGFILE="/var/log/nvhpc_sdk_install.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== Starting NVIDIA HPC SDK installation =====" + +NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('nvhpc_2025_2511_Linux_x86_64_cuda_13.0') }}" +NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" +NVHPC_MOUNT="/shared-nvhpc-sdk" +NVHPC_TARBALL="{{ NVHPC_MOUNT }}/{{ NVHPC_PKG_NAME }}.tar.gz" +NVHPC_INSTALL_DIR_NFS="{{ NVHPC_MOUNT }}/nvhpc" +NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" +NVHPC_EXTRACT_DIR="{{ NVHPC_MOUNT }}/{{ NVHPC_PKG_NAME }}" + +# Skip if already mounted +if mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + echo "[INFO] $NVHPC_LOCAL_MOUNT already mounted. Skipping installation." + exit 0 +fi + +# Skip if local directory exists +if [ -d "$NVHPC_LOCAL_MOUNT" ]; then + echo "[INFO] $NVHPC_LOCAL_MOUNT exists. Assuming installed. Skipping." + exit 0 +fi + +mkdir -p "$NVHPC_MOUNT" +mount -t nfs "$NVHPC_EXPORT" "$NVHPC_MOUNT" + +# Check tarball +echo "[INFO] Checking NVIDIA HPC SDK tarball at $NVHPC_TARBALL..." +if [ ! -f "$NVHPC_TARBALL" ]; then + echo "[ERROR] NVIDIA HPC SDK tarball not found. Skipping installation." 
+ exit 0 +fi + +# Extract if needed +EXTRACT_SIZE_GB=$(du -sBG "$NVHPC_EXTRACT_DIR" 2>/dev/null | cut -f1 | tr -d 'G') +if [ -d "$NVHPC_EXTRACT_DIR" ] && [ "$EXTRACT_SIZE_GB" -ge 13 ] && [ -f "$NVHPC_EXTRACT_DIR/install" ]; then + echo "[INFO] NVHPC already extracted. Skipping." +else + echo "[INFO] Extracting NVIDIA HPC SDK tarball..." + tar -xzf "$NVHPC_TARBALL" -C "$NVHPC_MOUNT" \ + --checkpoint=2000 \ + --checkpoint-action=echo="[INFO] Extracting NVHPC... please wait" +fi + +mkdir -p "$NVHPC_INSTALL_DIR_NFS" +INSTALL_BIN_DIR="$NVHPC_INSTALL_DIR_NFS/Linux_x86_64/25.11/compilers/bin" + +if [ -x "$INSTALL_BIN_DIR/nvc" ]; then + echo "[INFO] NVHPC already installed. Skipping installer." +else + echo "[INFO] Running NVIDIA HPC SDK installer..." + cd "$NVHPC_EXTRACT_DIR" + NVHPC_SILENT=true NVHPC_INSTALL_DIR="$NVHPC_INSTALL_DIR_NFS" NVHPC_INSTALL_TYPE=auto ./install +fi + +echo "[SUCCESS] NVIDIA HPC SDK installation completed." + +# Mount NVHPC locally +mkdir -p "$NVHPC_LOCAL_MOUNT" +NVHPC_INSTALL_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" +FSTAB_ENTRY="$NVHPC_INSTALL_EXPORT $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" + +if ! grep -qE "^[^#].*$NVHPC_INSTALL_EXPORT[[:space:]]+$NVHPC_LOCAL_MOUNT[[:space:]]+nfs" /etc/fstab; then + echo "[INFO] Adding NVHPC mount to /etc/fstab" + echo "$FSTAB_ENTRY" >> /etc/fstab +fi + +echo "[INFO] Mounting $NVHPC_LOCAL_MOUNT..." 
+mount "$NVHPC_LOCAL_MOUNT" +echo "[INFO] NVHPC successfully mounted at $NVHPC_LOCAL_MOUNT" \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 new file mode 100644 index 0000000000..e81049e57c --- /dev/null +++ b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 @@ -0,0 +1,71 @@ + - path: /usr/local/bin/setup_nvhpc_sdk.sh + permissions: '0755' + content: | + #!/bin/bash + LOGFILE="/var/log/setup_nvhpc_sdk.log" + exec > >(tee -a "$LOGFILE") 2>&1 + + echo "===== NVHPC SDK setup (mount + wait) =====" + + PARENT_NFS="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" + PARENT_MOUNT="/shared-nvhpc-sdk" + + NVHPC_NFS_SHARE="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" + NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" + + NVHPC_MARKER="$PARENT_MOUNT/nvhpc/.nvhpc_env_ready" + + WAIT_TIMEOUT=3600 + SLEEP_INTERVAL=20 + ELAPSED=0 + + # 1. Mount parent export + mkdir -p "$PARENT_MOUNT" + + if ! mountpoint -q "$PARENT_MOUNT"; then + mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT" + fi + + if ! mountpoint -q "$PARENT_MOUNT"; then + echo "[ERROR] Failed to mount NVHPC parent export" + exit 1 + fi + + echo "[INFO] Parent NVHPC export mounted" + + # 2. Wait for readiness marker + echo "[INFO] Waiting for NVHPC readiness marker..." + + while [ ! -f "$NVHPC_MARKER" ]; do + if [ "$ELAPSED" -ge "$WAIT_TIMEOUT" ]; then + echo "[ERROR] Timeout waiting for NVHPC readiness marker" + exit 1 + fi + sleep "$SLEEP_INTERVAL" + ELAPSED=$((ELAPSED + SLEEP_INTERVAL)) + done + + echo "[SUCCESS] NVHPC readiness marker detected" + + # 3. Ensure fstab entry exists + if ! grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then + echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" >> /etc/fstab + echo "[INFO] NVHPC fstab entry added" + else + echo "[INFO] NVHPC fstab entry already present" + fi + + # 4. 
Mount NVHPC SDK + mkdir -p "$NVHPC_LOCAL_MOUNT" + + if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + mount "$NVHPC_LOCAL_MOUNT" + fi + + if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + echo "[ERROR] Failed to mount NVHPC SDK" + exit 1 + fi + + echo "[SUCCESS] NVHPC SDK mounted at $NVHPC_LOCAL_MOUNT" + echo "===== NVHPC setup completed =====" \ No newline at end of file diff --git a/discovery/roles/slurm_config/tasks/hpc_tools.yml b/discovery/roles/slurm_config/tasks/hpc_tools.yml index 4eb511f80c..46260da267 100644 --- a/discovery/roles/slurm_config/tasks/hpc_tools.yml +++ b/discovery/roles/slurm_config/tasks/hpc_tools.yml @@ -25,6 +25,7 @@ - runfile - scripts - container_images + - nvidia_sdk - name: Deploy download_container_image.sh to NFS share ansible.builtin.template: diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 23434e6765..201c98dded 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -128,6 +128,11 @@ offline_path_aarch64: ssh_private_key_path: /root/.ssh/oim_rsa +# nvidia sdk vars +nvhpc_package_name: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0" +nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_package_name }}/{{ nvhpc_package_name }}.tar.gz" +nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk" + # parallel file copy parallel_copy_max_workers: 4 @@ -148,6 +153,6 @@ parallel_copy_candidates: dest: "{{ slurm_config_path }}/hpc_tools/runfile/" # NVIDIA HPC SDK (x86_64 tarball extracted dir) - # - name: nvhpc_sdk_x86_64 - # src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" - # dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" + - name: nvhpc_sdk_x86_64 + src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" + dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" \ No newline at end of file diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json 
b/input/config/x86_64/rhel/10.0/slurm_custom.json index 9531239fd2..ecf628883b 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -34,6 +34,11 @@ {"package": "cuda-run", "type": "iso", "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run" + }, + { + "package": "nvhpc_2025_2511_Linux_x86_64_cuda_13.0", + "type": "tarball", + "url": "https://developer.download.nvidia.com/hpc-sdk/25.11/nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz" } ] }, From 18bffc836819f74038993f897a37f1569c23b5c4 Mon Sep 17 00:00:00 2001 From: mcas Date: Thu, 29 Jan 2026 16:24:48 +0530 Subject: [PATCH 011/172] filename chnage --- .../cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 245f2a7fb4..08cc8b79dd 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -220,13 +220,13 @@ content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} - - path: /usr/local/bin/setup_nvhpc_sdk.sh + - path: /usr/local/bin/install_nvhpc_sdk.sh owner: root:root permissions: '{{ file_mode_755 }}' content: | {{ lookup('template', 'templates/hpc_tools/install_nvhpc_sdk.sh.j2') | indent(12) }} - - path: /usr/local/bin/setup_nvhpc_sdk.sh + - path: /usr/local/bin/configure_nvhpc_env.sh owner: root:root permissions: '{{ file_mode_755 }}' content: | From ba185055ca7098697e27d3ec08bf60b568d739d1 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Thu, 29 Jan 2026 11:00:18 +0000 Subject: [PATCH 012/172] ansible lint --- 
discovery/roles/slurm_config/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 201c98dded..4c0f558acd 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -155,4 +155,4 @@ parallel_copy_candidates: # NVIDIA HPC SDK (x86_64 tarball extracted dir) - name: nvhpc_sdk_x86_64 src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" - dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" \ No newline at end of file + dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" From 351e4c406b9079e3ebea7bb224df01e3bbecba5d Mon Sep 17 00:00:00 2001 From: mcas Date: Tue, 3 Feb 2026 11:25:01 +0530 Subject: [PATCH 013/172] cuda path changes --- .../cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 | 2 +- .../cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 | 2 +- .../templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 | 2 +- .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index de236ed958..bc3068843a 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -98,7 +98,7 @@ echo "[INFO] Setting up shared CUDA directory..." # Create and mount shared directory for compute nodes mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit if [ $? -ne 0 ]; then echo "[ERROR] Failed to mount NFS cuda share. Exiting." 
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 08cc8b79dd..a1f8a55f50 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -105,7 +105,7 @@ echo "[INFO] Setting up shared CUDA directory..." # Create and mount shared directory for compute nodes mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit if [ $? -ne 0 ]; then echo "[ERROR] Failed to mount NFS cuda share. Exiting." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index cc784bdd10..9b3ac1a501 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -127,7 +127,7 @@ # Create mount point mkdir -p /usr/local/cuda - cuda_nfs_share="{{ cloud_init_nfs_path }}/cuda" + cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda" echo "[INFO] Mounting CUDA toolkit from NFS: $cuda_nfs_share" mount -t nfs "$cuda_nfs_share" /usr/local/cuda diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 06c4a1d413..67a300c0f7 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -135,7 +135,7 @@ # Create mount 
point mkdir -p /usr/local/cuda - cuda_nfs_share="{{ cloud_init_nfs_path }}/cuda" + cuda_nfs_share="{{ cloud_init_nfs_path }}/hpc_tools/cuda" echo "[INFO] Mounting CUDA toolkit from NFS: $cuda_nfs_share" mount -t nfs "$cuda_nfs_share" /usr/local/cuda From 2cea76e97a4ce6e892de63b1f51898ad9fdca819 Mon Sep 17 00:00:00 2001 From: mcas Date: Tue, 3 Feb 2026 16:49:45 +0530 Subject: [PATCH 014/172] changing variable call --- .../hpc_tools/install_nvhpc_sdk.sh.j2 | 6 +- .../templates/hpc_tools/setup_nvhpc_sdk.sh.j2 | 105 +++++++++--------- 2 files changed, 54 insertions(+), 57 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 index bdf0e263d7..26f3fd1775 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 @@ -9,10 +9,10 @@ echo "===== Starting NVIDIA HPC SDK installation =====" NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('nvhpc_2025_2511_Linux_x86_64_cuda_13.0') }}" NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" NVHPC_MOUNT="/shared-nvhpc-sdk" -NVHPC_TARBALL="{{ NVHPC_MOUNT }}/{{ NVHPC_PKG_NAME }}.tar.gz" -NVHPC_INSTALL_DIR_NFS="{{ NVHPC_MOUNT }}/nvhpc" +NVHPC_TARBALL="$NVHPC_MOUNT/${NVHPC_PKG_NAME}.tar.gz" +NVHPC_INSTALL_DIR_NFS="$NVHPC_MOUNT/nvhpc" NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" -NVHPC_EXTRACT_DIR="{{ NVHPC_MOUNT }}/{{ NVHPC_PKG_NAME }}" +NVHPC_EXTRACT_DIR="$NVHPC_MOUNT/${NVHPC_PKG_NAME}" # Skip if already mounted if mountpoint -q "$NVHPC_LOCAL_MOUNT"; then diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 index e81049e57c..b57061cd08 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 +++ 
b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 @@ -1,71 +1,68 @@ - - path: /usr/local/bin/setup_nvhpc_sdk.sh - permissions: '0755' - content: | - #!/bin/bash - LOGFILE="/var/log/setup_nvhpc_sdk.log" - exec > >(tee -a "$LOGFILE") 2>&1 +#!/bin/bash +LOGFILE="/var/log/setup_nvhpc_sdk.log" +exec > >(tee -a "$LOGFILE") 2>&1 - echo "===== NVHPC SDK setup (mount + wait) =====" +echo "===== NVHPC SDK setup (mount + wait) =====" - PARENT_NFS="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" - PARENT_MOUNT="/shared-nvhpc-sdk" +PARENT_NFS="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" +PARENT_MOUNT="/shared-nvhpc-sdk" - NVHPC_NFS_SHARE="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" - NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" +NVHPC_NFS_SHARE="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" +NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" - NVHPC_MARKER="$PARENT_MOUNT/nvhpc/.nvhpc_env_ready" +NVHPC_MARKER="$PARENT_MOUNT/nvhpc/.nvhpc_env_ready" - WAIT_TIMEOUT=3600 - SLEEP_INTERVAL=20 - ELAPSED=0 +WAIT_TIMEOUT=3600 +SLEEP_INTERVAL=20 +ELAPSED=0 - # 1. Mount parent export - mkdir -p "$PARENT_MOUNT" +# 1. Mount parent export +mkdir -p "$PARENT_MOUNT" - if ! mountpoint -q "$PARENT_MOUNT"; then - mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT" - fi +if ! mountpoint -q "$PARENT_MOUNT"; then + mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT" +fi - if ! mountpoint -q "$PARENT_MOUNT"; then - echo "[ERROR] Failed to mount NVHPC parent export" - exit 1 - fi +if ! mountpoint -q "$PARENT_MOUNT"; then + echo "[ERROR] Failed to mount NVHPC parent export" + exit 1 +fi - echo "[INFO] Parent NVHPC export mounted" +echo "[INFO] Parent NVHPC export mounted" - # 2. Wait for readiness marker - echo "[INFO] Waiting for NVHPC readiness marker..." +# 2. Wait for readiness marker +echo "[INFO] Waiting for NVHPC readiness marker..." - while [ ! 
-f "$NVHPC_MARKER" ]; do - if [ "$ELAPSED" -ge "$WAIT_TIMEOUT" ]; then - echo "[ERROR] Timeout waiting for NVHPC readiness marker" - exit 1 - fi - sleep "$SLEEP_INTERVAL" - ELAPSED=$((ELAPSED + SLEEP_INTERVAL)) - done +while [ ! -f "$NVHPC_MARKER" ]; do + if [ "$ELAPSED" -ge "$WAIT_TIMEOUT" ]; then + echo "[ERROR] Timeout waiting for NVHPC readiness marker" + exit 1 + fi + sleep "$SLEEP_INTERVAL" + ELAPSED=$((ELAPSED + SLEEP_INTERVAL)) +done - echo "[SUCCESS] NVHPC readiness marker detected" +echo "[SUCCESS] NVHPC readiness marker detected" - # 3. Ensure fstab entry exists - if ! grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then - echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" >> /etc/fstab - echo "[INFO] NVHPC fstab entry added" - else - echo "[INFO] NVHPC fstab entry already present" - fi +# 3. Ensure fstab entry exists +if ! grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then + echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" >> /etc/fstab + echo "[INFO] NVHPC fstab entry added" +else + echo "[INFO] NVHPC fstab entry already present" +fi - # 4. Mount NVHPC SDK - mkdir -p "$NVHPC_LOCAL_MOUNT" +# 4. Mount NVHPC SDK +mkdir -p "$NVHPC_LOCAL_MOUNT" - if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then - mount "$NVHPC_LOCAL_MOUNT" - fi +if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + mount "$NVHPC_LOCAL_MOUNT" +fi - if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then - echo "[ERROR] Failed to mount NVHPC SDK" - exit 1 - fi +if ! 
mountpoint -q "$NVHPC_LOCAL_MOUNT"; then + echo "[ERROR] Failed to mount NVHPC SDK" + exit 1 +fi - echo "[SUCCESS] NVHPC SDK mounted at $NVHPC_LOCAL_MOUNT" - echo "===== NVHPC setup completed =====" \ No newline at end of file +echo "[SUCCESS] NVHPC SDK mounted at $NVHPC_LOCAL_MOUNT" +echo "===== NVHPC setup completed =====" \ No newline at end of file From 080f107b0ab0b0058825907dfc004cac46217c57 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Tue, 3 Feb 2026 11:54:32 +0000 Subject: [PATCH 015/172] Code changes for additional container image support Signed-off-by: Vrinda_Marwah --- .../schema/local_repo_config.json | 65 ++++++- .../validation_flows/local_repo_validation.py | 62 +++++- .../library/module_utils/local_repo/config.py | 12 +- .../module_utils/local_repo/download_image.py | 182 ++++++++++------- .../local_repo/process_parallel.py | 42 ++-- .../module_utils/local_repo/registry_utils.py | 146 +++++++++----- .../module_utils/local_repo/software_utils.py | 96 ++++++++- .../local_repo/user_image_utility.py | 184 ++++++++++++------ common/library/modules/check_user_registry.py | 70 +++---- common/library/modules/parallel_tasks.py | 47 +++-- input/local_repo_config.yml | 21 +- .../tasks/execute_parallel_tasks.yml | 6 +- .../roles/parse_and_download/vars/main.yml | 6 +- local_repo/roles/validation/tasks/main.yml | 6 +- local_repo/roles/validation/vars/main.yml | 8 +- 15 files changed, 685 insertions(+), 268 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/local_repo_config.json b/common/library/module_utils/input_validation/schema/local_repo_config.json index 664b02b20c..63d61f0a31 100644 --- a/common/library/module_utils/input_validation/schema/local_repo_config.json +++ b/common/library/module_utils/input_validation/schema/local_repo_config.json @@ -2,6 +2,69 @@ "$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "properties": { + "user_registry": { + "type": [ + "array", + "null" + ], + "items": { + "type": 
"object", + "properties": { + "host": { + "type": "string", + "minLength": 1, + "pattern": "^[a-zA-Z0-9.-]+:[0-9]+$" + }, + "cert_path": { + "type": "string", + "pattern": "^[a-zA-Z0-9/\\._-]*\\.crt$" + }, + "key_path": { + "type": "string", + "pattern": "^[a-zA-Z0-9/\\._-]*\\.key$" + } + }, + "required": [ + "host", + "cert_path", + "key_path" + ], + "allOf": [ + { + "if": { + "properties": { + "cert_path": { + "minLength": 1 + } + } + }, + "then": { + "properties": { + "cert_path": { + "pattern": "^[a-zA-Z0-9/\\._-]*\\.crt$" + } + } + } + }, + { + "if": { + "properties": { + "key_path": { + "minLength": 1 + } + } + }, + "then": { + "properties": { + "key_path": { + "pattern": "^[a-zA-Z0-9/\\._-]*\\.key$" + } + } + } + } + ] + } + }, "user_repo_url_x86_64": { "type": [ "array", @@ -1082,4 +1145,4 @@ "omnia_repo_url_rhel_x86_64" ], "additionalProperties": false -} \ No newline at end of file +} diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index ee2dd12a29..efeda63c8a 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -78,6 +78,22 @@ def validate_local_repo_config(input_file_path, data, errors = [] base_repo_names = [] local_repo_yml = create_file_path(input_file_path, file_names["local_repo_config"]) + + user_registry = data.get("user_registry") + if user_registry: + for registry in user_registry: + host = registry.get("host") + cert_path = registry.get("cert_path") + key_path = registry.get("key_path") + + # Validate user_registry certificate and key paths + if cert_path and not os.path.exists(cert_path): + errors.append(create_error_msg(local_repo_yml, "user_registry", + f"Certificate file not found: {cert_path}")) + + if key_path and not os.path.exists(key_path): + errors.append(create_error_msg(local_repo_yml, "user_registry", + f"Key file not found: {key_path}")) repo_names = {} sub_result = check_subscription_status(logger) logger.info(f"validate_local_repo_config: Subscription status: {sub_result}") @@ -113,6 +129,50 @@ def validate_local_repo_config(input_file_path, data, software_config_file_path = create_file_path(input_file_path, file_names["software_config"]) software_config_json = load_json(software_config_file_path) + # Check if additional_packages is enabled and contains image packages + additional_packages_enabled = any(sw.get("name") == "additional_packages" for sw in software_config_json.get("softwares", [])) + if additional_packages_enabled: + # Get arch values from additional_packages entry in software_config.json + additional_packages_archs = [] + for software in software_config_json.get("softwares", []): + if software.get("name") == "additional_packages": + arch_list = software.get("arch", []) + additional_packages_archs = arch_list # Get all archs + break + + # Check each arch specific additional_packages.json + has_image_packages = False + for additional_packages_arch in additional_packages_archs: + additional_packages_path = create_file_path( + input_file_path, + 
f"config/{additional_packages_arch}/{software_config_json['cluster_os_type']}/{software_config_json['cluster_os_version']}/additional_packages.json" + ) + + if os.path.exists(additional_packages_path): + additional_packages_data = load_json(additional_packages_path) + has_image_packages = False + + # Check all sections for image packages + for section_name, section_data in additional_packages_data.items(): + if isinstance(section_data, dict) and "cluster" in section_data: + cluster_packages = section_data.get("cluster", []) + + for package in cluster_packages: + if package.get("type") == "image": + has_image_packages = True + break + + if has_image_packages: + break + + # If any architecture has image packages, user_registry must be defined and not empty + if has_image_packages and user_registry is None: + errors.append(create_error_msg( + local_repo_yml, + "user_registry", + "user_registry must be defined when additional_packages.json contains packages of type 'image'" + )) + # Extra validation: custom_slurm must have _slurm_custom in user_repo_url_ for sw in software_config_json["softwares"]: if sw["name"] == "slurm_custom": diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 9c9af639fb..5fee956352 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -35,8 +35,8 @@ DEFAULT_STATUS_FILENAME = "status.csv" STATUS_CSV_HEADER = 'name,type,status\n' SOFTWARE_CSV_HEADER = "name,status" -USER_REG_CRED_INPUT = "/opt/omnia/input/project_default/user_registry_credential.yml" -USER_REG_KEY_PATH = "/opt/omnia/input/project_default/.local_repo_credentials_key" +# USER_REG_CRED_INPUT = "/opt/omnia/input/project_default/user_registry_credential.yml" +# USER_REG_KEY_PATH = "/opt/omnia/input/project_default/.local_repo_credentials_key" # ---------------------------- # Software tasklist Defaults # Used by prepare_tasklist.py @@ -110,8 +110,10 @@ "create_container_remote_auth": "pulp container remote create --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'", - "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'" - + "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'", + "container_distribution_show": "pulp container distribution show --name %s | jq .repository", + "show_repository_version": "pulp container repository show --href %s | jq .latest_version_href", + "list_image_tags": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s" } OMNIA_CREDENTIALS_YAML_PATH = "/opt/omnia/input/project_default/omnia_config_credentials.yml" OMNIA_CREDENTIALS_VAULT_PATH = "/opt/omnia/input/project_default/.omnia_config_credentials_key" diff --git a/common/library/module_utils/local_repo/download_image.py b/common/library/module_utils/local_repo/download_image.py index c9b3020a7b..ffc5518177 100644 --- a/common/library/module_utils/local_repo/download_image.py +++ b/common/library/module_utils/local_repo/download_image.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -206,22 +206,61 @@ def get_repo_url_and_content(package): ValueError: If the package prefix is not supported. """ patterns = { - r"^(ghcr\.io)(/.+)": "https://ghcr.io", - r"^(docker\.io)(/.+)": "https://registry-1.docker.io", - r"^(quay\.io)(/.+)": "https://quay.io", - r"^(registry\.k8s\.io)(/.+)": "https://registry.k8s.io", - r"^(nvcr\.io)(/.+)": "https://nvcr.io", - r"^(public\.ecr\.aws)(/.+)": "https://public.ecr.aws", - r"^(gcr\.io)(/.+)": "https://gcr.io" + r"^(ghcr\.io)(:\d+)?(/.+)": "https://ghcr.io", + r"^(docker\.io)(:\d+)?(/.+)": "https://registry-1.docker.io", + r"^(quay\.io)(:\d+)?(/.+)": "https://quay.io", + r"^(registry\.k8s\.io)(:\d+)?(/.+)": "https://registry.k8s.io", + r"^(nvcr\.io)(:\d+)?(/.+)": "https://nvcr.io", + r"^(public\.ecr\.aws)(:\d+)?(/.+)": "https://public.ecr.aws", + r"^(gcr\.io)(:\d+)?(/.+)": "https://gcr.io", } for pattern, repo_url in patterns.items(): match = re.match(pattern, package) if match: base_url = repo_url - package_content = match.group(2).lstrip("/") # Remove leading slash + + # If user provided a port, preserve it + if match.group(2): + base_url = f"{repo_url}{match.group(2)}" + + package_content = match.group(3).lstrip("/") return base_url, package_content - raise ValueError(f"Unsupported package prefix for package: {package}") + # fallback for private / IP-based registries + match = re.match(r"^(?P[^/]+)(?P/.*)$", package) + if match: + return f"https://{match.group('registry')}", match.group("path").lstrip("/") + + raise ValueError(f"Invalid package format: {package}") + + +# def get_repo_url_and_content(package): +# """ +# Get the repository URL and content from a given package. +# Parameters: +# package (str): The package to extract the URL and content from. +# Returns: +# tuple: A tuple containing the repository URL and content. 
+# Raises: +# ValueError: If the package prefix is not supported. +# """ +# patterns = { +# r"^(ghcr\.io)(/.+)": "https://ghcr.io", +# r"^(docker\.io)(/.+)": "https://registry-1.docker.io", +# r"^(quay\.io)(/.+)": "https://quay.io", +# r"^(registry\.k8s\.io)(/.+)": "https://registry.k8s.io", +# r"^(nvcr\.io)(/.+)": "https://nvcr.io", +# r"^(public\.ecr\.aws)(/.+)": "https://public.ecr.aws", +# r"^(gcr\.io)(/.+)": "https://gcr.io" +# } +# for pattern, repo_url in patterns.items(): +# match = re.match(pattern, package) +# if match: +# base_url = repo_url +# package_content = match.group(2).lstrip("/") # Remove leading slash +# return base_url, package_content + +# raise ValueError(f"Unsupported package prefix for package: {package}") def process_image(package, status_file_path, version_variables, user_registries,docker_username, docker_password, logger): @@ -245,66 +284,79 @@ def process_image(package, status_file_path, version_variables, base_url, package_content = get_repo_url_and_content(package['package']) package_identifier = None + # Only check user registries for additional_packages + if user_registries and "additional_packages" in status_file_path: + result, package_identifier = handle_user_image_registry( + package, + package_content, + version_variables, + user_registries, + logger + ) - if user_registries: - result, package_identifier = handle_user_image_registry(package, package_content, - version_variables, user_registries, logger) - # If user registry not found or no user registry given, proceed with public registry - if not result: - try: - repo_name_prefix = "container_repo_" - repository_name = f"{repo_name_prefix}{package['package'].replace('/', '_').replace(':', '_')}" - remote_name = f"remote_{package['package'].replace('/', '_')}" - package_identifier = package['package'] - # Create container repository - with repository_creation_lock: - result = create_container_repository(repository_name, logger) + if not result: + logger.info(f"Image 
{package['package']} will not be synced to Pulp.") + status = "Failed" + return status + + else: + logger.info(f"Image {package['package']} synced to Pulp.") + status = "Success" + return status + + try: + repo_name_prefix = "container_repo_" + repository_name = f"{repo_name_prefix}{package['package'].replace('/', '_').replace(':', '_')}" + remote_name = f"remote_{package['package'].replace('/', '_').replace(':', '_')}" + package_identifier = package['package'] + + # Create container repository + with repository_creation_lock: + result = create_container_repository(repository_name, logger) + if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): + raise Exception(f"Failed to create repository: {repository_name}") + + # Process digest or tag + if "digest" in package: + package_identifier += f":{package['digest']}" + result = create_container_remote_digest( + remote_name, base_url, package_content, policy_type, logger + ) if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): - raise Exception(f"Failed to create repository: {repository_name}") - # Process digest or tag - if "digest" in package: - package_identifier += f":{package['digest']}" - result = create_container_remote_digest(remote_name, base_url, - package_content, policy_type, logger) - if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): - raise Exception(f"Failed to create remote digest: {remote_name}") - - elif "tag" in package: - tag_template = Template(package['tag']) - tag_val = tag_template.render(**version_variables) - package_identifier += f":{package['tag']}" - - # Only use auth for docker.io images - if package['package'].startswith('docker.io/'): - - with remote_creation_lock: - if docker_username and docker_password: - result = create_container_remote_with_auth( - remote_name, base_url, package_content, policy_type, - tag_val, logger, docker_username, docker_password - ) - else: - result = 
create_container_remote( - remote_name, base_url, package_content, policy_type, tag_val, logger - ) + raise Exception(f"Failed to create remote digest: {remote_name}") + + elif "tag" in package: + tag_template = Template(package['tag']) + tag_val = tag_template.render(**version_variables) + package_identifier += f":{package['tag']}" + + with remote_creation_lock: + if package['package'].startswith('docker.io/') and docker_username and docker_password: + result = create_container_remote_with_auth( + remote_name, base_url, package_content, policy_type, + tag_val, logger, docker_username, docker_password + ) else: - # For non-docker.io registries, use unauthenticated access - with remote_creation_lock: - result = create_container_remote( - remote_name, base_url, package_content, policy_type, tag_val, logger - ) - - if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): - raise Exception(f"Failed to create remote: {remote_name}") - # Sync and distribute container repository - result = sync_container_repository(repository_name, remote_name, package_content,logger) + result = create_container_remote( + remote_name, base_url, package_content, policy_type, tag_val, logger + ) + if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): - raise Exception(f"Failed to sync repository: {repository_name}") + raise Exception(f"Failed to create remote: {remote_name}") - except Exception as e: - status = "Failed" - logger.error(f"Failed to process image: {package_identifier}. Error: {e}") + # Sync and distribute + result = sync_container_repository( + repository_name, remote_name, package_content, logger + ) + if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): + raise Exception(f"Failed to sync repository: {repository_name}") + + except Exception as e: + status = "Failed" + logger.error(f"Failed to process image: {package_identifier}. 
Error: {e}") - write_status_to_file(status_file_path, package_identifier, package['type'], status, logger, file_lock) + write_status_to_file( + status_file_path, package_identifier, package['type'], status, logger, file_lock + ) logger.info("#" * 30 + f" {process_image.__name__} end " + "#" * 30) return status diff --git a/common/library/module_utils/local_repo/process_parallel.py b/common/library/module_utils/local_repo/process_parallel.py index b1c0f0b91b..cfc3beb920 100644 --- a/common/library/module_utils/local_repo/process_parallel.py +++ b/common/library/module_utils/local_repo/process_parallel.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,8 +34,8 @@ from ansible.module_utils.local_repo.config import ( OMNIA_CREDENTIALS_YAML_PATH, OMNIA_CREDENTIALS_VAULT_PATH, - USER_REG_CRED_INPUT, - USER_REG_KEY_PATH + # USER_REG_CRED_INPUT, + # USER_REG_KEY_PATH ) # Global lock for logging synchronization log_lock = multiprocessing.Lock() @@ -321,8 +321,8 @@ def execute_parallel( arc, standard_logger, local_repo_config_path, - user_reg_cred_input, - user_reg_key_path, + # user_reg_cred_input, + # user_reg_key_path, omnia_credentials_yaml_path, omnia_credentials_vault_path, timeout @@ -355,22 +355,22 @@ def execute_parallel( config = load_yaml_file(local_repo_config_path) user_registries = config.get("user_registry", []) - if user_registries: - if is_encrypted(user_reg_cred_input): - process_file(user_reg_cred_input, user_reg_key_path, 'decrypt') - - file2_data = load_yaml_file(user_reg_cred_input) - cred_lookup = { - entry['name']: entry - for entry in file2_data.get('user_registry_credential', []) - } - # Update user_registry entries with credentials if required - for registry in user_registries: - if 
registry.get("requires_auth"): - creds = cred_lookup.get(registry.get("name")) - if creds: - registry["username"] = creds.get("username") - registry["password"] = creds.get("password") + # if user_registries: + # if is_encrypted(user_reg_cred_input): + # process_file(user_reg_cred_input, user_reg_key_path, 'decrypt') + + # file2_data = load_yaml_file(user_reg_cred_input) + # cred_lookup = { + # entry['name']: entry + # for entry in file2_data.get('user_registry_credential', []) + # } + # # Update user_registry entries with credentials if required + # for registry in user_registries: + # if registry.get("requires_auth"): + # creds = cred_lookup.get(registry.get("name")) + # if creds: + # registry["username"] = creds.get("username") + # registry["password"] = creds.get("password") try: diff --git a/common/library/module_utils/local_repo/registry_utils.py b/common/library/module_utils/local_repo/registry_utils.py index 6974d963cb..2e7da2f659 100644 --- a/common/library/module_utils/local_repo/registry_utils.py +++ b/common/library/module_utils/local_repo/registry_utils.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,9 +13,29 @@ # limitations under the License. 
# pylint: disable=import-error,no-name-in-module import requests +import socket +import ssl from requests.auth import HTTPBasicAuth from ansible.module_utils.local_repo.common_functions import is_file_exists +def is_https(host, timeout=1): + ip, port = host.rsplit(":", 1) + port = int(port) + + # Don't verify server cert; just see if TLS works + context = ssl.create_default_context() + context.check_hostname = False + context.verify_mode = ssl.CERT_NONE + + try: + with socket.create_connection((ip, port), timeout=timeout) as sock: + with context.wrap_socket(sock, server_hostname=ip): + return True + except ssl.SSLError: + return False + except Exception: + return False + def validate_user_registry(user_registry): """ Validates a list of user registry entries with connectivity and credential check. @@ -34,64 +54,92 @@ def validate_user_registry(user_registry): host = item.get('host') if not host: return False, f"Missing or empty 'host' in entry at index {idx}: {item}" + https = is_https(host) - requires_auth = item.get('requires_auth', False) - - # Check basic username/password presence - if requires_auth: - if not item.get('username') or not item.get('password'): - return False, ( - f"'requires_auth' is true but 'username' or 'password' is missing or empty " - f"in entry for (host: {host})" - ) - - cert_path = item.get('cert_path') - key_path = item.get('key_path') - - if bool(cert_path) != bool(key_path): - return False, ( - f"If authentication is enabled, both 'cert_path' and 'key_path' must be present " - f"or both omitted in entry for (host: {host})" - ) - try: - url = f"https://{host}/api/v2.0/users/current" - response = requests.get( - url, - auth=HTTPBasicAuth(item['username'], item['password']), - verify=True # Set to True if using valid SSL certs - ) - - if response.status_code == 401: - return False, f"Invalid credentials for host: {host}" - elif response.status_code != 200: - return False, f"Unexpected status {response.status_code} while validating host: 
{host}" - - except requests.exceptions.RequestException as e: - return False, f"Failed to connect to {host}: {str(e)}" + cert_path = (item.get("cert_path") or "").strip() + key_path = (item.get("key_path") or "").strip() + + if https and (not cert_path or not key_path): + return False, f"{host} is an HTTPS registry and requires cert_path and key_path. Please provide cert_path and key_path in local_repo_config.yml under user_registry section" return True, "" -def check_reachability(user_registry, timeout): + # requires_auth = item.get('requires_auth', False) + + # # Check basic username/password presence + # if requires_auth: + # if not item.get('username') or not item.get('password'): + # return False, ( + # f"'requires_auth' is true but 'username' or 'password' is missing or empty " + # f"in entry for (host: {host})" + # ) + + # cert_path = item.get('cert_path') + # key_path = item.get('key_path') + + # if bool(cert_path) != bool(key_path): + # return False, ( + # f"If authentication is enabled, both 'cert_path' and 'key_path' must be present " + # f"or both omitted in entry for (host: {host})" + # ) + # try: + # url = f"https://{host}/api/v2.0/users/current" + # response = requests.get( + # url, + # auth=HTTPBasicAuth(item['username'], item['password']), + # verify=True # Set to True if using valid SSL certs + # ) + + # if response.status_code == 401: + # return False, f"Invalid credentials for host: {host}" + # elif response.status_code != 200: + # return False, f"Unexpected status {response.status_code} while validating host: {host}" + + # except requests.exceptions.RequestException as e: + # return False, f"Failed to connect to {host}: {str(e)}" + + # return True, "" + +def tcp_ping(host, timeout=1): """ - Checks the reachability of hosts in the user registry. - + Check if a host:port is reachable via TCP. + Args: - user_registry (list): A list of dictionaries representing user registry entries. 
- timeout (int): The maximum number of seconds to wait for a response. - + host (str): User registry host with port + timeout (int): Timeout in seconds + Returns: + bool: True if reachable, False otherwise + """ + try: + if ":" in host: + hostname, port = host.split(":") + port = int(port) + else: + hostname = host + port = 443 + + with socket.create_connection((hostname, port), timeout=timeout): + return True + except Exception: + return False + +def check_reachability(user_registry, timeout=1): + """ + Check reachability of hosts in a user registry. + + Args: + user_registry (list): List of dicts, each with a 'host' key + timeout (int): TCP connection timeout in seconds Returns: - tuple: A tuple containing two lists: reachable hosts and unreachable hosts. + tuple: (reachable_hosts, unreachable_hosts) """ reachable, unreachable = [], [] for item in user_registry: - try: - resp = requests.get(f"https://{item['host']}", timeout=timeout, verify=True) - if resp.status_code == 200: - reachable.append(item['host']) - else: - unreachable.append(item['host']) - except Exception: - unreachable.append(item['host']) + host = item['host'] + if tcp_ping(host, timeout): + reachable.append(host) + else: + unreachable.append(host) return reachable, unreachable def find_invalid_cert_paths(user_registry): diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index e64479209b..6c78c51f3f 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -26,6 +26,7 @@ import requests from ansible.module_utils.local_repo.standard_logger import setup_standard_logger from ansible.module_utils.local_repo.common_functions import is_encrypted, process_file, get_arch_from_sw_config +from ansible.module_utils.local_repo.parse_and_download import execute_command # Import default variables from config.py from ansible.module_utils.local_repo.config import ( PACKAGE_TYPES, @@ -37,7 +38,8 @@ SOFTWARES_KEY, REPO_CONFIG, ARCH_SUFFIXES, - ADDITIONAL_REPOS_KEY + ADDITIONAL_REPOS_KEY, + pulp_container_commands ) @@ -513,6 +515,81 @@ def get_failed_software(file_path): ] return failed_software +def check_additional_image_in_pulp(image_entry, logger): + """ + Checks if image present in additional_packages.json is configured in Pulp. + """ + image_name = image_entry.get("package") + image_tag = image_entry.get("tag", None) + image_digest = image_entry.get("digest", None) + + logger.info("Checking if %s is present in Pulp", image_name) + + dist_name_prefix = "container_repo_" + transformed_dist_name = (f"{dist_name_prefix}{image_name.replace('/', '_').replace(':', '_')}") + + repo_href_result = None + latest_version_href_result = None + tags_output_result = None + + show_dist_cmd = (pulp_container_commands["container_distribution_show"] % transformed_dist_name) + repo_href_result = execute_command(show_dist_cmd, logger) + logger.info("repo_href_result: %s", repo_href_result) + + if repo_href_result.get("stderr") and "Error:" in repo_href_result.get("stderr", ""): + logger.info("Distribution %s not found in Pulp", transformed_dist_name) + return { + "type": "image", + "package": image_name, + "tag": image_tag, + } + else: + logger.info("Distribution %s found in Pulp", transformed_dist_name) + repo_href = repo_href_result["stdout"] + show_repo_cmd = (pulp_container_commands["show_repository_version"] % repo_href) + latest_version_href_result = execute_command(show_repo_cmd, logger) + logger.info("latest_version_href_result: %s", 
latest_version_href_result) + if latest_version_href_result.get("stderr") and "Error:" in latest_version_href_result.get("stderr", ""): + logger.info("No repository version found. Empty repository") + return { + "type": "image", + "package": image_name, + "tag": image_tag, + } + else: + logger.info("Repository version found in Pulp") + latest_version_href = latest_version_href_result["stdout"] + show_tags_cmd = (pulp_container_commands["list_image_tags"] % latest_version_href) + tags_output_result = execute_command(show_tags_cmd, logger, type_json=True) + logger.info("tags_output_result: %s", tags_output_result) + if tags_output_result.get("stderr") and "Error:" in tags_output_result.get("stderr", ""): + logger.info("No tags found for %s", image_name) + return { + "type": "image", + "package": image_name, + "tag": image_tag, + } + else: + logger.info("Tags found for %s", image_name) + tag_names = [tag["name"] for tag in tags_output_result.get("stdout", {}).get("results", [])] + logger.info("tag_names: %s", tag_names) + if image_tag and image_tag not in tag_names: + logger.info("Tag %s not found for image %s in Pulp", image_tag, image_name) + return { + "type": "image", + "package": image_name, + "tag": image_tag, + } + elif image_digest and image_digest not in tag_names: + logger.info("Digest %s not found for image %s in Pulp", image_digest, image_name) + return { + "type": "image", + "package": image_name, + "tag": image_digest, + } + else: + logger.info("No download required as image is already present in Pulp") + return {} def parse_json_data(file_path, package_types,logger, failed_list=None, subgroup_list=None): """ @@ -538,10 +615,25 @@ def parse_json_data(file_path, package_types,logger, failed_list=None, subgroup_ filtered_list = [] + # Check if file name is additional_packages.json + is_additional_packages = file_path.endswith("additional_packages.json") + logger.info("additional_packages present: %s", is_additional_packages) + for key, package in 
data.items(): if subgroup_list is None or key in subgroup_list: for value in package.values(): for item in value: + # For every image, check if it is present in Pulp + if is_additional_packages and item.get("type") == "image": + logger.info("Calling function to check %s existence in Pulp", item) + tag_missing_entry = check_additional_image_in_pulp(item, logger) + logger.info("tag_missing_entry: %s", tag_missing_entry) + if tag_missing_entry == {}: + continue + if tag_missing_entry: + filtered_list.append(tag_missing_entry) + continue + # Get package name pkg_name = item.get("package") diff --git a/common/library/module_utils/local_repo/user_image_utility.py b/common/library/module_utils/local_repo/user_image_utility.py index 5c818c2f75..2cbe1cba2d 100644 --- a/common/library/module_utils/local_repo/user_image_utility.py +++ b/common/library/module_utils/local_repo/user_image_utility.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -44,30 +44,39 @@ def check_image_in_registry( Check if a container image exists in a user registry using Docker Registry HTTP API v2. Args: - host (str): Registry hostname. - image (str): Image name (e.g., library/nginx). - tag (str): Image tag (e.g., 1.25.2-alpine). - cacert (str, optional): Path to the CA certificate file. - key (str, optional): Path to the client key file. - username (str, optional): Registry username. - password (str, optional): Registry password. - logger (logging.Logger): Logger instance. 
+ host (str): Registry hostname (without protocol) + image (str): Image name + tag (str): Image tag + cacert (str, optional): Path to the CA certificate file for TLS authentication + key (str, optional): Path to the client key file for TLS authentication + username (str, optional): Registry username for basic authentication + password (str, optional): Registry password for basic authentication + logger (logging.Logger, optional): Logger instance for logging messages Returns: - bool: True if image exists, False otherwise. + bool: True if image exists, False otherwise """ - image_url = f"https://{host}/v2/{image}/manifests/{tag}" + + if not host.startswith(("http://", "https://")): + protocol = "https" if (cacert and key) else "http" + host = f"{protocol}://{host}" + image_url = f"{host}/v2/{image}/manifests/{tag}" logger.info(f"Checking image existence at: {image_url}") try: request_args = { - "verify": False, # Consider using 'verify=cacert' if using trusted certs "timeout": 10, + "verify": False, + "headers": { + "Accept": ( + "application/vnd.oci.image.manifest.v1+json," + "application/vnd.oci.image.index.v1+json," + "application/vnd.docker.distribution.manifest.v2+json," + "application/vnd.docker.distribution.manifest.list.v2+json" + ) + }, } - if username and password: - request_args["auth"] = HTTPBasicAuth(username, password) - if cacert and key: request_args["cert"] = (cacert, key) @@ -77,10 +86,21 @@ def check_image_in_registry( logger.info(f"Image '{image}:{tag}' exists in registry '{host}'") return True - logger.warning( - f"Image not found (HTTP {response.status_code}) in registry '{host}'" + if response.status_code == 404: + logger.info( + f"Image '{image}:{tag}' does not exist in registry '{host}'" + ) + return False + + logger.error( + f"Unexpected HTTP {response.status_code} while checking image " + f"'{image}:{tag}' in registry '{host}'" ) + except requests.exceptions.SSLError as e: + logger.error( + f"TLS error while connecting to registry '{host}': 
{e}" + ) except requests.RequestException as e: logger.exception(f"Network error while checking image: {e}") except Exception as e: @@ -115,15 +135,38 @@ def create_user_remote_container( bool or dict: True on success, False on failure, or a dict with command result. """ try: - ca_cert = f"@{cacert}" - client_key = f"@{key}" - if tag_val is None: remote_exists = execute_command( pulp_container_commands["show_container_remote"] % remote_name, logger ) if not remote_exists: - command = pulp_container_commands["create_user_remote_digest"] % ( + if cacert and key: + ca_cert = f"@{cacert}" + client_key = f"@{key}" + command = pulp_container_commands["create_user_remote_digest"] % ( + remote_name, + base_url, + package_content, + policy_type, + ca_cert, + client_key, + ) + else: + command = pulp_container_commands["create_container_remote_for_digest"] % ( + remote_name, + base_url, + package_content, + policy_type, + ) + result = execute_command(command, logger) + logger.info(f"Remote created successfully: {remote_name}") + return result + + logger.info(f"Remote {remote_name} already exists.") + if cacert and key: + ca_cert = f"@{cacert}" + client_key = f"@{key}" + command = pulp_container_commands["update_user_remote_digest"] % ( remote_name, base_url, package_content, @@ -131,19 +174,13 @@ def create_user_remote_container( ca_cert, client_key, ) - result = execute_command(command, logger) - logger.info(f"Remote created successfully: {remote_name}") - return result - - logger.info(f"Remote {remote_name} already exists.") - command = pulp_container_commands["update_user_remote_digest"] % ( - remote_name, - base_url, - package_content, - policy_type, - ca_cert, - client_key, - ) + else: + command = pulp_container_commands["update_remote_for_digest"] % ( + remote_name, + base_url, + package_content, + policy_type, + ) result = execute_command(command, logger) logger.info(f"Remote updated successfully: {remote_name}") return result @@ -154,15 +191,26 @@ def 
create_user_remote_container( ) if not remote_exists: - command = pulp_container_commands["create_user_remote_tag"] % ( - remote_name, - base_url, - package_content, - policy_type, - tag_val, - ca_cert, - client_key, - ) + if cacert and key: + ca_cert = f"@{cacert}" + client_key = f"@{key}" + command = pulp_container_commands["create_user_remote_tag"] % ( + remote_name, + base_url, + package_content, + policy_type, + tag_val, + ca_cert, + client_key, + ) + else: + command = pulp_container_commands["create_container_remote"] % ( + remote_name, + base_url, + package_content, + policy_type, + tag_val, + ) result = execute_command(command, logger) if result: logger.info(f"Remote '{remote_name}' created successfully.") @@ -183,15 +231,26 @@ def create_user_remote_container( new_tags = existing_tags + [tag_val] tags_json = json.dumps(new_tags) - update_command = pulp_container_commands["update_user_remote_tag"] % ( - remote_name, - base_url, - package_content, - policy_type, - tags_json, - ca_cert, - client_key, - ) + if cacert and key: + ca_cert = f"@{cacert}" + client_key = f"@{key}" + update_command = pulp_container_commands["update_user_remote_tag"] % ( + remote_name, + base_url, + package_content, + policy_type, + tags_json, + ca_cert, + client_key, + ) + else: + update_command = pulp_container_commands["update_container_remote"] % ( + remote_name, + base_url, + package_content, + policy_type, + tags_json, + ) result = execute_command(update_command, logger) if result: @@ -234,10 +293,13 @@ def process_user_registry( repository_name = ( f"{user_reg_prefix}{package['package'].replace('/', '_').replace(':', '_')}" ) - remote_name = f"user_remote_{package['package'].replace('/', '_')}" + remote_name = f"user_remote_{package['package'].replace('/', '_').replace(':', '_')}" package_identifier = package["package"] policy_type = "immediate" - base_url = f"https://{host}/" + if not host.startswith(("http://", "https://")): + protocol = "https" if (cacert and key) else 
"http" + host = f"{protocol}://{host}" + base_url = f"{host}/" logger.info("Creating user container repository") with repository_creation_lock: @@ -314,8 +376,8 @@ def handle_user_image_registry(package, package_content, version_variables, user host = registry.get("host") cacert = registry.get("cert_path") key = registry.get("key_path") - username = registry.get("username") - password = registry.get("password") + # username = registry.get("username") + # password = registry.get("password") logger.info(f"Checking image {image_name}:{tag_val} in registry {host}") image_found = check_image_in_registry( @@ -324,8 +386,8 @@ def handle_user_image_registry(package, package_content, version_variables, user tag=tag_val, cacert=cacert, key=key, - username=username, - password=password, + username=None, + password=None, logger=logger ) @@ -333,6 +395,11 @@ def handle_user_image_registry(package, package_content, version_variables, user logger.info(f"Image '{image_name}:{tag_val}' found in registry '{host}'") result, package_info = process_user_registry(package, host, package_content, version_variables, cacert, key, logger) break + + elif not image_found: + logger.info(f"Image '{image_name}:{tag_val}' not found in registry '{host}'") + result = False + break except Exception as e: logger.error(f"Exception in {handle_user_image_registry.__name__}: {e}") @@ -340,3 +407,4 @@ def handle_user_image_registry(package, package_content, version_variables, user logger.info("#" * 30 + f" {handle_user_image_registry.__name__} end " + "#" * 30) return result, package_info + diff --git a/common/library/modules/check_user_registry.py b/common/library/modules/check_user_registry.py index 8f59c93f68..c2995f17fb 100644 --- a/common/library/modules/check_user_registry.py +++ b/common/library/modules/check_user_registry.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,60 +27,60 @@ check_reachability, find_invalid_cert_paths ) -from ansible.module_utils.local_repo.config import ( - USER_REG_CRED_INPUT, - USER_REG_KEY_PATH -) +# from ansible.module_utils.local_repo.config import ( +# USER_REG_CRED_INPUT, +# USER_REG_KEY_PATH +# ) def main(): """ Ansible module to validate user registry entries. - - This module loads a YAML configuration file, validates the user registry entries, - checks their reachability, and verifies the cert paths. - - :return: A dictionary with the results of the validation and reachability checks. """ module = AnsibleModule( + # argument_spec=dict( + # timeout=dict(type='int', default=5), + # config_file=dict(type='str', required=True), + # user_reg_cred_input=dict(type='str', required=False, default=USER_REG_CRED_INPUT), + # user_reg_key_path=dict(type='str', required=False, default=USER_REG_KEY_PATH) + # ), argument_spec=dict( timeout=dict(type='int', default=5), - config_file=dict(type='str', required=True), - user_reg_cred_input=dict(type='str', required=False, default=USER_REG_CRED_INPUT), - user_reg_key_path=dict(type='str', required=False, default=USER_REG_KEY_PATH) + config_file=dict(type='str', required=True) ), supports_check_mode=True ) + # config_path = module.params['config_file'] + # timeout = module.params['timeout'] + # user_reg_cred_input = module.params["user_reg_cred_input"] + # user_reg_key_path = module.params["user_reg_key_path"] + config_path = module.params['config_file'] timeout = module.params['timeout'] - user_reg_cred_input = module.params["user_reg_cred_input"] - user_reg_key_path = module.params["user_reg_key_path"] - try: config_data = load_yaml_file(config_path) except FileNotFoundError as e: module.fail_json(msg=str(e)) user_registry = get_repo_list(config_data, "user_registry") - - if user_registry: - # Load credentials - if 
is_encrypted(user_reg_cred_input): - process_file(user_reg_cred_input, user_reg_key_path, 'decrypt') - - file2_data = load_yaml_file(user_reg_cred_input) - cred_lookup = { - entry['name']: entry - for entry in file2_data.get('user_registry_credential', []) - } - - # Update user_registry entries with credentials if required - for registry in user_registry: - if registry.get("requires_auth"): - creds = cred_lookup.get(registry.get("name")) - if creds: - registry["username"] = creds.get("username") - registry["password"] = creds.get("password") + # if user_registry: + # # Load credentials + # if is_encrypted(user_reg_cred_input): + # process_file(user_reg_cred_input, user_reg_key_path, 'decrypt') + + # file2_data = load_yaml_file(user_reg_cred_input) + # cred_lookup = { + # entry['name']: entry + # for entry in file2_data.get('user_registry_credential', []) + # } + + # # Update user_registry entries with credentials if required + # for registry in user_registry: + # if registry.get("requires_auth"): + # creds = cred_lookup.get(registry.get("name")) + # if creds: + # registry["username"] = creds.get("username") + # registry["password"] = creds.get("password") # Exit early if user_registry is empty if not user_registry: diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py index 11b7aa1867..4fd910d027 100644 --- a/common/library/modules/parallel_tasks.py +++ b/common/library/modules/parallel_tasks.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -53,8 +53,6 @@ SOFTWARE_CSV_HEADER, STATUS_CSV_HEADER, LOCAL_REPO_CONFIG_PATH_DEFAULT, - USER_REG_CRED_INPUT, - USER_REG_KEY_PATH, OMNIA_CREDENTIALS_YAML_PATH, OMNIA_CREDENTIALS_VAULT_PATH ) @@ -302,6 +300,27 @@ def main(): Raises: Exception: If an error occurs during execution. """ + # module_args = { + # "tasks": {"type": "list", "required": True}, + # "nthreads": {"type": "int", "required": False, "default": DEFAULT_NTHREADS}, + # "timeout": {"type": "int", "required": False, "default": DEFAULT_TIMEOUT}, + # "log_dir": {"type": "str", "required": False, "default": LOG_DIR_DEFAULT}, + # "log_file": {"type": "str", "required": False, "default": DEFAULT_LOG_FILE}, + # "slog_file": {"type": "str", "required": False, "default": DEFAULT_SLOG_FILE}, + # "csv_file_path": {"type": "str", "required": False, "default": CSV_FILE_PATH_DEFAULT}, + # "repo_store_path": {"type": "str", "required": False, "default": DEFAULT_REPO_STORE_PATH}, + # "software": {"type": "list", "elements": "str", "required": True}, + # "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT}, + # "show_softwares_status": {"type": "bool", "required": False, "default": False}, + # "overall_status_dict": {"type": "dict","required": True}, + # "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT}, + # "arch": {"type": "str", "required": False}, + # "user_reg_cred_input": {"type": "str", "required": False, "default": USER_REG_CRED_INPUT}, + # "user_reg_key_path": {"type": "str", "required": False, "default": USER_REG_KEY_PATH}, + # "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH}, + # "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH} + # } + module_args = { "tasks": {"type": "list", "required": True}, "nthreads": {"type": "int", "required": False, "default": DEFAULT_NTHREADS}, @@ -317,8 +336,6 @@ def main(): 
"overall_status_dict": {"type": "dict","required": True}, "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT}, "arch": {"type": "str", "required": False}, - "user_reg_cred_input": {"type": "str", "required": False, "default": USER_REG_CRED_INPUT}, - "user_reg_key_path": {"type": "str", "required": False, "default": USER_REG_KEY_PATH}, "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH}, "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH} } @@ -337,8 +354,8 @@ def main(): overall_status_dict = module.params["overall_status_dict"] local_repo_config_path = module.params["local_repo_config_path"] arc = module.params["arch"] - user_reg_cred_input = module.params["user_reg_cred_input"] - user_reg_key_path = module.params["user_reg_key_path"] + # user_reg_cred_input = module.params["user_reg_cred_input"] + # user_reg_key_path = module.params["user_reg_key_path"] omnia_credentials_yaml_path = module.params["omnia_credentials_yaml_path"] omnia_credentials_vault_path = module.params["omnia_credentials_vault_path"] @@ -370,20 +387,20 @@ def main(): version_variables = set_version_variables(user_data, software_names, cluster_os_version,slogger) slogger.info(f"Cluster OS: {cluster_os_type}") slogger.info(f"Version Variables: {version_variables}") - gen_result = {} - if not os.path.isfile(user_reg_key_path): - gen_result = generate_vault_key(user_reg_key_path) - if gen_result is None: - module.fail_json(msg=f"Unable to generate local_repo key at path: {user_reg_key_path}") + # gen_result = {} + # if not os.path.isfile(user_reg_key_path): + # gen_result = generate_vault_key(user_reg_key_path) + # if gen_result is None: + # module.fail_json(msg=f"Unable to generate local_repo key at path: {user_reg_key_path}") overall_status, task_results = execute_parallel( tasks, determine_function, nthreads, repo_store_path, 
csv_file_path, - log_dir, user_data, version_variables, arc, slogger, local_repo_config_path, user_reg_cred_input, user_reg_key_path, + log_dir, user_data, version_variables, arc, slogger, local_repo_config_path, omnia_credentials_yaml_path, omnia_credentials_vault_path, timeout ) - if not is_encrypted(user_reg_cred_input): - process_file(user_reg_cred_input,user_reg_key_path,'encrypt') + # if not is_encrypted(user_reg_cred_input): + # process_file(user_reg_cred_input,user_reg_key_path,'encrypt') end_time = datetime.now() formatted_end_time = end_time.strftime("%I:%M:%S %p") diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 55583e1a07..2f318f1deb 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -18,7 +18,20 @@ # ================================ # VARIABLE DETAILS # ================================ -# 1. user_repo_url_x86_64 +# 1. user_registry +#-------------------------- +# Configuration for user registry to configure additional images in Pulp +# Fields: +# host : Registry IP and port in format "IP:port" +# cert_path : Path to SSL certificate file (.crt) - Required only if host is using HTTPS +# key_path : Path to SSL private key file (.key) - Required only if host is using HTTPS +# Notes: +# - If host is HTTPS, cert_path and key_path are required +# - If host is HTTP, cert_path and key_path can be left empty +# - cert_path should point to .crt files only +# - key_path should point to .key files only +# - cert and key paths are accessed from within the omnia_core container +# 2. user_repo_url_x86_64 #-------------------------- # Optional list of user-defined repository URLs for x86_64 architecture. # Each entry can include: url, gpgkey, sslcacert, sslclientkey, sslclientcert, name, policy. @@ -36,7 +49,7 @@ # - Omit SSL fields entirely if SSL is not in use. # - Its a madatory field in case of slurm_custom with name as '_slurm_custom' # -# 2. user_repo_url_aarch64 +# 3. 
user_repo_url_aarch64 #--------------------------- # Same as above but for aarch64 architecture. # @@ -106,7 +119,9 @@ # ================================ # VARIABLES # ================================ -# Example +# user_registry: +# - { host: "172.16.107.254:4000", cert_path: "/opt/omnia/domain.crt", key_path: "/opt/omnia/domain.key" } +user_registry: # user_repo_url_x86_64: # - { url: "", gpgkey: "", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_slurm_custom" } user_repo_url_x86_64: diff --git a/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml b/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml index 9df565f229..3f44ccdeb0 100644 --- a/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml +++ b/local_repo/roles/parse_and_download/tasks/execute_parallel_tasks.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -28,8 +28,8 @@ local_repo_config_path: "{{ local_repo_config_path }}" arch: "{{ item.arch }}" overall_status_dict: {} - user_reg_cred_input: "{{ user_reg_cred_input }}" - user_reg_key_path: "{{ user_reg_key_path }}" + # user_reg_cred_input: "{{ user_reg_cred_input }}" + # user_reg_key_path: "{{ user_reg_key_path }}" omnia_credentials_yaml_path: "{{ omnia_credentials_yaml_path }}" omnia_credentials_vault_path: "{{ omnia_credentials_vault_path }}" nthreads: "{{ (local_repo_py_module_vars[item.key].nthreads | default(local_repo_py_module_vars.default_vars.nthreads)) }}" diff --git a/local_repo/roles/parse_and_download/vars/main.yml b/local_repo/roles/parse_and_download/vars/main.yml index 90141225b6..74b24cd1c2 100644 --- a/local_repo/roles/parse_and_download/vars/main.yml +++ b/local_repo/roles/parse_and_download/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -27,8 +27,8 @@ local_repo_config_path: "{{ project_input_path }}/local_repo_config.yml" sw_config_json_path: "{{ project_input_path }}/software_config.json" functional_groups_config_path: "{{ nfs_shared_path }}/.data/functional_groups_config.yml" user_json_file: "{{ project_input_path }}/software_config.json" -user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml" -user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key" +# user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml" +# user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key" omnia_credentials_yaml_path: "{{ project_input_path }}/omnia_config_credentials.yml" omnia_credentials_vault_path: "{{ project_input_path }}/.omnia_config_credentials_key" clean_rpms: true diff --git a/local_repo/roles/validation/tasks/main.yml b/local_repo/roles/validation/tasks/main.yml index 6087ab200b..ea9c61aeb5 100644 --- a/local_repo/roles/validation/tasks/main.yml +++ b/local_repo/roles/validation/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -36,8 +36,8 @@ - name: Check user registry reachability check_user_registry: config_file: "{{ local_repo_config_file }}" - user_reg_cred_input: "{{ user_reg_cred_input }}" - user_reg_key_path: "{{ user_reg_key_path }}" + # user_reg_cred_input: "{{ user_reg_cred_input }}" + # user_reg_key_path: "{{ user_reg_key_path }}" timeout: "{{ time_out }}" register: registry_check_result diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml index 83c0523e47..ec343cb3ef 100644 --- a/local_repo/roles/validation/vars/main.yml +++ b/local_repo/roles/validation/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. 
or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -44,8 +44,8 @@ softwares_invalid_msg: "Invalid software_name(s) found: {{ softwares_list | diff # Usage: main.yml nfs_shared_path: "/opt/omnia" local_repo_config_file: "{{ project_input_path }}/local_repo_config.yml" -user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml" -user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key" +# user_reg_cred_input: "{{ project_input_path }}/user_registry_credential.yml" +# user_reg_key_path: "{{ project_input_path }}/.local_repo_credentials_key" var_mount_percentage_limit: 80 var_mount_overuse_msg: | [WARNING] local_repo.yml may fail as /var mount usage has exceeded the limit of {{ var_mount_percentage_limit }}%. @@ -144,7 +144,7 @@ user_registry_fail_msg: "Failed. Please ensure user_registry is non empty list a check if there is any indentation error in {{ project_input_path }}/local_repo_config.yml" user_registry_fail_host_cert_path_msg: "Failed. Each item in user_registry should have 'host' and 'cert_path' keys defined" time_out: 30 -user_registry_msg: "Above host registries are not reachable. If the user registry is not accessible from the Omnia Infrastructure Manager, Omnia will download all the images for the software listed in software_config.json." # noqa: yaml[line-length] +user_registry_msg: "Above user registries is/are not reachable. Please make sure the user registry is accessible from the Omnia Infrastructure Manager." # noqa: yaml[line-length] cert_path_failure_msg: "Certificate file path {{ item.item.cert_path }} does not exist on the Omnia Infrastructure Manager for host {{ item.item.host }}. 
Please verify that correct cert_path is given in {{ project_input_path }}/local_repo_config.yml" # noqa: yaml[line-length] # Usage: validate_user_repo_url.yml From 64c141daa4f29fba717cdd935476c9a88f6da10d Mon Sep 17 00:00:00 2001 From: pullan1 Date: Wed, 4 Feb 2026 11:30:03 +0530 Subject: [PATCH 016/172] localrepo pulp cleanup Signed-off-by: pullan1 --- .../library/module_utils/local_repo/config.py | 73 +- .../local_repo/download_common.py | 14 +- .../module_utils/local_repo/download_rpm.py | 21 +- .../local_repo/parse_and_download.py | 33 +- .../module_utils/local_repo/software_utils.py | 15 +- common/library/modules/group_package_map.py | 4 + common/library/modules/pulp_cleanup.py | 837 ++++++++++++++++++ local_repo/pulp_cleanup.yml | 96 ++ 8 files changed, 1068 insertions(+), 25 deletions(-) create mode 100644 common/library/modules/pulp_cleanup.py create mode 100644 local_repo/pulp_cleanup.yml diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 9c9af639fb..4bf3ade5dd 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -33,7 +33,7 @@ DEFAULT_REPO_STORE_PATH = "/tmp/offline_repo" USER_JSON_FILE_DEFAULT = "" DEFAULT_STATUS_FILENAME = "status.csv" -STATUS_CSV_HEADER = 'name,type,status\n' +STATUS_CSV_HEADER = 'name,type,repo_name,status\n' SOFTWARE_CSV_HEADER = "name,status" USER_REG_CRED_INPUT = "/opt/omnia/input/project_default/user_registry_credential.yml" USER_REG_KEY_PATH = "/opt/omnia/input/project_default/.local_repo_credentials_key" @@ -78,7 +78,29 @@ "show_distribution": "pulp file distribution show --name %s", "distribution_create": "pulp file distribution create --name %s --base-path %s --repository %s", "distribution_update": "pulp file distribution update --name %s --base-path %s --repository %s", + + # Cleanup commands + "delete_repository": "pulp file repository destroy --name %s", + "delete_distribution": "pulp file 
distribution destroy --name %s", + "delete_publication": "pulp file publication destroy --href %s", + "list_publications": "pulp file publication list --repository %s", + "list_repositories": "pulp file repository list", + "list_distributions": "pulp file distribution list", + "list_content": "pulp file content list --repository-version %s", + "show_repository_version": "pulp file repository version show --repository %s", + "orphan_cleanup": "pulp orphan cleanup --protection-time 0" +} + +# Pulp Python repository commands (for pip modules) +pulp_python_commands = { + "list_repositories": "pulp python repository list", + "show_repository": "pulp python repository show --name %s", + "delete_repository": "pulp python repository destroy --name %s", + "list_distributions": "pulp python distribution list", + "delete_distribution": "pulp python distribution destroy --name %s", + "orphan_cleanup": "pulp orphan cleanup --protection-time 0" } + CLI_FILE_PATH = "/root/.config/pulp/cli.toml" POST_TIMEOUT = 3600 TAR_POLL_VAL = 3 @@ -107,10 +129,20 @@ "distribute_container_repository": "pulp container distribution create --name %s --repository %s --base-path %s", "update_container_distribution": "pulp container distribution update --name %s --repository %s --base-path %s", "list_container_remote_tags": "pulp container remote list --name %s --field include_tags", - "create_container_remote_auth": "pulp container remote create --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'", - - "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'" + "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'", + # Cleanup commands + "delete_repository": "pulp container repository destroy --name %s", + "delete_remote": "pulp container 
remote destroy --name %s", + "delete_distribution": "pulp container distribution destroy --name %s", + "list_repositories": "pulp container repository list", + "list_remotes": "pulp container remote list", + "list_distributions": "pulp container distribution list", + # Tag-specific cleanup commands + "get_repo_version": "pulp container repository show --href %s", + "list_tags_by_version": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s", + "rename_repository": "pulp container repository update --name %s --new-name %s", + "orphan_cleanup": "pulp orphan cleanup" } OMNIA_CREDENTIALS_YAML_PATH = "/opt/omnia/input/project_default/omnia_config_credentials.yml" @@ -145,9 +177,40 @@ "check_distribution": "pulp rpm distribution show --name %s", "check_publication": "pulp rpm publication list --repository %s", "delete_publication": "pulp rpm publication destroy --href %s", - "get_repo_version": "pulp rpm repository show --name %s" + "get_repo_version": "pulp rpm repository show --name %s", + "list_repositories": "pulp rpm repository list", + "list_remotes": "pulp rpm remote list", + "list_distributions": "pulp rpm distribution list", + "orphan_cleanup": "pulp orphan cleanup --protection-time 0" } +# ---------------------------- +# Pulp Cleanup Configuration +# Used by pulp_cleanup.py and Ansible modules +# ---------------------------- + +# Default paths +CLEANUP_BASE_PATH_DEFAULT = "/opt/omnia/log/local_repo" +CLEANUP_STATUS_FILE_PATH_DEFAULT = "/opt/omnia/log/local_repo/cleanup_status.csv" +CLEANUP_LOG_PATH_DEFAULT = "/opt/omnia/log/local_repo/cleanup.log" + +# Default cleanup behavior +CLEANUP_DELETE_REMOTE_DEFAULT = True +CLEANUP_DELETE_DISTRIBUTION_DEFAULT = True +CLEANUP_CLEANUP_ORPHANS_AFTER_DEFAULT = True +CLEANUP_LIST_ONLY_DEFAULT = False +CLEANUP_FORCE_DEFAULT = False + +# Cleanup status values +CLEANUP_STATUS_SUCCESS = "Success" +CLEANUP_STATUS_FAILED = "Failed" +CLEANUP_STATUS_IN_PROGRESS = "In Progress" + +# Cleanup status file 
settings +CLEANUP_STATUS_FILENAME = "cleanup_status.csv" +CLEANUP_STATUS_CSV_HEADER = "artifact_name,artifact_type,status,message,timestamp\n" +CLEANUP_LOG_FILE_PATH = "/opt/omnia/log/local_repo/cleanup.log" + # ---------------------------- # Additional Repos Aggregation Settings # Used by process_rpm_config.py for aggregated repos feature diff --git a/common/library/module_utils/local_repo/download_common.py b/common/library/module_utils/local_repo/download_common.py index c8d8bd1339..f139384b23 100644 --- a/common/library/module_utils/local_repo/download_common.py +++ b/common/library/module_utils/local_repo/download_common.py @@ -477,7 +477,7 @@ def process_manifest(file,repo_store_path, status_file_path, cluster_os_type, cl manifest_directory = os.path.join(repo_store_path, "offline_repo", "cluster",arc.lower(), cluster_os_type, cluster_os_version, "manifest", package_name) # # Determine the manifest file path file_path = os.path.join(manifest_directory, f"{package_name}.yaml") - repository_name = "manifest" + package_name + repository_name = arc.lower() + "_manifest" + package_name output_file = package_name + ".yml" relative_path = output_file base_path = manifest_directory.strip("/") @@ -531,7 +531,7 @@ def process_git(file,repo_store_path, status_file_path, cluster_os_type, cluster clone_directory = os.path.join(git_modules_directory, package_name) clone_directory = shlex.quote(clone_directory).strip("'\"") tarball_path = os.path.join(git_modules_directory, f'{package_name}.tar.gz') - repository_name = "git" + package_name + repository_name = arc.lower() + "_git" + package_name output_file = package_name + ".tar.gz" relative_path = output_file base_path = git_modules_directory.strip("/") @@ -600,7 +600,7 @@ def process_shell(file,repo_store_path, status_file_path, cluster_os_type, clus os.makedirs(sh_directory, exist_ok=True) # Ensure the directory exists sh_path = os.path.join(sh_directory, f"{package_name}.sh") - repository_name = "shell" + package_name + 
repository_name = arc.lower() + "_shell" + package_name output_file = package_name + ".sh" relative_path = output_file base_path = sh_directory.strip("/") @@ -651,7 +651,7 @@ def process_ansible_galaxy_collection(file, repo_store_path, status_file_path, c galaxy_collections_directory = shlex.quote(galaxy_collections_directory).strip("'\"") os.makedirs(galaxy_collections_directory, exist_ok=True) # Ensure the directory exists collections_tarball_path = os.path.join(galaxy_collections_directory, f'{package_name.replace(".", "-")}-{version}.tar.gz') - repository_name = "ansible_galaxy_collection" + package_name + repository_name = arc.lower() + "_ansible_galaxy_collection" + package_name output_file = f"{file['package'].replace('.', '-')}-{file['version']}.tar.gz" relative_path = output_file base_path = galaxy_collections_directory.strip("/") @@ -758,7 +758,7 @@ def process_tarball(package, repo_store_path, status_file_path, version_variable tarball_path = os.path.join(tarball_directory, f"{package_name}.tar.gz") tarball_path = shlex.quote(tarball_path).strip("'\"") - repository_name = "tarball" + package_name + repository_name = arc.lower() + "_tarball" + package_name output_file = package_name + ".tar.gz" relative_path = output_file base_path = tarball_directory.strip("/") @@ -844,7 +844,7 @@ def process_iso(package, repo_store_path, status_file_path, url_support = True package_name = package['package'] package_type = package['type'] - repository_name = "iso" + package_name + arc + repository_name = arc.lower() + "_iso" + package_name distribution_name = repository_name if 'url' in package: @@ -941,7 +941,7 @@ def process_pip(package, repo_store_path, status_file_path, cluster_os_type, cl package_name = shlex.quote(package['package']).strip("'\"") package_type = package['type'] version = package.get('version', None) - pip_repo = "pip_module" + package_name + pip_repo = arc.lower() + "_pip_module" + package_name distribution_name = pip_repo logger.info(f"Processing 
Pip Package: {package_name}, Version: {version}") diff --git a/common/library/module_utils/local_repo/download_rpm.py b/common/library/module_utils/local_repo/download_rpm.py index 0b7bc2a0e6..95b354dd6b 100644 --- a/common/library/module_utils/local_repo/download_rpm.py +++ b/common/library/module_utils/local_repo/download_rpm.py @@ -49,6 +49,9 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, logger.info("#" * 30 + f" {process_rpm.__name__} start " + "#" * 30) try: + # Get repo_mapping for individual RPM repo names + repo_mapping = package.get("repo_mapping", {}) + if repo_config_value == "always": rpm_list = list(set(package["rpm_list"])) logger.info(f"{package['package']} - List of rpms is {rpm_list}") @@ -90,9 +93,11 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, # Detect successes/failures from combined run for pkg in rpm_list: + # Get repo_name for this specific RPM from mapping + pkg_repo_name = repo_mapping.get(pkg, "") if any(pkg in line and ".rpm" in line for line in stdout_lines + stderr_lines): downloaded.append(pkg) - write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock) + write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) else: failed.append(pkg) @@ -102,14 +107,16 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, for pkg in failed[:]: cmd = DNF_COMMANDS[arch_key] + [f'--destdir={rpm_directory}', pkg] retry_res = subprocess.run(cmd, check=False, capture_output=True, text=True) + # Get repo_name for this specific RPM from mapping + pkg_repo_name = repo_mapping.get(pkg, "") if retry_res.returncode == 0 and ".rpm" in retry_res.stdout + retry_res.stderr: downloaded.append(pkg) failed.remove(pkg) - write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock) + write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) 
logger.info(f"Package '{pkg}' downloaded successfully on retry.") else: - write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock) + write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock, pkg_repo_name) logger.error(f"Package '{pkg}' still failed after retry.") # Determine final status @@ -124,13 +131,17 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, status = "Success" logger.info("RPM won't be downloaded when repo_config is partial or never") for pkg in package["rpm_list"]: - write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock) + # Get repo_name for this specific RPM from mapping + pkg_repo_name = repo_mapping.get(pkg, "") + write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) except Exception as e: logger.error(f"Exception occurred: {e}") status = "Failed" for pkg in package.get("rpm_list", []): - write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock) + # Get repo_name for this specific RPM from mapping + pkg_repo_name = repo_mapping.get(pkg, "") + write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock, pkg_repo_name) finally: logger.info(f"Overall status for {package['package']}: {status}") diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 8874621f0c..367f9561f5 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -83,9 +83,17 @@ def execute_command(cmd_string, logger, type_json=False): finally: logger.info("#" * 30 + f" {execute_command.__name__} end " + "#" * 30) -def write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock: Lock): +def write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock: Lock, 
repo_name=None): """ Writes or updates the status of a package in the status file, using a lock to ensure safe access across processes. + Args: + status_file_path: Path to the status file + package_name: Name of the package + package_type: Type of the package (rpm, image, etc.) + status: Status (Success, Failed, etc.) + logger: Logger instance + file_lock: Lock for thread safety + repo_name: Optional repository name (for RPMs) """ logger.info("#" * 30 + f" {write_status_to_file.__name__} start " + "#" * 30) @@ -97,19 +105,32 @@ def write_status_to_file(status_file_path, package_name, package_type, status, l updated = False with open(status_file_path, "w") as f: - for line in lines: + # Write header (new files always have repo_name column) + if lines: + f.write(lines[0]) # Keep existing header + + # Write data lines + for line in lines[1:]: # Skip header if line.startswith(f"{package_name},"): - f.write(f"{package_name},{package_type},{status}\n") + # f.write(f"{package_name},{package_type},{status}\n") + # Update existing line with repo_name (order: name,type,repo_name,status) + parts = line.strip().split(',') + if len(parts) >= 4: + parts[2] = repo_name if repo_name else '' + parts[3] = status + f.write(','.join(parts) + '\n') + else: + f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") updated = True else: f.write(line) if not updated: - f.write(f"{package_name},{package_type},{status}\n") + f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") else: with open(status_file_path, "w") as f: - f.write("name,type,status\n") - f.write(f"{package_name},{package_type},{status}\n") + f.write(STATUS_CSV_HEADER) + f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") logger.info(f"Status written to {status_file_path} for {package_name}.") except Exception as e: diff --git a/common/library/module_utils/local_repo/software_utils.py 
b/common/library/module_utils/local_repo/software_utils.py index e64479209b..6290e78538 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -174,21 +174,32 @@ def transform_package_dict(data, arch_val,logger): for sw_name, items in data.items(): transformed_items = [] rpm_packages = [] + repo_mapping = {} for item in items: if item.get("type") == "rpm": rpm_packages.append(item["package"]) + # Preserve repo_name if available + if "repo_name" in item: + repo_mapping[item["package"]] = item["repo_name"] elif item.get("type") == "rpm_list": rpm_packages.extend(item["package_list"]) + # Preserve repo_mapping if available + if "repo_mapping" in item: + repo_mapping.update(item["repo_mapping"]) else: transformed_items.append(item) if rpm_packages: - transformed_items.append({ + rpm_task = { "package": RPM_LABEL_TEMPLATE.format(key=sw_name), "rpm_list": rpm_packages, "type": "rpm" - }) + } + # Add repo_mapping if we have any + if repo_mapping: + rpm_task["repo_mapping"] = repo_mapping + transformed_items.append(rpm_task) result[arch_val][sw_name] = transformed_items logger.info(f"Finished processing %s. 
Result: %s", sw_name, transformed_items) diff --git a/common/library/modules/group_package_map.py b/common/library/modules/group_package_map.py index e5d29289e1..6076970f6d 100644 --- a/common/library/modules/group_package_map.py +++ b/common/library/modules/group_package_map.py @@ -145,6 +145,10 @@ def get_type_dict(clust_list): # Add package to rpm key type_dict[pkgtype] = type_dict.get( pkgtype, []) + [pkg_dict.get('package')] + # Also track repo_name mapping for RPMs + if 'repo_mapping' not in type_dict: + type_dict['repo_mapping'] = {} + type_dict['repo_mapping'][pkg_dict.get('package')] = pkg_dict.get('repo_name', '') # Update reboot required values reboot_val = pkg_dict.get(REBOOT_KEY, False) diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py new file mode 100644 index 0000000000..10c43ca0e9 --- /dev/null +++ b/common/library/modules/pulp_cleanup.py @@ -0,0 +1,837 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Unified Pulp Cleanup Module + +Architecture: + Input → Type Detection → Processing → Status Updates → Return Results + +Handles: + - Repository cleanup (RPM) + - Container cleanup + - File cleanup (git, tarball, pip_module) +""" + +import os +import csv +import glob +import json +import subprocess +import time +from datetime import datetime +from typing import Dict, List, Any, Tuple + +from ansible.module_utils.basic import AnsibleModule +from ansible.module_utils.local_repo.standard_logger import setup_standard_logger +from ansible.module_utils.local_repo.config import ( + CLEANUP_BASE_PATH_DEFAULT, + CLEANUP_STATUS_FILE_PATH_DEFAULT, + pulp_rpm_commands, + pulp_container_commands, + pulp_file_commands, + pulp_python_commands, + ARCH_SUFFIXES +) + + +# ============================================================================= +# PRETTY TABLE FORMATTING +# ============================================================================= + +# ANSI color codes +GREEN = '\033[92m' +RED = '\033[91m' +YELLOW = '\033[93m' +RESET = '\033[0m' + +def format_pretty_table(results: List[Dict[str, Any]]) -> str: + """Format cleanup results into a pretty table.""" + if not results: + return "No cleanup results to display" + + headers = ["Name", "Type", "Status", "Message"] + + # Calculate column widths + widths = [len(h) for h in headers] + for r in results: + widths[0] = max(widths[0], len(str(r.get('name', '')))) + widths[1] = max(widths[1], len(str(r.get('type', '')))) + widths[2] = max(widths[2], len(str(r.get('status', '')))) + widths[3] = max(widths[3], min(len(str(r.get('message', ''))), 40)) + + # Build table + border = "+" + "+".join("-" * (w + 2) for w in widths) + "+" + header_row = "|" + "|".join(f" {h.ljust(w)} " for h, w in zip(headers, widths)) + "|" + + lines = [border, header_row, border] + + for r in results: + msg = str(r.get('message', ''))[:40] + row = "|" + "|".join([ + f" {str(r.get('name', '')).ljust(widths[0])} ", + f" {str(r.get('type', 
def validate_container_format(image_name: str) -> Tuple[bool, str]:
    """Validate a user-supplied container image reference.

    The expected form is ``registry/image`` (for example
    ``registry.k8s.io/pause`` or ``docker.io/library/busybox``).

    Args:
        image_name: Image reference as typed by the user.

    Returns:
        Tuple of (is_valid, error_message). ``error_message`` is an empty
        string when the reference is valid.
    """
    if not image_name:
        return False, "Container image name cannot be empty"

    # Must contain at least one '/' to indicate registry/image format
    if '/' not in image_name:
        return False, f"Invalid format '{image_name}'. Must include registry (e.g., registry.k8s.io/pause, docker.io/library/busybox)"

    registry, *image_path = image_name.split('/')

    # A registry is accepted when it looks like a domain (contains '.'),
    # is exactly 'localhost', or carries a port (contains ':').
    if '.' not in registry and registry != 'localhost' and ':' not in registry:
        return False, f"Invalid registry '{registry}' in '{image_name}'. Registry must be a domain (e.g., docker.io, registry.k8s.io)"

    # Reject references such as 'docker.io/' or 'docker.io/library//busybox':
    # every path component after the registry must be non-empty, otherwise
    # the derived repository name can never match a real image.
    if not all(image_path):
        return False, f"Invalid format '{image_name}'. Image name after the registry cannot be empty (e.g., registry.k8s.io/pause)"

    return True, ""
def file_exists_in_status(name: str, base_path: str, logger) -> bool:
    """Check whether an artifact name appears in any per-software status file.

    Scans ``<base_path>/<arch>/*/status.csv`` for every architecture in
    ARCH_SUFFIXES, matching the directories that the status-removal helpers
    in this module rewrite. The check is a plain substring search over the
    raw file text, so it is a coarse "might be present" gate rather than an
    exact row match.

    Args:
        name: Artifact name to look for.
        base_path: Root directory containing the per-architecture trees.
        logger: Logger used to report status files that cannot be read.

    Returns:
        True if any readable status file contains ``name``, otherwise False.
    """
    for arch in ARCH_SUFFIXES:
        for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"):
            try:
                with open(status_file, 'r') as f:
                    if name in f.read():
                        return True
            except (OSError, UnicodeDecodeError) as err:
                # One unreadable file must not hide a match in another one.
                logger.warning(f"Could not read status file {status_file}: {err}")
    return False
result + + try: + # Delete distributions + dist_list = run_cmd(pulp_rpm_commands["list_distributions"], logger) + if dist_list["rc"] == 0: + dists = safe_json_parse(dist_list["stdout"]) + for d in dists: + if d.get('name', '') == name or name in d.get('name', ''): + run_cmd(pulp_rpm_commands["delete_distribution"] % d.get('name', ''), logger) + + # Delete publications + pub_list = run_cmd(pulp_rpm_commands["list_publications"] % name, logger) + if pub_list["rc"] == 0: + pubs = safe_json_parse(pub_list["stdout"]) + for p in pubs: + run_cmd(pulp_rpm_commands["delete_publication"] % p.get('pulp_href', ''), logger) + + # Delete remote + run_cmd(pulp_rpm_commands["delete_remote"] % name, logger) + + # Delete repository + del_result = run_cmd(pulp_rpm_commands["delete_repository"] % name, logger) + + if del_result["rc"] == 0: + result["status"] = "Success" + result["message"] = "Repository deleted" + # Update status files - remove RPM entries from this repo and mark software as partial + affected = remove_rpms_from_repository(name, base_path, logger) + logger.info(f" mark affected softwares as partial {affected}") + mark_software_partial(affected, base_path, logger) + else: + result["message"] = f"Delete failed: {del_result['stderr']}" + + except Exception as e: + result["message"] = f"Error: {str(e)}" + + return result + + +def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any]: + """Cleanup a single container repository. 
def file_exists_in_pulp(name: str, logger) -> Tuple[bool, str, str]:
    """Check if file content exists in any Pulp file repository.

    Lists the Pulp file repositories and, for each one, asks Pulp for
    content whose relative path equals ``name``. The first hit wins.

    Args:
        name: Relative path of the file content to look for (user supplied).
        logger: Logger passed through to run_cmd and used for error reports.

    Returns:
        Tuple of (exists, repo_name, content_href). On a miss or on any
        error the tuple is ``(False, "", "")``.
    """
    # Local import keeps this block self-contained.
    import shlex

    try:
        # List file repositories and search for the content
        repo_list = run_cmd(pulp_file_commands["list_repositories"], logger)
        if repo_list["rc"] != 0:
            return False, "", ""

        repos = safe_json_parse(repo_list["stdout"])
        for repo in repos:
            repo_name = repo.get('name', '')
            # The command string is handed to a shell by run_cmd, so both
            # the repository name and the user-supplied artifact name are
            # quoted to prevent word splitting and command injection.
            content_list = run_cmd(
                f"pulp file content list --repository {shlex.quote(repo_name)} "
                f"--relative-path {shlex.quote(name)}",
                logger
            )
            if content_list["rc"] == 0:
                contents = safe_json_parse(content_list["stdout"])
                if contents:
                    return True, repo_name, contents[0].get('pulp_href', '')

        return False, "", ""
    except Exception as err:
        # Best-effort lookup: report the problem instead of hiding it, but
        # keep the "not found" contract that callers rely on.
        logger.error(f"Failed to look up '{name}' in Pulp file repositories: {err}")
        return False, "", ""
Try to delete the file repository if it's named after the artifact + repo_del = run_cmd(pulp_file_commands["delete_repository"] % name, logger) + if repo_del["rc"] == 0: + messages.append("Repository deleted") + + return True, "; ".join(messages) if messages else "Removed from Pulp" + + except Exception as e: + return False, f"Pulp deletion error: {str(e)}" + + +def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: + """Cleanup a pip module from Pulp Python repository. + + Pip modules are stored as: pip_module== + e.g., pip_modulecffi==1.17.1 + """ + result = {"name": name, "type": "pip_module", "status": "Failed", "message": ""} + messages = [] + pulp_deleted = False + + try: + # Pulp Python repo name format: pip_module + # User input could be "cffi==1.17.1" or "pip_modulecffi==1.17.1" + if name.startswith("pip_module"): + pulp_repo_name = name + else: + pulp_repo_name = f"pip_module{name}" + + logger.info(f"Looking for Python repository: {pulp_repo_name}") + + # Check if repository exists + repo_check = run_cmd(pulp_python_commands["show_repository"] % pulp_repo_name, logger) + + if repo_check["rc"] == 0: + # Delete distribution first + dist_del = run_cmd(pulp_python_commands["delete_distribution"] % pulp_repo_name, logger) + if dist_del["rc"] == 0: + messages.append("Distribution deleted") + + # Delete repository + repo_del = run_cmd(pulp_python_commands["delete_repository"] % pulp_repo_name, logger) + if repo_del["rc"] == 0: + pulp_deleted = True + messages.append("Repository deleted") + + # Run orphan cleanup + if pulp_deleted: + logger.info("Running orphan cleanup...") + orphan_result = run_cmd(pulp_python_commands["orphan_cleanup"], logger) + if orphan_result["rc"] == 0: + messages.append("Orphan cleanup completed") + else: + # Try listing repos to find partial match + repo_list = run_cmd(pulp_python_commands["list_repositories"], logger) + if repo_list["rc"] == 0: + repos = safe_json_parse(repo_list["stdout"]) + for repo in repos: + 
def get_pulp_file_repo_name(name: str, file_type: str) -> str:
    """Return the Pulp File repository name expected for an artifact.

    Naming conventions:
        - ansible_galaxy_collection: the name prefixed with the literal
          ``ansible_galaxy_collection`` (no separator), unless the name
          already starts with that prefix.
        - every other type (tarball, git, manifest, file): the name as-is.
    """
    galaxy_prefix = "ansible_galaxy_collection"
    needs_prefix = file_type == galaxy_prefix and not name.startswith(galaxy_prefix)
    return f"{galaxy_prefix}{name}" if needs_prefix else name
+ """ + result = {"name": name, "type": file_type, "status": "Failed", "message": ""} + messages = [] + pulp_deleted = False + status_removed = False + + try: + # Get the expected Pulp repository name + pulp_repo_name = get_pulp_file_repo_name(name, file_type) + logger.info(f"Looking for {file_type} repository: {pulp_repo_name}") + + # Check if repository exists directly + repo_check = run_cmd(pulp_file_commands["show_repository"] % pulp_repo_name, logger) + + if repo_check["rc"] == 0: + # Found exact match - delete distribution and repository + dist_del = run_cmd(pulp_file_commands["delete_distribution"] % pulp_repo_name, logger) + if dist_del["rc"] == 0: + messages.append("Distribution deleted") + + repo_del = run_cmd(pulp_file_commands["delete_repository"] % pulp_repo_name, logger) + if repo_del["rc"] == 0: + pulp_deleted = True + messages.append("Repository deleted") + else: + # Try listing repos to find partial match + repo_list = run_cmd(pulp_file_commands["list_repositories"], logger) + if repo_list["rc"] == 0: + repos = safe_json_parse(repo_list["stdout"]) + for repo in repos: + repo_name = repo.get('name', '') + if name in repo_name or repo_name == pulp_repo_name: + logger.info(f"Found matching repository: {repo_name}") + + dist_del = run_cmd(pulp_file_commands["delete_distribution"] % repo_name, logger) + if dist_del["rc"] == 0: + messages.append("Distribution deleted") + + repo_del = run_cmd(pulp_file_commands["delete_repository"] % repo_name, logger) + if repo_del["rc"] == 0: + pulp_deleted = True + messages.append("Repository deleted") + break + + # Run orphan cleanup to remove actual content files + if pulp_deleted: + logger.info("Running orphan cleanup to remove content files...") + orphan_result = run_cmd(pulp_file_commands["orphan_cleanup"], logger) + if orphan_result["rc"] == 0: + messages.append("Orphan cleanup completed") + else: + logger.warning(f"Orphan cleanup warning: {orphan_result['stderr']}") + + # Update status files + if 
def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]:
    """Clean up a single file artifact by dispatching on its detected type.

    The type comes from detect_file_type(name):
        - "pip_module" is handled by cleanup_pip_module
        - anything else is handled by cleanup_file_repository, which
          receives the detected type unchanged

    Returns:
        The result dictionary produced by the selected handler.
    """
    detected_type = detect_file_type(name)

    if detected_type != "pip_module":
        # Every non-pip type goes through the file-repository handler.
        return cleanup_file_repository(name, detected_type, base_path, logger)

    # Pip modules have their own handler.
    return cleanup_pip_module(name, base_path, logger)
def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> List[str]:
    """Remove an artifact from status.csv files and return affected software names.

    Every ``<base_path>/<arch>/*/status.csv`` (for each arch in
    ARCH_SUFFIXES) is read; rows whose ``name`` and ``type`` columns both
    equal the given artifact are dropped and the file is rewritten. Files
    without a matching row are left untouched.

    Args:
        artifact_name: Name of the artifact to remove (exact match)
        artifact_type: Value of the ``type`` column to match (exact match)
        base_path: Base path for status files
        logger: Logger instance

    Returns:
        List of software names (the directory containing each rewritten
        status.csv), without duplicates. A failure on one status file is
        logged and skipped, so software already updated is still reported.
    """
    affected_software = []

    for arch in ARCH_SUFFIXES:
        for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"):
            # Errors are handled per file so that one bad file neither stops
            # the sweep nor loses the names already collected.
            try:
                rows = []
                removed = False
                # newline='' is what the csv module expects for file objects.
                with open(status_file, 'r', newline='') as f:
                    reader = csv.DictReader(f)
                    fieldnames = reader.fieldnames
                    for row in reader:
                        name = row.get('name', '')
                        row_type = row.get('type', '')

                        if name == artifact_name and row_type == artifact_type:
                            removed = True
                            logger.info(f"Removing {artifact_type} '{name}' from {status_file}")
                        else:
                            rows.append(row)

                if not (removed and fieldnames):
                    continue

                with open(status_file, 'w', newline='') as f:
                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    writer.writeheader()
                    writer.writerows(rows)

                # Track affected software
                software_name = os.path.basename(os.path.dirname(status_file))
                if software_name not in affected_software:
                    affected_software.append(software_name)
            except Exception as e:
                logger.error(f"Failed to remove from status file {status_file}: {e}")

    return affected_software
def write_cleanup_status(results: List[Dict], base_path: str):
    """Persist cleanup results as ``<base_path>/cleanup_status.csv``.

    The target directory is created when missing and an existing file is
    overwritten. Columns are name, type, status, message.

    Returns:
        Path of the status file that was written.
    """
    columns = ['name', 'type', 'status', 'message']
    status_file = f"{base_path}/cleanup_status.csv"

    target_dir = os.path.dirname(status_file)
    os.makedirs(target_dir, exist_ok=True)

    with open(status_file, 'w', newline='') as handle:
        csv_out = csv.DictWriter(handle, fieldnames=columns)
        csv_out.writeheader()
        for entry in results:
            csv_out.writerow(entry)

    return status_file
cleanup_files = module.params['cleanup_files'] + base_path = module.params['base_path'] + + # Setup logger - setup_standard_logger expects a directory, creates standard.log inside + log_dir = os.path.join(base_path, "cleanup") + os.makedirs(base_path, exist_ok=True) + logger = setup_standard_logger(log_dir) + + logger.info(f"Starting cleanup - repos: {cleanup_repos}, containers: {cleanup_containers}, files: {cleanup_files}") + + all_results = [] + + # Process repositories + for repo in cleanup_repos: + result = cleanup_repository(repo, base_path, logger) + all_results.append(result) + logger.info(f"Repository {repo}: {result['status']} - {result['message']}") + + # Process containers + for container in cleanup_containers: + result = cleanup_container(container, base_path, logger) + all_results.append(result) + logger.info(f"Container {container}: {result['status']} - {result['message']}") + + # Process files + for file in cleanup_files: + result = cleanup_file(file, base_path, logger) + all_results.append(result) + logger.info(f"File {file}: {result['status']} - {result['message']}") + + # Write status file + status_file = write_cleanup_status(all_results, base_path) + + # Calculate summary + total = len(all_results) + success = len([r for r in all_results if r['status'] == 'Success']) + failed = len([r for r in all_results if r['status'] == 'Failed']) + + # Generate pretty table + pretty_table = format_pretty_table(all_results) + + logger.info(f"Cleanup completed - Total: {total}, Success: {success}, Failed: {failed}") + + module.exit_json( + changed=success > 0, + results=all_results, + total=total, + success_count=success, + failed_count=failed, + summary=f"Total: {total}, Success: {success}, Failed: {failed}", + pretty_table=pretty_table, + pretty_table_lines=pretty_table.split('\n'), + status_file=status_file + ) + + +if __name__ == '__main__': + run_module() diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml new file mode 100644 index 
0000000000..c07a8ef7b0 --- /dev/null +++ b/local_repo/pulp_cleanup.yml @@ -0,0 +1,96 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Pulp Cleanup Playbook - Clean Architecture +# +# Usage: +# ansible-playbook pulp_cleanup.yml -e '{"cleanup_repos": ["epel", "baseos"]}' +# ansible-playbook pulp_cleanup.yml -e '{"cleanup_containers": ["nginx", "redis"]}' +# ansible-playbook pulp_cleanup.yml -e '{"cleanup_files": ["git", "chart-0.48.0"]}' +# ansible-playbook pulp_cleanup.yml -e '{"cleanup_repos": ["epel"], "cleanup_containers": ["nginx"]}' -e force=true + +- name: Pulp Cleanup + hosts: localhost + connection: local + gather_facts: false + + pre_tasks: + # Step 1: Input Validation + - name: Validate input - at least one cleanup type must be specified + ansible.builtin.assert: + that: + - (cleanup_repos | default([]) | length > 0) or (cleanup_containers | default([]) | length > 0) or (cleanup_files | default([]) | length > 0) + fail_msg: | + No cleanup items specified. 
Please provide at least one of: + cleanup_repos: ['repo1', 'repo2'] + cleanup_containers: ['container1', 'container2'] + cleanup_files: ['file1', 'file2'] + + # Step 2: User Confirmation + - name: Parse cleanup lists + ansible.builtin.set_fact: + repo_list: "{{ (cleanup_repos.split(',') if cleanup_repos != 'all' else []) if cleanup_repos is defined else [] }}" + container_list: "{{ (cleanup_containers.split(',') if cleanup_containers is string else cleanup_containers) | default([]) }}" + file_list: "{{ (cleanup_files.split(',') if cleanup_files is string else cleanup_files) | default([]) }}" + + - name: Display cleanup summary + ansible.builtin.debug: + msg: + - "========== CLEANUP SUMMARY ==========" + #- "Repositories : {{ (cleanup_repos | default([]) | join(', ')) if cleanup_repos | default([]) | length > 0 else 'None' }}" + - "Repositories : {{ (repo_list | default([]) | join(', ')) if repo_list | default([]) | length > 0 else 'None' }}" + - "Containers : {{ (container_list | default([]) | join(', ')) if cleanup_containers | default([]) | length > 0 else 'None' }}" + - "Files : {{ (file_list | default([]) | join(', ')) if cleanup_files | default([]) | length > 0 else 'None' }}" + - "=====================================" + - name: Get user confirmation + ansible.builtin.pause: + prompt: | + + ⚠️ WARNING: This will permanently delete the specified artifacts. + This action cannot be undone. 
+ + Type 'yes' to continue or press Ctrl+C to abort + register: user_input + when: not (force | default(false)) | bool + + - name: Abort if not confirmed + ansible.builtin.fail: + msg: "Cleanup cancelled by user" + when: + - not (force | default(false)) | bool + - user_input.user_input | default('') | lower != 'yes' + + tasks: + # Step 3: Call Python Module + - name: Execute cleanup + pulp_cleanup: + cleanup_repos: "{{ repo_list | default([]) }}" + cleanup_containers: "{{ container_list | default([]) }}" + cleanup_files: "{{ file_list | default([]) }}" + register: cleanup_result + + post_tasks: + # Step 4: Display Results + - name: Display cleanup results + ansible.builtin.debug: + msg: "{{ cleanup_result.pretty_table_lines }}" + + - name: Display summary + ansible.builtin.debug: + msg: + - "========== CLEANUP COMPLETED ==========" + - "Total: {{ cleanup_result.total }}, Success: {{ cleanup_result.success_count }}, Failed: {{ cleanup_result.failed_count }}" + - "Status file: {{ cleanup_result.status_file }}" + - "========================================" + \ No newline at end of file From c3874566676cc9fa781008c4d4c1c47f442b5ba2 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Wed, 4 Feb 2026 14:25:59 +0530 Subject: [PATCH 017/172] ansible lint fixes Signed-off-by: pullan1 --- local_repo/pulp_cleanup.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml index c07a8ef7b0..123b3a481f 100644 --- a/local_repo/pulp_cleanup.yml +++ b/local_repo/pulp_cleanup.yml @@ -48,7 +48,6 @@ ansible.builtin.debug: msg: - "========== CLEANUP SUMMARY ==========" - #- "Repositories : {{ (cleanup_repos | default([]) | join(', ')) if cleanup_repos | default([]) | length > 0 else 'None' }}" - "Repositories : {{ (repo_list | default([]) | join(', ')) if repo_list | default([]) | length > 0 else 'None' }}" - "Containers : {{ (container_list | default([]) | join(', ')) if cleanup_containers | default([]) | length > 0 
else 'None' }}" - "Files : {{ (file_list | default([]) | join(', ')) if cleanup_files | default([]) | length > 0 else 'None' }}" @@ -84,7 +83,7 @@ # Step 4: Display Results - name: Display cleanup results ansible.builtin.debug: - msg: "{{ cleanup_result.pretty_table_lines }}" + msg: "{{ cleanup_result.pretty_table_lines }}" - name: Display summary ansible.builtin.debug: @@ -93,4 +92,3 @@ - "Total: {{ cleanup_result.total }}, Success: {{ cleanup_result.success_count }}, Failed: {{ cleanup_result.failed_count }}" - "Status file: {{ cleanup_result.status_file }}" - "========================================" - \ No newline at end of file From 675af6eff9caf49f0d3d38697a90d912d0200475 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Wed, 4 Feb 2026 12:47:51 +0000 Subject: [PATCH 018/172] fix for multiple user registries Signed-off-by: Vrinda_Marwah --- .../input_validation/schema/local_repo_config.json | 8 +++----- .../module_utils/local_repo/user_image_utility.py | 10 +++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/local_repo_config.json b/common/library/module_utils/input_validation/schema/local_repo_config.json index 63d61f0a31..e44cf44df7 100644 --- a/common/library/module_utils/input_validation/schema/local_repo_config.json +++ b/common/library/module_utils/input_validation/schema/local_repo_config.json @@ -17,17 +17,15 @@ }, "cert_path": { "type": "string", - "pattern": "^[a-zA-Z0-9/\\._-]*\\.crt$" + "pattern": "^$|^[a-zA-Z0-9/\\._-]*\\.crt$" }, "key_path": { "type": "string", - "pattern": "^[a-zA-Z0-9/\\._-]*\\.key$" + "pattern": "^$|^[a-zA-Z0-9/\\._-]*\\.key$" } }, "required": [ - "host", - "cert_path", - "key_path" + "host" ], "allOf": [ { diff --git a/common/library/module_utils/local_repo/user_image_utility.py b/common/library/module_utils/local_repo/user_image_utility.py index 2cbe1cba2d..e97e9411dd 100644 --- a/common/library/module_utils/local_repo/user_image_utility.py +++ 
b/common/library/module_utils/local_repo/user_image_utility.py @@ -395,11 +395,11 @@ def handle_user_image_registry(package, package_content, version_variables, user logger.info(f"Image '{image_name}:{tag_val}' found in registry '{host}'") result, package_info = process_user_registry(package, host, package_content, version_variables, cacert, key, logger) break - - elif not image_found: - logger.info(f"Image '{image_name}:{tag_val}' not found in registry '{host}'") - result = False - break + else: + logger.info(f"Image '{image_name}:{tag_val}' not found in registry '{host}', checking next registry...") + else: + logger.info(f"Image '{image_name}:{tag_val}' not found in any user registry") + result = False except Exception as e: logger.error(f"Exception in {handle_user_image_registry.__name__}: {e}") From e11eebd8ce6501c487094db88ef5e8100a33ae02 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Wed, 4 Feb 2026 20:11:32 +0530 Subject: [PATCH 019/172] status file update after cleanup Signed-off-by: pullan1 --- common/library/modules/pulp_cleanup.py | 109 +++++++++++++------------ 1 file changed, 59 insertions(+), 50 deletions(-) diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index 10c43ca0e9..91b863144a 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -258,7 +258,7 @@ def cleanup_repository(name: str, base_path: str, logger) -> Dict[str, Any]: # Update status files - remove RPM entries from this repo and mark software as partial affected = remove_rpms_from_repository(name, base_path, logger) logger.info(f" mark affected softwares as partial {affected}") - mark_software_partial(affected, base_path, logger) + mark_software_partial(affected, base_path, logger, 'repository') else: result["message"] = f"Delete failed: {del_result['stderr']}" @@ -307,7 +307,7 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any] result["message"] = "Container deleted" # 
Update status files - remove image entries and mark software as partial affected = remove_from_status_files(user_input, 'image', base_path, logger) - mark_software_partial(affected, base_path, logger) + mark_software_partial(affected, base_path, logger, 'image') else: result["message"] = f"Delete failed: {del_result['stderr']}" @@ -457,7 +457,7 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: affected = remove_from_status_files(name, 'pip_module', base_path, logger) if affected: messages.append("Status files updated") - mark_software_partial(affected, base_path, logger) + mark_software_partial(affected, base_path, logger, 'pip_module') if pulp_deleted: result["status"] = "Success" @@ -549,7 +549,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - if affected: status_removed = True messages.append("Status files updated") - mark_software_partial(affected, base_path, logger) + mark_software_partial(affected, base_path, logger, file_type) # Determine overall result if pulp_deleted or status_removed: @@ -647,8 +647,8 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[ logger.error(f"Failed to remove RPMs from repository {repo_name}: {e}") return [] -def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> List[str]: - """Remove artifact from status.csv files and return affected software names. +def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> Dict[str, List[str]]: + """Remove artifact from status.csv files and return affected software names by architecture. 
Args: artifact_name: Name of the artifact to remove @@ -657,11 +657,12 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: logger: Logger instance Returns: - List of software names that were affected + Dict mapping architecture to list of affected software names """ - affected_software = [] + affected_software = {} try: for arch in ARCH_SUFFIXES: + arch_affected = [] for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"): rows = [] removed = False @@ -671,27 +672,20 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: for row in reader: name = row.get('name', '') row_type = row.get('type', '') + # Match logic based on type + should_remove = False + if artifact_type == 'image': + # Container images: match with or without tag + should_remove = (name == artifact_name or name.startswith(f"{artifact_name}:")) + else: + # Other types: exact match + should_remove = (name == artifact_name) - if name == artifact_name and row_type == artifact_type: + if should_remove: removed = True - logger.info(f"Removing {artifact_type} '{name}' from {status_file}") + logger.info(f"Removing '{name}' from {status_file}") else: rows.append(row) - - # # Match logic based on type - # should_remove = False - # if artifact_type == 'image': - # # Container images: match with or without tag - # should_remove = (name == artifact_name or name.startswith(f"{artifact_name}:")) - # else: - # # Other types: exact match - # should_remove = (name == artifact_name) - - # if should_remove: - # removed = True - # logger.info(f"Removing '{name}' from {status_file}") - # else: - # rows.append(row) if removed and fieldnames: with open(status_file, 'w', newline='') as f: @@ -701,47 +695,62 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: # Track affected software software_name = os.path.basename(os.path.dirname(status_file)) - if software_name not in affected_software: - 
affected_software.append(software_name) + if software_name not in arch_affected: + arch_affected.append(software_name) + + if arch_affected: + affected_software[arch] = arch_affected + logger.info(f"remove_from_status_files returning: {affected_software}") return affected_software except Exception as e: logger.error(f"Failed to remove from status files: {e}") - return [] + return {} -def mark_software_partial(software_names: List[str], base_path: str, logger): +def mark_software_partial(affected_software: Dict[str, List[str]], base_path: str, logger, artifact_type: str = None): """Mark software entries as partial in software.csv. Args: - software_names: List of software names to mark as partial + affected_software: Dict mapping architecture to list of affected software names base_path: Base path for software.csv logger: Logger instance + artifact_type: Type of artifact being removed (for logging purposes) """ - if not software_names: + logger.info(f"mark_software_partial called with affected_software: {affected_software}") + if not affected_software: + logger.info("No affected software to mark as partial") return try: - for arch in ARCH_SUFFIXES: - software_file = f"{base_path}/{arch}/software.csv" - if not os.path.exists(software_file): + # Only mark architectures where artifacts were actually removed + for arch, software_names in affected_software.items(): + logger.info(f"Processing arch: {arch}, software_names: {software_names}") + if not software_names: continue - - rows = [] - with open(software_file, 'r') as f: - reader = csv.DictReader(f) - fieldnames = reader.fieldnames - for row in reader: - if row.get('name') in software_names: - row['status'] = 'partial' - logger.info(f"Marked '{row.get('name')}' as {GREEN}partial{RESET} in software.csv") - rows.append(row) - - if fieldnames and rows: - with open(software_file, 'w', newline='') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(rows) + + software_file = 
f"{base_path}/{arch}/software.csv" + logger.info(f"Looking for software file: {software_file}") + if os.path.exists(software_file): + rows = [] + updated = False + with open(software_file, 'r') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + logger.info(f"Checking row: {row}") + if row.get('name') in software_names: + row['status'] = 'partial' + updated = True + logger.info(f"Marked '{row.get('name')}' as {GREEN}partial{RESET} in {arch}/software.csv ({artifact_type} cleanup)") + rows.append(row) + + if fieldnames and rows: + with open(software_file, 'w', newline='') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + logger.info(f"Successfully wrote updated software.csv for {arch}") except Exception as e: logger.error(f"Failed to update software.csv: {e}") From 329737f52107174ff791f25eb759eaf0d31492fd Mon Sep 17 00:00:00 2001 From: balajikumaran-c-s Date: Wed, 4 Feb 2026 15:01:33 +0000 Subject: [PATCH 020/172] Fix aarch64 base image package fact name --- .../roles/fetch_packages/tasks/fetch_packages.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml b/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml index e5bc523294..40c6b1092c 100644 --- a/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml +++ b/build_image_aarch64/roles/fetch_packages/tasks/fetch_packages.yml @@ -24,9 +24,9 @@ software_config_path: "{{ software_config_file_path }}" register: base_image_output - - name: Set x86_64_base_image_packages + - name: Set aarch_64_base_image_packages ansible.builtin.set_fact: - x86_64_base_image_packages: "{{ base_image_output.base_image_packages }}" + aarch64_base_image_packages: "{{ base_image_output.base_image_packages }}" - name: Debug package aarch64_base_image_packages ansible.builtin.debug: From c9172e9a431d09122ae507629c017156786c4950 Mon Sep 17 
00:00:00 2001 From: Abhishek S A Date: Wed, 4 Feb 2026 20:47:47 +0530 Subject: [PATCH 021/172] victoria connect details --- utils/external_victoria_connect_details.yml | 21 +++ .../tasks/main.yml | 165 ++++++++++++++++++ .../vars/main.yml | 18 ++ 3 files changed, 204 insertions(+) create mode 100644 utils/external_victoria_connect_details.yml create mode 100644 utils/roles/external_victoria_connect_details/tasks/main.yml create mode 100644 utils/roles/external_victoria_connect_details/vars/main.yml diff --git a/utils/external_victoria_connect_details.yml b/utils/external_victoria_connect_details.yml new file mode 100644 index 0000000000..3d29b4f720 --- /dev/null +++ b/utils/external_victoria_connect_details.yml @@ -0,0 +1,21 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Fetch external Victoria connection details + hosts: service_kube_control_plane + connection: ssh + gather_facts: false + roles: + - external_victoria_connect_details diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml new file mode 100644 index 0000000000..9230879781 --- /dev/null +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -0,0 +1,165 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check kubectl presence + ansible.builtin.command: kubectl version --client=true + register: kubectl_check + changed_when: false + failed_when: kubectl_check.rc != 0 + +- name: Get Victoria pods status + ansible.builtin.command: >- + kubectl get pods -n {{ victoria_namespace }} + -l app in (vminsert,vmselect,vmstorage,victoriametrics) + -o wide + register: victoria_pods + changed_when: false + failed_when: victoria_pods.rc != 0 + +- name: Get vminsert service LoadBalancer IP + ansible.builtin.command: >- + kubectl get svc vminsert -n {{ victoria_namespace }} + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' + register: vminsert_lb_ip + changed_when: false + failed_when: false + +- name: Get vminsert service LoadBalancer hostname + ansible.builtin.command: >- + kubectl get svc vminsert -n {{ victoria_namespace }} + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' + register: vminsert_lb_hostname + changed_when: false + failed_when: false + +- name: Get vminsert service external port + ansible.builtin.command: >- + kubectl get svc vminsert -n {{ victoria_namespace }} + -o jsonpath='{.spec.ports[0].port}' + register: vminsert_lb_port + changed_when: false + failed_when: false + +- name: Get vmselect service LoadBalancer IP + ansible.builtin.command: >- + kubectl get svc vmselect -n {{ victoria_namespace }} + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' + register: vmselect_lb_ip + 
changed_when: false + failed_when: false + +- name: Get vmselect service LoadBalancer hostname + ansible.builtin.command: >- + kubectl get svc vmselect -n {{ victoria_namespace }} + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' + register: vmselect_lb_hostname + changed_when: false + failed_when: false + +- name: Get vmselect service external port + ansible.builtin.command: >- + kubectl get svc vmselect -n {{ victoria_namespace }} + -o jsonpath='{.spec.ports[0].port}' + register: vmselect_lb_port + changed_when: false + failed_when: false + +- name: Set endpoint facts + ansible.builtin.set_fact: + vminsert_host: >- + {{ + (vminsert_lb_ip.stdout | trim) + if (vminsert_lb_ip.stdout | trim | length) > 0 + else (vminsert_lb_hostname.stdout | trim) + }} + vmselect_host: >- + {{ + (vmselect_lb_ip.stdout | trim) + if (vmselect_lb_ip.stdout | trim | length) > 0 + else (vmselect_lb_hostname.stdout | trim) + }} + vminsert_port: "{{ (vminsert_lb_port.stdout | trim) | default('') }}" + vmselect_port: "{{ (vmselect_lb_port.stdout | trim) | default('') }}" + victoria_tls_ca: "{{ victoria_tls_cert_dir }}/ca.crt" + victoria_tls_cert: "{{ victoria_tls_cert_dir }}/server.crt" + victoria_tls_key: "{{ victoria_tls_cert_dir }}/server.key" + +- name: Fail when LoadBalancer IPs are not available + ansible.builtin.fail: + msg: >- + Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect' + exist in namespace '{{ victoria_namespace }}' and have external IPs assigned. 
+ when: + - vminsert_host | trim | length == 0 or vmselect_host | trim | length == 0 + +- name: Set Victoria external port fallbacks + ansible.builtin.set_fact: + vminsert_port: "8480" + vmselect_port: "8481" + when: + - vminsert_port | trim | length == 0 or vmselect_port | trim | length == 0 + +- name: Build connection details + ansible.builtin.set_fact: + victoria_connect_details: + victoria: + namespace: "{{ victoria_namespace }}" + pod_status: "{{ victoria_pods.stdout }}" + base_url: "https://{{ vminsert_host }}:{{ vminsert_port }}" + endpoints: + vminsert: + host: "{{ vminsert_host }}" + port: "{{ vminsert_port | int }}" + write_endpoint: "https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write" + vmselect: + host: "{{ vmselect_host }}" + port: "{{ vmselect_port | int }}" + query_endpoint: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query" + tls: + ca_crt: "{{ victoria_tls_ca }}" + server_crt: "{{ victoria_tls_cert }}" + server_key: "{{ victoria_tls_key }}" + +- name: Ensure output directory exists + ansible.builtin.file: + path: "{{ victoria_output_file | dirname }}" + state: directory + mode: "0755" + delegate_to: localhost + connection: local + run_once: true + +- name: Write connection details to file + ansible.builtin.copy: + content: "{{ victoria_connect_details | to_nice_yaml }}" + dest: "{{ victoria_output_file }}" + mode: "0644" + delegate_to: localhost + connection: local + run_once: true + +- name: Display Victoria connection details + ansible.builtin.debug: + msg: + - "Victoria connection details written to: {{ victoria_output_file }}" + - "vminsert: https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write" + - "vmselect: https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query" + - "TLS CA: {{ victoria_tls_ca }}" + - "TLS cert: {{ victoria_tls_cert }}" + - "TLS key: {{ victoria_tls_key }}" + - "Pods:\n{{ victoria_pods.stdout }}" + delegate_to: 
localhost + connection: local + run_once: true diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml new file mode 100644 index 0000000000..29db9136f2 --- /dev/null +++ b/utils/roles/external_victoria_connect_details/vars/main.yml @@ -0,0 +1,18 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +victoria_namespace: "telemetry" +victoria_output_file: "/opt/omnia/telemetry/external_victoria_connect_details.yml" +victoria_tls_cert_dir: "/opt/omnia/telemetry/victoria-certs" From d4ee4628bd1e6bc95544b551d4ce33d1fd7441f4 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Wed, 4 Feb 2026 20:53:38 +0530 Subject: [PATCH 022/172] Update main.yml --- utils/roles/external_victoria_connect_details/tasks/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 9230879781..9e194436b7 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -20,9 +20,9 @@ failed_when: kubectl_check.rc != 0 - name: Get Victoria pods status - ansible.builtin.command: >- + ansible.builtin.shell: >- kubectl get pods -n {{ victoria_namespace }} - -l app in (vminsert,vmselect,vmstorage,victoriametrics) + -l "app in 
(vminsert,vmselect,vmstorage,victoriametrics)" -o wide register: victoria_pods changed_when: false From aabcbb1d47aad173edef09216378b8c98bdab43a Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Wed, 4 Feb 2026 21:07:59 +0530 Subject: [PATCH 023/172] Update main.yml --- .../tasks/main.yml | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 9e194436b7..bdcb15ca0b 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -127,10 +127,9 @@ host: "{{ vmselect_host }}" port: "{{ vmselect_port | int }}" query_endpoint: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query" + ui_url: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui" tls: - ca_crt: "{{ victoria_tls_ca }}" server_crt: "{{ victoria_tls_cert }}" - server_key: "{{ victoria_tls_key }}" - name: Ensure output directory exists ansible.builtin.file: @@ -152,14 +151,19 @@ - name: Display Victoria connection details ansible.builtin.debug: - msg: - - "Victoria connection details written to: {{ victoria_output_file }}" - - "vminsert: https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write" - - "vmselect: https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query" - - "TLS CA: {{ victoria_tls_ca }}" - - "TLS cert: {{ victoria_tls_cert }}" - - "TLS key: {{ victoria_tls_key }}" - - "Pods:\n{{ victoria_pods.stdout }}" + msg: | + Victoria connection details written to: {{ victoria_output_file }} + + Endpoints: + vminsert write: https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write + vmselect query: https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query + vmselect UI: https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui + + TLS: 
+ server.crt: {{ victoria_tls_cert }} + + Pods: + {{ victoria_pods.stdout }} delegate_to: localhost connection: local run_once: true From 9bd5250b26e756c1c659aae85313ab7387013a4a Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Wed, 4 Feb 2026 21:11:22 +0530 Subject: [PATCH 024/172] Update external_victoria_connect_details.yml --- utils/external_victoria_connect_details.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/utils/external_victoria_connect_details.yml b/utils/external_victoria_connect_details.yml index 3d29b4f720..ad4ed542df 100644 --- a/utils/external_victoria_connect_details.yml +++ b/utils/external_victoria_connect_details.yml @@ -13,6 +13,19 @@ # limitations under the License. --- +- name: Preflight - validate inventory + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Fail if service_kube_control_plane group is missing or empty + ansible.builtin.fail: + msg: >- + Inventory must define a non-empty 'service_kube_control_plane' group. + Run with '-i ' and ensure at least one host is in that group. 
+ when: + - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) == 0 + - name: Fetch external Victoria connection details hosts: service_kube_control_plane connection: ssh From 01c213ba7b9ce758fc41d7bb388b1a8da357cd63 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Wed, 4 Feb 2026 21:26:25 +0530 Subject: [PATCH 025/172] Update main.yml --- .../tasks/main.yml | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index bdcb15ca0b..d1b0286d76 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -151,19 +151,22 @@ - name: Display Victoria connection details ansible.builtin.debug: - msg: | - Victoria connection details written to: {{ victoria_output_file }} - - Endpoints: - vminsert write: https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write - vmselect query: https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query - vmselect UI: https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui - - TLS: - server.crt: {{ victoria_tls_cert }} - - Pods: - {{ victoria_pods.stdout }} + msg: >- + {{ + [ + 'Victoria connection details written to: ' ~ victoria_output_file, + '', + 'Endpoints:', + ' vminsert write: https://' ~ vminsert_host ~ ':' ~ vminsert_port ~ '/insert/0/prometheus/api/v1/write', + ' vmselect query: https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/prometheus/api/v1/query', + ' vmselect UI: https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/vmui', + '', + 'TLS:', + ' server.crt: ' ~ victoria_tls_cert, + '', + 'Pods:' + ] + (victoria_pods.stdout_lines | default([])) + }} delegate_to: localhost connection: local run_once: true From e7887b800ed44bfe3e5b6a9d4110c1a3a7afe39c Mon Sep 17 00:00:00 2001 
From: Abhishek S A Date: Wed, 4 Feb 2026 21:32:39 +0530 Subject: [PATCH 026/172] Update main.yml --- .../tasks/main.yml | 105 +++++++++++++++--- 1 file changed, 92 insertions(+), 13 deletions(-) diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index d1b0286d76..851528e9eb 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -19,14 +19,92 @@ changed_when: false failed_when: kubectl_check.rc != 0 +- name: Check for Victoria cluster services + ansible.builtin.command: >- + kubectl get svc {{ item }} -n {{ victoria_namespace }} -o name + loop: + - vminsert + - vmselect + register: victoria_cluster_svcs + changed_when: false + failed_when: false + +- name: Check for Victoria single-node service + ansible.builtin.command: >- + kubectl get svc victoria-loadbalancer -n {{ victoria_namespace }} -o name + register: victoria_single_svc + changed_when: false + failed_when: false + +- name: Set Victoria deployment mode + ansible.builtin.set_fact: + victoria_deployment_mode: >- + {{ + 'cluster' + if (victoria_cluster_svcs.results | selectattr('rc', 'equalto', 0) | list | length) == 2 + else ('single-node' if victoria_single_svc.rc == 0 else 'unknown') + }} + +- name: Fail if Victoria cluster mode is not deployed + ansible.builtin.fail: + msg: >- + Victoria deployment mode detected: {{ victoria_deployment_mode }}. + External integration is supported only for Victoria cluster mode (vminsert/vmselect/vmstorage). + Single-node Victoria (victoria-loadbalancer) is not supported for external integration. 
+ when: victoria_deployment_mode != 'cluster' + - name: Get Victoria pods status ansible.builtin.shell: >- kubectl get pods -n {{ victoria_namespace }} -l "app in (vminsert,vmselect,vmstorage,victoriametrics)" -o wide - register: victoria_pods + register: victoria_pods_wide + changed_when: false + failed_when: victoria_pods_wide.rc != 0 + +- name: Get Victoria pods status (json) + ansible.builtin.shell: >- + kubectl get pods -n {{ victoria_namespace }} + -l "app in (vminsert,vmselect,vmstorage,victoriametrics)" + -o json + register: victoria_pods_json changed_when: false - failed_when: victoria_pods.rc != 0 + failed_when: victoria_pods_json.rc != 0 + +- name: Parse Victoria pods + ansible.builtin.set_fact: + victoria_pods_parsed: "{{ victoria_pods_json.stdout | from_json }}" + +- name: Fail if no Victoria pods found + ansible.builtin.fail: + msg: "No Victoria pods found in namespace '{{ victoria_namespace }}'." + when: (victoria_pods_parsed.items | default([]) | length) == 0 + +- name: Fail if Victoria pods are not Running + ansible.builtin.fail: + msg: "One or more Victoria pods are not in Running state." + when: >- + {{ + (victoria_pods_parsed.items | default([]) + | selectattr('status.phase', 'ne', 'Running') + | list + | length) > 0 + }} + +- name: Fail if Victoria pods are not Ready + ansible.builtin.fail: + msg: "One or more Victoria pods are not Ready." 
+ when: >- + {{ + (victoria_pods_parsed.items | default([]) + | selectattr('status.containerStatuses', 'defined') + | map(attribute='status.containerStatuses') + | list + | flatten + | selectattr('ready', 'equalto', false) + | list + | length) > 0 + }} - name: Get vminsert service LoadBalancer IP ansible.builtin.command: >- @@ -34,7 +112,7 @@ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' register: vminsert_lb_ip changed_when: false - failed_when: false + failed_when: vminsert_lb_ip.rc != 0 - name: Get vminsert service LoadBalancer hostname ansible.builtin.command: >- @@ -42,7 +120,7 @@ -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' register: vminsert_lb_hostname changed_when: false - failed_when: false + failed_when: vminsert_lb_hostname.rc != 0 - name: Get vminsert service external port ansible.builtin.command: >- @@ -50,7 +128,7 @@ -o jsonpath='{.spec.ports[0].port}' register: vminsert_lb_port changed_when: false - failed_when: false + failed_when: vminsert_lb_port.rc != 0 - name: Get vmselect service LoadBalancer IP ansible.builtin.command: >- @@ -58,7 +136,7 @@ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' register: vmselect_lb_ip changed_when: false - failed_when: false + failed_when: vmselect_lb_ip.rc != 0 - name: Get vmselect service LoadBalancer hostname ansible.builtin.command: >- @@ -66,7 +144,7 @@ -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' register: vmselect_lb_hostname changed_when: false - failed_when: false + failed_when: vmselect_lb_hostname.rc != 0 - name: Get vmselect service external port ansible.builtin.command: >- @@ -74,7 +152,7 @@ -o jsonpath='{.spec.ports[0].port}' register: vmselect_lb_port changed_when: false - failed_when: false + failed_when: vmselect_lb_port.rc != 0 - name: Set endpoint facts ansible.builtin.set_fact: @@ -116,7 +194,8 @@ victoria_connect_details: victoria: namespace: "{{ victoria_namespace }}" - pod_status: "{{ victoria_pods.stdout }}" + deployment_mode: "{{ victoria_deployment_mode }}" 
+ pod_status: "{{ victoria_pods_wide.stdout }}" base_url: "https://{{ vminsert_host }}:{{ vminsert_port }}" endpoints: vminsert: @@ -156,16 +235,16 @@ [ 'Victoria connection details written to: ' ~ victoria_output_file, '', + 'Mode: ' ~ victoria_deployment_mode, + '', 'Endpoints:', ' vminsert write: https://' ~ vminsert_host ~ ':' ~ vminsert_port ~ '/insert/0/prometheus/api/v1/write', ' vmselect query: https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/prometheus/api/v1/query', ' vmselect UI: https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/vmui', '', 'TLS:', - ' server.crt: ' ~ victoria_tls_cert, - '', - 'Pods:' - ] + (victoria_pods.stdout_lines | default([])) + ' server.crt: ' ~ victoria_tls_cert + ] }} delegate_to: localhost connection: local From ed8ef8fbb21c5165c7acb36856ad1f93bfabee38 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Wed, 4 Feb 2026 21:35:51 +0530 Subject: [PATCH 027/172] Update main.yml --- .../roles/external_victoria_connect_details/tasks/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 851528e9eb..758719043c 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -78,14 +78,14 @@ - name: Fail if no Victoria pods found ansible.builtin.fail: msg: "No Victoria pods found in namespace '{{ victoria_namespace }}'." - when: (victoria_pods_parsed.items | default([]) | length) == 0 + when: (victoria_pods_parsed.get('items', []) | length) == 0 - name: Fail if Victoria pods are not Running ansible.builtin.fail: msg: "One or more Victoria pods are not in Running state." 
when: >- {{ - (victoria_pods_parsed.items | default([]) + (victoria_pods_parsed.get('items', []) | selectattr('status.phase', 'ne', 'Running') | list | length) > 0 @@ -96,7 +96,7 @@ msg: "One or more Victoria pods are not Ready." when: >- {{ - (victoria_pods_parsed.items | default([]) + (victoria_pods_parsed.get('items', []) | selectattr('status.containerStatuses', 'defined') | map(attribute='status.containerStatuses') | list From b8e806ecc665764533d41116558060e79b04e325 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Wed, 4 Feb 2026 21:38:54 +0530 Subject: [PATCH 028/172] Update main.yml --- .../external_victoria_connect_details/tasks/main.yml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 758719043c..4ec7f0c901 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -83,20 +83,17 @@ - name: Fail if Victoria pods are not Running ansible.builtin.fail: msg: "One or more Victoria pods are not in Running state." - when: >- - {{ - (victoria_pods_parsed.get('items', []) + when: + - (victoria_pods_parsed.get('items', []) | selectattr('status.phase', 'ne', 'Running') | list | length) > 0 - }} - name: Fail if Victoria pods are not Ready ansible.builtin.fail: msg: "One or more Victoria pods are not Ready." 
- when: >- - {{ - (victoria_pods_parsed.get('items', []) + when: + - (victoria_pods_parsed.get('items', []) | selectattr('status.containerStatuses', 'defined') | map(attribute='status.containerStatuses') | list @@ -104,7 +101,6 @@ | selectattr('ready', 'equalto', false) | list | length) > 0 - }} - name: Get vminsert service LoadBalancer IP ansible.builtin.command: >- From 0262dd365f21b052b116eab7006859f46ad60e84 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 4 Feb 2026 21:40:53 +0530 Subject: [PATCH 029/172] The input for thew custom confs now exist on the core container --- discovery/roles/slurm_config/tasks/confs.yml | 44 +++++++++++++++++--- input/omnia_config.yml | 4 +- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index f3228fa460..e1b4e2d3ea 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -27,7 +27,8 @@ - name: Slurm dbd opts ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) - | combine({'slurmdbd': (apply_config['slurmdbd'] | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}" + | combine({'slurmdbd': (apply_config['slurmdbd'] + | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}" when: ctld_list - name: Check .conf files existence @@ -37,21 +38,52 @@ loop: "{{ ctld_list | product(conf_files | default([])) }}" register: ctld_conf_files +- name: Parse configs_input files from localhost (if they are paths) + slurm_conf: + op: parse + conf_name: "{{ item.key }}" + path: "{{ item.value }}" + delegate_to: localhost + loop: "{{ configs_input | default({}) | dict2items }}" + register: parsed_configs_input_results + when: + - configs_input is defined + - configs_input + - item.value is abs + +- name: Build parsed_configs_input dictionary from parsed files + ansible.builtin.set_fact: + parsed_configs_input: "{{ 
parsed_configs_input | default({}) | combine({item.item.key: item.conf_dict}) }}" + loop: "{{ parsed_configs_input_results.results }}" + when: + - parsed_configs_input_results is defined + - not parsed_configs_input_results.skipped | default(false) + +- name: Add configs_input dicts that are already parsed + ansible.builtin.set_fact: + parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.key: item.value}) }}" + loop: "{{ configs_input | default({}) | dict2items }}" + when: + - configs_input is defined + - configs_input + - item.value is mapping + - name: Create lists for conf_merge ansible.builtin.set_fact: conf_merge_dict: "{{ conf_merge_dict | default({}) | combine({ - conf_set.item.1: ( - [apply_config[conf_set.item.1]] - + ([conf_set.stat.path] if conf_set.stat.exists else []) - + ([configs_input.get(conf_set.item.1)] if configs_input.get(conf_set.item.1) else []) + existing_conf_set.item.1: ( + [apply_config[existing_conf_set.item.1]] + + ([existing_conf_set.stat.path] if existing_conf_set.stat.exists else []) + + ([parsed_configs_input.get(existing_conf_set.item.1)] + if parsed_configs_input is defined and parsed_configs_input.get(existing_conf_set.item.1) else []) ) }) }}" loop: "{{ ctld_conf_files.results }}" loop_control: - loop_var: conf_set + loop_var: existing_conf_set register: prepared_conf_lists - name: Prepend ClusterName and SlurmctldHost to slurm conf sources diff --git a/input/omnia_config.yml b/input/omnia_config.yml index 3c4b3dbc35..032fa77ce0 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -32,13 +32,12 @@ # : # or # Supply the configuration values directly as a key–value map -# Supply the absolute path to a custom configuration file on the OIM server +# Supply the absolute path to a custom configuration file # The conf files supported by slurm are # slurm # cgroup # slurmdbd # gres -# mpi # Thes files will be written into the slurm_config directory with .conf suffix slurm_cluster: @@ -62,7 +61,6 @@ 
slurm_cluster: # cgroup: /path/to/custom_cgroup.conf # slurmdbd: /path/to/custom_slurmdbd.conf # gres: /path/to/custom_gres.conf - # mpi: /path/to/custom_mpi.conf # ----------------------------SERVICE K8S------------------------------------------------------ # For service k8s cluster below parameters are required,(List) From a2dc5e132fd3f78e841a5a1088bce86d5ef4db9d Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Wed, 4 Feb 2026 21:45:24 +0530 Subject: [PATCH 030/172] kafka update --- utils/external_kafka_connect_details.yml | 34 +++ .../tasks/main.yml | 215 ++++++++++++++++++ .../vars/main.yml | 22 ++ 3 files changed, 271 insertions(+) create mode 100644 utils/external_kafka_connect_details.yml create mode 100644 utils/roles/external_kafka_connect_details/tasks/main.yml create mode 100644 utils/roles/external_kafka_connect_details/vars/main.yml diff --git a/utils/external_kafka_connect_details.yml b/utils/external_kafka_connect_details.yml new file mode 100644 index 0000000000..a51a75aa3f --- /dev/null +++ b/utils/external_kafka_connect_details.yml @@ -0,0 +1,34 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Preflight - validate inventory + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Fail if service_kube_control_plane group is missing or empty + ansible.builtin.fail: + msg: >- + Inventory must define a non-empty 'service_kube_control_plane' group. + Run with '-i ' and ensure at least one host is in that group. + when: + - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) == 0 + +- name: Fetch external Kafka connection details + hosts: service_kube_control_plane + connection: ssh + gather_facts: false + roles: + - external_kafka_connect_details diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml new file mode 100644 index 0000000000..169c83bad3 --- /dev/null +++ b/utils/roles/external_kafka_connect_details/tasks/main.yml @@ -0,0 +1,215 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check kubectl presence + ansible.builtin.command: kubectl version --client=true + register: kubectl_check + changed_when: false + failed_when: kubectl_check.rc != 0 + +- name: Get Kafka pod status + ansible.builtin.command: >- + kubectl get pods -n {{ kafka_namespace }} + -l app.kubernetes.io/name=kafka + -o wide + register: kafka_pods + changed_when: false + failed_when: false + +- name: Get Kafka pod status (json) + ansible.builtin.command: >- + kubectl get pods -n {{ kafka_namespace }} + -l app.kubernetes.io/name=kafka + -o json + register: kafka_pods_json + changed_when: false + failed_when: kafka_pods_json.rc != 0 + +- name: Parse Kafka pods + ansible.builtin.set_fact: + kafka_pods_parsed: "{{ kafka_pods_json.stdout | from_json }}" + +- name: Fail if no Kafka pods found + ansible.builtin.fail: + msg: "No Kafka pods found in namespace '{{ kafka_namespace }}'." + when: (kafka_pods_parsed.get('items', []) | length) == 0 + +- name: Fail if Kafka pods are not Running + ansible.builtin.fail: + msg: "One or more Kafka pods are not in Running state." + when: + - (kafka_pods_parsed.get('items', []) + | selectattr('status.phase', 'ne', 'Running') + | list + | length) > 0 + +- name: Fail if Kafka pods are not Ready + ansible.builtin.fail: + msg: "One or more Kafka pods are not Ready." 
+ when: + - (kafka_pods_parsed.get('items', []) + | selectattr('status.containerStatuses', 'defined') + | map(attribute='status.containerStatuses') + | list + | flatten + | selectattr('ready', 'equalto', false) + | list + | length) > 0 + +- name: Get Kafka LoadBalancer IP + ansible.builtin.command: >- + kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }} + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' + register: kafka_lb_ip + changed_when: false + failed_when: kafka_lb_ip.rc != 0 + +- name: Get Kafka LoadBalancer hostname + ansible.builtin.command: >- + kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }} + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' + register: kafka_lb_hostname + changed_when: false + failed_when: kafka_lb_hostname.rc != 0 + +- name: Get Kafka LoadBalancer external port + ansible.builtin.command: >- + kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }} + -o jsonpath='{.spec.ports[0].port}' + register: kafka_lb_port + changed_when: false + failed_when: kafka_lb_port.rc != 0 + +- name: Set Kafka external endpoint + ansible.builtin.set_fact: + kafka_external_host: >- + {{ + (kafka_lb_ip.stdout | trim) + if (kafka_lb_ip.stdout | trim | length) > 0 + else (kafka_lb_hostname.stdout | trim) + }} + kafka_external_port: "{{ (kafka_lb_port.stdout | trim) | default('') }}" + +- name: Fail when Kafka external endpoint is not available + ansible.builtin.fail: + msg: >- + Failed to fetch Kafka LoadBalancer endpoint. Ensure service '{{ kafka_lb_service_name }}' + exists in namespace '{{ kafka_namespace }}' and has an external IP/hostname assigned. 
+ when: kafka_external_host | trim | length == 0 + +- name: Set Kafka external port fallback + ansible.builtin.set_fact: + kafka_external_port: "{{ kafka_bootstrap_port | string }}" + when: kafka_external_port | trim | length == 0 + +- name: Ensure output directory exists + ansible.builtin.file: + path: "{{ kafka_output_dir }}" + state: directory + mode: "0755" + delegate_to: localhost + connection: local + run_once: true + +- name: Read Kafka cluster CA cert from secret + ansible.builtin.command: >- + kubectl get secret {{ kafka_cluster_ca_secret }} -n {{ kafka_namespace }} + -o jsonpath='{.data.ca\.crt}' + register: kafka_ca_crt_b64 + changed_when: false + failed_when: kafka_ca_crt_b64.rc != 0 or (kafka_ca_crt_b64.stdout | trim | length == 0) + +- name: Read Kafka client cert from secret + ansible.builtin.command: >- + kubectl get secret {{ kafka_client_secret }} -n {{ kafka_namespace }} + -o jsonpath='{.data.user\.crt}' + register: kafka_user_crt_b64 + changed_when: false + failed_when: kafka_user_crt_b64.rc != 0 or (kafka_user_crt_b64.stdout | trim | length == 0) + +- name: Read Kafka client key from secret + ansible.builtin.command: >- + kubectl get secret {{ kafka_client_secret }} -n {{ kafka_namespace }} + -o jsonpath='{.data.user\.key}' + register: kafka_user_key_b64 + changed_when: false + failed_when: kafka_user_key_b64.rc != 0 or (kafka_user_key_b64.stdout | trim | length == 0) + +- name: Write Kafka CA/cert/key files + ansible.builtin.copy: + content: "{{ item.content }}" + dest: "{{ item.dest }}" + mode: "0600" + loop: + - dest: "{{ kafka_output_dir }}/ca.crt" + content: "{{ kafka_ca_crt_b64.stdout | b64decode }}" + - dest: "{{ kafka_output_dir }}/user.crt" + content: "{{ kafka_user_crt_b64.stdout | b64decode }}" + - dest: "{{ kafka_output_dir }}/user.key" + content: "{{ kafka_user_key_b64.stdout | b64decode }}" + delegate_to: localhost + connection: local + run_once: true + +- name: Build Kafka connection details + ansible.builtin.set_fact: + 
kafka_connect_details: + kafka: + namespace: "{{ kafka_namespace }}" + loadbalancer_service: "{{ kafka_lb_service_name }}" + pod_status: "{{ kafka_pods.stdout | default('') }}" + bootstrap_server: "{{ kafka_external_host }}:{{ kafka_external_port }}" + tls: + ca_crt: "{{ kafka_output_dir }}/ca.crt" + client_crt: "{{ kafka_output_dir }}/user.crt" + client_key: "{{ kafka_output_dir }}/user.key" + +- name: Ensure output file directory exists + ansible.builtin.file: + path: "{{ kafka_output_file | dirname }}" + state: directory + mode: "0755" + delegate_to: localhost + connection: local + run_once: true + +- name: Write Kafka connection details to file + ansible.builtin.copy: + content: "{{ kafka_connect_details | to_nice_yaml }}" + dest: "{{ kafka_output_file }}" + mode: "0644" + delegate_to: localhost + connection: local + run_once: true + +- name: Display Kafka connection details + ansible.builtin.debug: + msg: >- + {{ + [ + 'Kafka connection details written to: ' ~ kafka_output_file, + '', + 'Bootstrap: ' ~ kafka_external_host ~ ':' ~ kafka_external_port, + '', + 'TLS:', + ' CA: ' ~ kafka_output_dir ~ '/ca.crt', + ' client cert: ' ~ kafka_output_dir ~ '/user.crt', + ' client key: ' ~ kafka_output_dir ~ '/user.key', + '' + ] + }} + delegate_to: localhost + connection: local + run_once: true diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml new file mode 100644 index 0000000000..fd2455b550 --- /dev/null +++ b/utils/roles/external_kafka_connect_details/vars/main.yml @@ -0,0 +1,22 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +kafka_namespace: "telemetry" +kafka_lb_service_name: "kafka-kafka-external-bootstrap" +kafka_bootstrap_port: 9094 +kafka_cluster_ca_secret: "kafka-cluster-ca-cert" +kafka_client_secret: "kafkapump" +kafka_output_dir: "/opt/omnia/telemetry/external_kafka" +kafka_output_file: "/opt/omnia/telemetry/external_kafka_connect_details.yml" From bc4f61db34ab7c4da5a546c5a4b6820fbba2ed4b Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Thu, 5 Feb 2026 11:46:52 +0530 Subject: [PATCH 031/172] kafka and victoria update --- .../tasks/main.yml | 50 +++++-------------- .../vars/main.yml | 8 +++ .../tasks/main.yml | 15 ++---- .../vars/main.yml | 13 +++++ 4 files changed, 39 insertions(+), 47 deletions(-) diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml index 169c83bad3..61d0811815 100644 --- a/utils/roles/external_kafka_connect_details/tasks/main.yml +++ b/utils/roles/external_kafka_connect_details/tasks/main.yml @@ -43,12 +43,12 @@ - name: Fail if no Kafka pods found ansible.builtin.fail: - msg: "No Kafka pods found in namespace '{{ kafka_namespace }}'." + msg: "{{ kafka_err_no_pods_found }}" when: (kafka_pods_parsed.get('items', []) | length) == 0 - name: Fail if Kafka pods are not Running ansible.builtin.fail: - msg: "One or more Kafka pods are not in Running state." 
+ msg: "{{ kafka_err_pods_not_running }}" when: - (kafka_pods_parsed.get('items', []) | selectattr('status.phase', 'ne', 'Running') @@ -57,7 +57,7 @@ - name: Fail if Kafka pods are not Ready ansible.builtin.fail: - msg: "One or more Kafka pods are not Ready." + msg: "{{ kafka_err_pods_not_ready }}" when: - (kafka_pods_parsed.get('items', []) | selectattr('status.containerStatuses', 'defined') @@ -76,43 +76,15 @@ changed_when: false failed_when: kafka_lb_ip.rc != 0 -- name: Get Kafka LoadBalancer hostname - ansible.builtin.command: >- - kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }} - -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' - register: kafka_lb_hostname - changed_when: false - failed_when: kafka_lb_hostname.rc != 0 - -- name: Get Kafka LoadBalancer external port - ansible.builtin.command: >- - kubectl get svc {{ kafka_lb_service_name }} -n {{ kafka_namespace }} - -o jsonpath='{.spec.ports[0].port}' - register: kafka_lb_port - changed_when: false - failed_when: kafka_lb_port.rc != 0 - - name: Set Kafka external endpoint ansible.builtin.set_fact: - kafka_external_host: >- - {{ - (kafka_lb_ip.stdout | trim) - if (kafka_lb_ip.stdout | trim | length) > 0 - else (kafka_lb_hostname.stdout | trim) - }} - kafka_external_port: "{{ (kafka_lb_port.stdout | trim) | default('') }}" + kafka_external_ip: "{{ kafka_lb_ip.stdout | trim }}" + kafka_external_port: "{{ kafka_bootstrap_port | string }}" - name: Fail when Kafka external endpoint is not available ansible.builtin.fail: - msg: >- - Failed to fetch Kafka LoadBalancer endpoint. Ensure service '{{ kafka_lb_service_name }}' - exists in namespace '{{ kafka_namespace }}' and has an external IP/hostname assigned. 
- when: kafka_external_host | trim | length == 0 - -- name: Set Kafka external port fallback - ansible.builtin.set_fact: - kafka_external_port: "{{ kafka_bootstrap_port | string }}" - when: kafka_external_port | trim | length == 0 + msg: "{{ kafka_err_external_ip_missing }}" + when: kafka_external_ip | trim | length == 0 - name: Ensure output directory exists ansible.builtin.file: @@ -170,7 +142,7 @@ namespace: "{{ kafka_namespace }}" loadbalancer_service: "{{ kafka_lb_service_name }}" pod_status: "{{ kafka_pods.stdout | default('') }}" - bootstrap_server: "{{ kafka_external_host }}:{{ kafka_external_port }}" + bootstrap_server: "{{ kafka_external_ip }}:{{ kafka_external_port }}" tls: ca_crt: "{{ kafka_output_dir }}/ca.crt" client_crt: "{{ kafka_output_dir }}/user.crt" @@ -201,12 +173,16 @@ [ 'Kafka connection details written to: ' ~ kafka_output_file, '', - 'Bootstrap: ' ~ kafka_external_host ~ ':' ~ kafka_external_port, + 'Bootstrap: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port, '', 'TLS:', ' CA: ' ~ kafka_output_dir ~ '/ca.crt', ' client cert: ' ~ kafka_output_dir ~ '/user.crt', ' client key: ' ~ kafka_output_dir ~ '/user.key', + '', + 'OME note (client cert):', + ' Create a certificate in .pfx format (provide a passphrase when prompted):', + ' openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt', '' ] }} diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml index fd2455b550..d0bd070d47 100644 --- a/utils/roles/external_kafka_connect_details/vars/main.yml +++ b/utils/roles/external_kafka_connect_details/vars/main.yml @@ -20,3 +20,11 @@ kafka_cluster_ca_secret: "kafka-cluster-ca-cert" kafka_client_secret: "kafkapump" kafka_output_dir: "/opt/omnia/telemetry/external_kafka" kafka_output_file: "/opt/omnia/telemetry/external_kafka_connect_details.yml" + +kafka_err_no_pods_found: "No Kafka pods found in namespace '{{ kafka_namespace }}'." 
+kafka_err_pods_not_running: "One or more Kafka pods are not in Running state." +kafka_err_pods_not_ready: "One or more Kafka pods are not Ready." + +kafka_err_external_ip_missing: >- + Failed to fetch Kafka LoadBalancer external IP. Ensure service '{{ kafka_lb_service_name }}' + exists in namespace '{{ kafka_namespace }}' and has an external IP assigned. diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 4ec7f0c901..90dac54cca 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -47,10 +47,7 @@ - name: Fail if Victoria cluster mode is not deployed ansible.builtin.fail: - msg: >- - Victoria deployment mode detected: {{ victoria_deployment_mode }}. - External integration is supported only for Victoria cluster mode (vminsert/vmselect/vmstorage). - Single-node Victoria (victoria-loadbalancer) is not supported for external integration. + msg: "{{ victoria_err_mode_not_supported }}" when: victoria_deployment_mode != 'cluster' - name: Get Victoria pods status @@ -77,12 +74,12 @@ - name: Fail if no Victoria pods found ansible.builtin.fail: - msg: "No Victoria pods found in namespace '{{ victoria_namespace }}'." + msg: "{{ victoria_err_no_pods_found }}" when: (victoria_pods_parsed.get('items', []) | length) == 0 - name: Fail if Victoria pods are not Running ansible.builtin.fail: - msg: "One or more Victoria pods are not in Running state." + msg: "{{ victoria_err_pods_not_running }}" when: - (victoria_pods_parsed.get('items', []) | selectattr('status.phase', 'ne', 'Running') @@ -91,7 +88,7 @@ - name: Fail if Victoria pods are not Ready ansible.builtin.fail: - msg: "One or more Victoria pods are not Ready." 
+ msg: "{{ victoria_err_pods_not_ready }}" when: - (victoria_pods_parsed.get('items', []) | selectattr('status.containerStatuses', 'defined') @@ -172,9 +169,7 @@ - name: Fail when LoadBalancer IPs are not available ansible.builtin.fail: - msg: >- - Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect' - exist in namespace '{{ victoria_namespace }}' and have external IPs assigned. + msg: "{{ victoria_err_lb_missing }}" when: - vminsert_host | trim | length == 0 or vmselect_host | trim | length == 0 diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml index 29db9136f2..ea1c083deb 100644 --- a/utils/roles/external_victoria_connect_details/vars/main.yml +++ b/utils/roles/external_victoria_connect_details/vars/main.yml @@ -16,3 +16,16 @@ victoria_namespace: "telemetry" victoria_output_file: "/opt/omnia/telemetry/external_victoria_connect_details.yml" victoria_tls_cert_dir: "/opt/omnia/telemetry/victoria-certs" + +victoria_err_mode_not_supported: >- + Victoria deployment mode detected: {{ victoria_deployment_mode }}. + External integration is supported only for Victoria cluster mode (vminsert/vmselect/vmstorage). + Single-node Victoria (victoria-loadbalancer) is not supported for external integration. + +victoria_err_no_pods_found: "No Victoria pods found in namespace '{{ victoria_namespace }}'." +victoria_err_pods_not_running: "One or more Victoria pods are not in Running state." +victoria_err_pods_not_ready: "One or more Victoria pods are not Ready." + +victoria_err_lb_missing: >- + Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect' + exist in namespace '{{ victoria_namespace }}' and have external IPs assigned. 
From d3f6f7e9cef0cae0579d8ea1cf2e49805afcf3f2 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Thu, 5 Feb 2026 12:29:16 +0530 Subject: [PATCH 032/172] update sfm and ome --- .../tasks/main.yml | 10 ++++++---- .../tasks/main.yml | 20 ++++++++++++++++++- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml index 61d0811815..0c4d525a82 100644 --- a/utils/roles/external_kafka_connect_details/tasks/main.yml +++ b/utils/roles/external_kafka_connect_details/tasks/main.yml @@ -173,16 +173,18 @@ [ 'Kafka connection details written to: ' ~ kafka_output_file, '', - 'Bootstrap: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port, + 'Kafka external endpoint: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port, '', 'TLS:', ' CA: ' ~ kafka_output_dir ~ '/ca.crt', ' client cert: ' ~ kafka_output_dir ~ '/user.crt', ' client key: ' ~ kafka_output_dir ~ '/user.key', '', - 'OME note (client cert):', - ' Create a certificate in .pfx format (provide a passphrase when prompted):', - ' openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt', + 'OME note (mTLS):', + ' Use ca.crt as the server certificate in OME.', + ' Create a client certificate in .pfx format (provide a passphrase when prompted):', + ' openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt', + ' Use user.pfx as the client certificate in OME.', '' ] }} diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 90dac54cca..c44c145921 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -173,6 +173,15 @@ when: - vminsert_host | trim | length == 0 or vmselect_host | trim | length == 0 +- name: Build SFM hosts entry + ansible.builtin.set_fact: + victoria_sfm_hosts_entry: >- + {{ + 'echo "' ~ 
(vminsert_lb_ip.stdout | trim) ~ ' vminsert.' ~ victoria_namespace ~ '.svc.cluster.local" >> /etc/hosts' + if (vminsert_lb_ip.stdout | trim | length) > 0 + else '' + }} + - name: Set Victoria external port fallbacks ansible.builtin.set_fact: vminsert_port: "8480" @@ -200,6 +209,10 @@ ui_url: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui" tls: server_crt: "{{ victoria_tls_cert }}" + notes: + sfm: + vminsert_write_url: "https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write" + hosts_entry: "{{ victoria_sfm_hosts_entry }}" - name: Ensure output directory exists ansible.builtin.file: @@ -234,7 +247,12 @@ ' vmselect UI: https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/vmui', '', 'TLS:', - ' server.crt: ' ~ victoria_tls_cert + ' server.crt: ' ~ victoria_tls_cert, + '', + 'SFM note:', + ' Use vminsert write URL for SFM: https://' ~ vminsert_host ~ ':' ~ vminsert_port ~ '/insert/0/prometheus/api/v1/write', + ' Add this entry to /etc/hosts on the SFM server:', + ' ' ~ (victoria_sfm_hosts_entry if (victoria_sfm_hosts_entry | length) > 0 else 'LoadBalancer IP not available; cannot generate /etc/hosts entry.') ] }} delegate_to: localhost From 4a6a7d7fbca7bbaf4c1cadf2b9ec440e0f94d895 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 5 Feb 2026 13:19:53 +0530 Subject: [PATCH 033/172] When mix of path and map were provided, was causing issues --- discovery/roles/slurm_config/tasks/confs.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index e1b4e2d3ea..33315709cc 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -49,7 +49,7 @@ when: - configs_input is defined - configs_input - - item.value is abs + - item.value is string - name: Build parsed_configs_input dictionary from parsed files ansible.builtin.set_fact: @@ -57,7 +57,7 @@ loop: "{{ 
parsed_configs_input_results.results }}" when: - parsed_configs_input_results is defined - - not parsed_configs_input_results.skipped | default(false) + - not item.skipped | default(false) - name: Add configs_input dicts that are already parsed ansible.builtin.set_fact: From 07758fa988e2c91768bff545ce4ed7aad53c01ff Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Thu, 5 Feb 2026 11:01:56 +0000 Subject: [PATCH 034/172] Updating additional_packages.json for aarch64 with service_k8s functional groups --- .../aarch64/rhel/10.0/additional_packages.json | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/input/config/aarch64/rhel/10.0/additional_packages.json b/input/config/aarch64/rhel/10.0/additional_packages.json index b01c3f78b5..0d6d9a0452 100644 --- a/input/config/aarch64/rhel/10.0/additional_packages.json +++ b/input/config/aarch64/rhel/10.0/additional_packages.json @@ -4,6 +4,21 @@ ] }, + "service_kube_control_plane_first": { + "cluster": [ + + ] + }, + "service_kube_control_plane": { + "cluster": [ + + ] + }, + "service_kube_node": { + "cluster": [ + + ] + }, "slurm_control_node": { "cluster": [ From 51349058819406ee0a9a2e2cac8594ea127c45a3 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 5 Feb 2026 17:14:58 +0530 Subject: [PATCH 035/172] Upgrade of network_spec.yml, software_config.json and pxe_mapping_file.csv --- .../import_input_parameters/tasks/main.yml | 9 + .../tasks/restore_pxe_mapping_file.yml | 49 +++++ .../tasks/restore_software_config.yml | 60 ++++++ .../tasks/transform_network_spec.yml | 192 ++++++++++++++++++ .../templates/network_spec.j2 | 61 +++++- .../import_input_parameters/vars/main.yml | 4 + 6 files changed, 365 insertions(+), 10 deletions(-) create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_software_config.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml 
diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml index f4c5b1b7cb..af45a1de1b 100644 --- a/upgrade/roles/import_input_parameters/tasks/main.yml +++ b/upgrade/roles/import_input_parameters/tasks/main.yml @@ -12,3 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. --- + +- name: Transform network_spec.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_network_spec.yml + +- name: Restore software_config.json from backup + ansible.builtin.include_tasks: restore_software_config.yml + +- name: Restore pxe_mapping_file.csv from backup + ansible.builtin.include_tasks: restore_pxe_mapping_file.yml diff --git a/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml b/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml new file mode 100644 index 0000000000..f468359305 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml @@ -0,0 +1,49 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Validate backup_location is provided + ansible.builtin.fail: + msg: "backup_location must be provided to restore pxe_mapping_file.csv" + when: backup_location is not defined or (backup_location | string | trim) == "" + +- name: Ensure backup directory exists + ansible.builtin.file: + path: "{{ backup_location }}" + state: directory + mode: '0755' + +- name: Check if backup pxe_mapping_file.csv exists + ansible.builtin.stat: + path: "{{ backup_location }}/pxe_mapping_file.csv" + register: backup_pxe_mapping_stat + +- name: Fail if backup pxe_mapping_file.csv is not present + ansible.builtin.fail: + msg: "Backup pxe_mapping_file.csv is not present at {{ backup_location }}/pxe_mapping_file.csv" + when: not backup_pxe_mapping_stat.stat.exists + +- name: Overwrite pxe_mapping_file.csv in input directory from backup + ansible.builtin.copy: + src: "{{ backup_location }}/pxe_mapping_file.csv" + dest: "{{ omnia_input_dir }}/pxe_mapping_file.csv" + mode: '0644' + remote_src: true + +- name: Display pxe_mapping_file.csv restore summary + ansible.builtin.debug: + msg: | + pxe_mapping_file.csv restored from backup. + Backup preserved at: {{ backup_location }}/pxe_mapping_file.csv + Restored to: {{ omnia_input_dir }}/pxe_mapping_file.csv diff --git a/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml b/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml new file mode 100644 index 0000000000..9891023702 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml @@ -0,0 +1,60 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Validate backup_location is provided + ansible.builtin.fail: + msg: "backup_location must be provided to restore software_config.json" + when: backup_location is not defined or (backup_location | string | trim) == "" + +- name: Ensure backup directory exists + ansible.builtin.file: + path: "{{ backup_location }}" + state: directory + mode: '0755' + +- name: Check if backup software_config.json exists + ansible.builtin.stat: + path: "{{ backup_location }}/software_config.json" + register: backup_software_config_stat + +- name: Fail if backup software_config.json is not present + ansible.builtin.fail: + msg: "Backup software_config.json is not present at {{ backup_location }}/software_config.json" + when: not backup_software_config_stat.stat.exists + +- name: Overwrite software_config.json in input directory from backup + ansible.builtin.copy: + src: "{{ backup_location }}/software_config.json" + dest: "{{ omnia_input_dir }}/software_config.json" + mode: '0644' + remote_src: true + +- name: Validate JSON syntax of software_config.json + ansible.builtin.command: + cmd: python3 -m json.tool "{{ omnia_input_dir }}/software_config.json" + register: software_config_json_validation + changed_when: false + +- name: Fail if software_config.json JSON validation fails + ansible.builtin.fail: + msg: "JSON validation failed after restoring software_config.json" + when: software_config_json_validation.rc != 0 + +- name: Display software_config.json restore summary + ansible.builtin.debug: + msg: | + software_config.json restored from backup. 
+ Backup preserved at: {{ backup_location }}/software_config.json + Restored to: {{ omnia_input_dir }}/software_config.json diff --git a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml new file mode 100644 index 0000000000..051bbfb13c --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml @@ -0,0 +1,192 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Validate backup_location is provided + ansible.builtin.fail: + msg: "backup_location must be provided to run network_spec.yml upgrade" + when: backup_location is not defined or (backup_location | string | trim) == "" + +- name: Ensure backup directory exists + ansible.builtin.file: + path: "{{ backup_location }}" + state: directory + mode: '0755' + +- name: Check if backup network_spec.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/network_spec.yml" + register: backup_network_spec_stat + +- name: Fail if backup network_spec.yml is not present + ansible.builtin.fail: + msg: "Backup network_spec.yml is not present at {{ backup_location }}/network_spec.yml" + when: not backup_network_spec_stat.stat.exists + +- name: Check if network_spec.yml exists + ansible.builtin.stat: + path: "{{ omnia_input_dir }}/network_spec.yml" + register: network_spec_stat + +- name: Fail if network_spec.yml is not present + ansible.builtin.fail: + msg: "network_spec.yml is not present at {{ omnia_input_dir }}/network_spec.yml" + when: not network_spec_stat.stat.exists + +- name: Read existing network_spec.yml + ansible.builtin.slurp: + src: "{{ omnia_input_dir }}/network_spec.yml" + register: network_spec_slurp + when: network_spec_stat.stat.exists + +- name: Parse existing network_spec.yml + ansible.builtin.set_fact: + network_spec_existing: "{{ network_spec_slurp.content | b64decode | from_yaml }}" + when: network_spec_stat.stat.exists + +- name: Check if network_spec.yml is already in Omnia 2.1 format + ansible.builtin.set_fact: + network_spec_already_21: >- + {{ + (network_spec_existing.schema_version | default('') | string) == '2.1' + and (network_spec_existing.Networks is defined) + and ((network_spec_existing.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) > 0) + }} + when: network_spec_stat.stat.exists + +- name: Skip transformation when network_spec.yml is already in 2.1 format + ansible.builtin.debug: + msg: 
"network_spec.yml is already in Omnia 2.1 format. Skipping transformation." + when: network_spec_already_21 | default(false) | bool + +- name: Read backup network_spec.yml (Omnia 2.0 source) + ansible.builtin.slurp: + src: "{{ backup_location }}/network_spec.yml" + register: backup_network_spec_slurp + when: not (network_spec_already_21 | default(false) | bool) + +- name: Parse backup network_spec.yml + ansible.builtin.set_fact: + backup_network_spec: "{{ backup_network_spec_slurp.content | b64decode | from_yaml }}" + when: not (network_spec_already_21 | default(false) | bool) + +- name: Extract admin_network and ib_network from backup file + ansible.builtin.set_fact: + admin_network: >- + {{ + (backup_network_spec.admin_network + if (backup_network_spec is mapping and backup_network_spec.admin_network is defined) + else + ( + (backup_network_spec.Networks | default([]) + | select('mapping') + | selectattr('admin_network', 'defined') + | map(attribute='admin_network') + | first + ) | default({}) + ) + ) + }} + ib_network: >- + {{ + (backup_network_spec.ib_network + if (backup_network_spec is mapping and backup_network_spec.ib_network is defined) + else + ( + (backup_network_spec.Networks | default([]) + | select('mapping') + | selectattr('ib_network', 'defined') + | map(attribute='ib_network') + | first + ) | default({}) + ) + ) + }} + when: + - not (network_spec_already_21 | default(false) | bool) + +- name: Render network_spec.yml in Omnia 2.1 format + ansible.builtin.template: + src: network_spec.j2 + dest: "{{ omnia_input_dir }}/network_spec.yml" + mode: '0644' + vars: + admin_network_netmask_bits: "{{ admin_network.netmask_bits | default('24') }}" + when: not (network_spec_already_21 | default(false) | bool) + +- name: Read transformed network_spec.yml + ansible.builtin.slurp: + src: "{{ omnia_input_dir }}/network_spec.yml" + register: network_spec_21_slurp + when: not (network_spec_already_21 | default(false) | bool) + +- name: Parse transformed 
network_spec.yml + ansible.builtin.set_fact: + network_spec_21: "{{ network_spec_21_slurp.content | b64decode | from_yaml }}" + when: not (network_spec_already_21 | default(false) | bool) + +- name: Validate YAML syntax of transformed network_spec.yml + ansible.builtin.command: + cmd: python3 -c "import yaml; yaml.safe_load(open('{{ omnia_input_dir }}/network_spec.yml','r'))" + register: network_spec_yaml_validation + changed_when: false + when: not (network_spec_already_21 | default(false) | bool) + +- name: Fail if YAML validation fails + ansible.builtin.fail: + msg: "YAML validation failed after transforming network_spec.yml" + when: + - not (network_spec_already_21 | default(false) | bool) + - network_spec_yaml_validation.rc != 0 + +- name: Ensure ib_network.netmask_bits matches admin_network.netmask_bits + ansible.builtin.fail: + msg: "ib_network.netmask_bits must match admin_network.netmask_bits in Omnia 2.1" + when: + - not (network_spec_already_21 | default(false) | bool) + - >- + (ib_network.netmask_bits | default(admin_network.netmask_bits | default('24')) | string) + != (admin_network.netmask_bits | default('24') | string) + +- name: Display backup path (no-op when skipped) + ansible.builtin.debug: + msg: "Using backup as input source: {{ backup_location }}/network_spec.yml (backup is not modified)" + when: not (network_spec_already_21 | default(false) | bool) + +- name: Validate mandatory ib_network is present in transformed output + ansible.builtin.fail: + msg: "ib_network is mandatory in Omnia 2.1 network_spec.yml" + when: + - not (network_spec_already_21 | default(false) | bool) + - >- + (network_spec_21.Networks is not defined) + or ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) == 0) + +- name: Validate mandatory ib_network.subnet is present in transformed output + ansible.builtin.fail: + msg: "ib_network.subnet is mandatory in Omnia 2.1 network_spec.yml" + when: + - not 
(network_spec_already_21 | default(false) | bool) + - >- + ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | map(attribute='ib_network') | first | default({})).subnet | default('') | string | trim) == '' + +- name: Display transformation summary + ansible.builtin.debug: + msg: | + network_spec.yml upgraded to Omnia 2.1 format. + Backup preserved at: {{ backup_location }}/network_spec.yml + Key changes: + - Added mandatory ib_network section + - primary_oim_bmc_ip treated as optional + - ib_network.netmask_bits aligned with admin_network.netmask_bits diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2 index 98d3073c0f..773a11446c 100644 --- a/upgrade/roles/import_input_parameters/templates/network_spec.j2 +++ b/upgrade/roles/import_input_parameters/templates/network_spec.j2 @@ -1,14 +1,55 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# This file is used to specify the network configuration. +# +# 'admin_network' is a mandatory field, essential for PXE boot and host communication. +# +# The 'admin_network' section contains the following variables: +# - 'oim_nic_name': The name of the interface on the OIM server associated with the admin network. +# - 'netmask_bits': The number of bits in the subnet mask. +# - 'primary_oim_admin_ip': The admin IP address of the OIM server which is configured. +# - 'primary_oim_bmc_ip': The iDRAC IP address of the OIM server, +# Mandatory only if idrac_telemetry is set to true and telemetry data needs to be collected from the OIM server. +# Optional — can be omitted if iDRAC telemetry for the OIM server is not required. +# - 'dynamic_range': The range of dynamic IP addresses available on the admin network. +# - 'dns': The list of external DNS server IP addresses for the admin network. +# - 'ntp_servers': The list of NTP servers for the admin network. Each NTP server entry should include: +# - 'address': The IP address or hostname of the NTP server. +# - 'type': The type of NTP entry, either 'server' or 'pool'. +# Example: +# ntp_servers: +# - { address: "172.16.10.80", type: "server" } + +# 'ib_network' is a mandatory field, essential for IB network configuration. +# The 'ib_network' section contains the following variables: +# - 'subnet': The subnet of the IB network. +# - 'netmask_bits': The number of bits in the subnet mask. This value must be the same as the admin_network netmask_bits. 
+ +Networks: +- admin_network: + oim_nic_name: "{{ admin_network.oim_nic_name | default('') }}" + netmask_bits: "{{ admin_network.netmask_bits | default('24') }}" + primary_oim_admin_ip: "{{ admin_network.primary_oim_admin_ip | default('') }}" +{% if (admin_network.primary_oim_bmc_ip is defined) and ((admin_network.primary_oim_bmc_ip | string | trim) != '') %} + primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip }}" +{% endif %} + dynamic_range: "{{ admin_network.dynamic_range | default('') }}" + dns: {{ admin_network.dns | default([]) }} + ntp_servers: {{ admin_network.ntp_servers | default([]) }} + +- ib_network: + subnet: "{{ ib_network.subnet | default('192.168.0.0') }}" + netmask_bits: "{{ ib_network.netmask_bits | default(admin_network_netmask_bits | default('24')) }}" diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index f4c5b1b7cb..3c44a98130 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -12,3 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- + +omnia_input_dir: /opt/omnia/input/project_default + +backup_location: /opt/omnia/backup/upgrade \ No newline at end of file From f656eb75aaa9e0d9373cd209a6296c1a380a239e Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 5 Feb 2026 17:49:53 +0530 Subject: [PATCH 036/172] Update main.yml --- upgrade/roles/import_input_parameters/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 3c44a98130..c44a5bbb87 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -15,4 +15,4 @@ omnia_input_dir: /opt/omnia/input/project_default -backup_location: /opt/omnia/backup/upgrade \ No newline at end of file +backup_location: /opt/omnia/backups/upgrade \ No newline at end of file From 12eed55a3eb56bdd6276947f34fc90cfac6dbb30 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 5 Feb 2026 18:10:07 +0530 Subject: [PATCH 037/172] validation for keys of confs --- .../common_utils/slurm_conf_utils.py | 47 +++++++++++++++- .../validation_flows/common_validation.py | 38 +++++++++---- common/library/modules/slurm_conf.py | 55 +++---------------- 3 files changed, 82 insertions(+), 58 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 0e59272815..8deb85febb 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -14,8 +14,9 @@ # These are the slurm options for version - 25.11 import re +import os from enum import Enum - +from collections import OrderedDict class SlurmParserEnum(str, Enum): """Enumeration of Slurm configuration parameter types for parsing and validation.""" @@ -545,6 +546,50 @@ class 
SlurmParserEnum(str, Enum): _HOSTLIST_RE = re.compile( r'^(?P[^\[\]]*)\[(?P[^\[\]]+)\](?P.*)$') +def get_invalid_keys(conf_dict, conf_name): + """Get invalid configuration keys by comparing against expected keys.""" + current_conf = all_confs.get(conf_name, {}) + # get difference between conf_dict keys and current_conf keys + diff = set(conf_dict.keys()).difference(set(current_conf.keys())) + return list(diff) + +def parse_slurm_conf(file_path, conf_name, validate): + """Parses the slurm.conf file and returns it as a dictionary.""" + current_conf = all_confs.get(conf_name, {}) + slurm_dict = OrderedDict() + + if not os.path.exists(file_path): + raise FileNotFoundError(f"{file_path} not found.") + + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + # handles any comment after the data + line = line.split('#')[0].strip() + if not line: + continue + # Split the line by one or more spaces + items = line.split() + tmp_dict = OrderedDict() + for item in items: + # Split only on the first '=' to allow '=' inside the value + key, value = item.split('=', 1) + tmp_dict[key.strip()] = value.strip() + skey = list(tmp_dict.keys())[0] + if validate and skey not in current_conf: + raise ValueError(f"Invalid key while parsing {file_path}: {skey}") + if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY: + slurm_dict[list(tmp_dict.keys())[0]] = list( + slurm_dict.get(list(tmp_dict.keys())[0], [])) + [tmp_dict] + elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV: + existing_values = [v.strip() for v in slurm_dict.get(skey, "").split(',') if v.strip()] + new_values = [v.strip() for v in tmp_dict[skey].split(',') if v.strip()] + slurm_dict[skey] = ",".join(list(dict.fromkeys(existing_values + new_values))) + elif current_conf.get(skey) == SlurmParserEnum.S_P_LIST: + slurm_dict[skey] = list(slurm_dict.get(skey, [])) + list(tmp_dict.values()) + else: + slurm_dict.update(tmp_dict) + + return slurm_dict def expand_hostlist(expr): """ diff --git 
a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 06f33be0e4..52fea1ced5 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -36,11 +36,14 @@ from ansible.module_utils.local_repo.software_utils import ( load_json, - load_yaml, get_subgroup_dict, get_software_names, get_json_file_path ) +from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import ( + parse_slurm_conf, + get_invalid_keys +) file_names = config.files create_error_msg = validation_utils.create_error_msg @@ -1058,16 +1061,29 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - # config_paths_list = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] - # for cfg_path_dict in config_paths_list: - # for k,v in cfg_path_dict.items(): - # if isinstance(v, str) and not os.path.exists(v): - # errors.append( - # create_error_msg( - # input_file_path, - # "slurm config_paths", - # f"config_path for {k} - {v} does not exist" - # )) + cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + for cfg_path_dict in cnfg_src: + for k,v in cfg_path_dict.items(): + if isinstance(v, str): + if not os.path.exists(v): + errors.append( + create_error_msg(input_file_path, "slurm_cluster config_sources", + f"provided conf path for {k} - {v} does not exist")) + else: # path and also exists + conf_dict = parse_slurm_conf(v, k, False) + # module.exit_json(failed=True, result=conf_dict) + invalid_keys = get_invalid_keys(conf_dict, k) + if invalid_keys: + errors.append( + create_error_msg(input_file_path, "slurm_cluster config_sources", + f"invalid keys found in {k} - {invalid_keys}")) + else: + invalid_keys = 
get_invalid_keys(v, k) + if invalid_keys: + errors.append( + create_error_msg(input_file_path, "slurm_cluster config_sources", + f"invalid keys found in {k} - {invalid_keys}")) + return errors diff --git a/common/library/modules/slurm_conf.py b/common/library/modules/slurm_conf.py index 9b9441e493..a782cb1f79 100644 --- a/common/library/modules/slurm_conf.py +++ b/common/library/modules/slurm_conf.py @@ -12,6 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +from collections import OrderedDict +from ansible.module_utils.basic import AnsibleModule +from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import ( + SlurmParserEnum, + all_confs, + parse_slurm_conf +) + DOCUMENTATION = r''' --- module: slurm_conf @@ -134,12 +143,6 @@ # - Hostlist expressions, split and merge computations -from collections import OrderedDict -from ansible.module_utils.basic import AnsibleModule -from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import SlurmParserEnum, all_confs -import os - - def read_dict2ini(conf_dict): """Convert a configuration dictionary to INI-style lines for slurm.conf.""" data = [] @@ -147,7 +150,6 @@ def read_dict2ini(conf_dict): if isinstance(v, list): for dct_item in v: if isinstance(dct_item, dict): - # TODO: Ordered dict, move the key to the top od = OrderedDict(dct_item) od.move_to_end(k, last=False) # Move k to the beginning data.append( @@ -159,45 +161,6 @@ def read_dict2ini(conf_dict): return data -def parse_slurm_conf(file_path, conf_name, validate): - """Parses the slurm.conf file and returns it as a dictionary.""" - current_conf = all_confs.get(conf_name, {}) - slurm_dict = OrderedDict() - - if not os.path.exists(file_path): - raise FileNotFoundError(f"{file_path} not found.") - - with open(file_path, 'r', encoding='utf-8') as f: - for line in f: - # handles any comment after the data - line = line.split('#')[0].strip() - if not line: - 
continue - # Split the line by one or more spaces - items = line.split() - tmp_dict = OrderedDict() - for item in items: - # Split only on the first '=' to allow '=' inside the value - key, value = item.split('=', 1) - tmp_dict[key.strip()] = value.strip() - skey = list(tmp_dict.keys())[0] - if validate and skey not in current_conf: - raise ValueError(f"Invalid key while parsing {file_path}: {skey}") - if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY: - slurm_dict[list(tmp_dict.keys())[0]] = list( - slurm_dict.get(list(tmp_dict.keys())[0], [])) + [tmp_dict] - elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV: - existing_values = [v.strip() for v in slurm_dict.get(skey, "").split(',') if v.strip()] - new_values = [v.strip() for v in tmp_dict[skey].split(',') if v.strip()] - slurm_dict[skey] = ",".join(list(dict.fromkeys(existing_values + new_values))) - elif current_conf.get(skey) == SlurmParserEnum.S_P_LIST: - slurm_dict[skey] = list(slurm_dict.get(skey, [])) + list(tmp_dict.values()) - else: - slurm_dict.update(tmp_dict) - - return slurm_dict - - def slurm_conf_dict_merge(conf_dict_list, conf_name): """Merge multiple Slurm configuration dictionaries into a single dictionary.""" merged_dict = OrderedDict() From 299fdaf8e4973c2e2bf889546bb5200d4d427c0e Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Thu, 5 Feb 2026 20:49:48 +0530 Subject: [PATCH 038/172] added code to fail when requested image version doesn't exist Signed-off-by: Katakam-Rakesh --- .../local_repo/container_repo_utils.py | 40 +++++++++++++++++-- .../local_repo/process_parallel.py | 14 +++++-- .../module_utils/local_repo/software_utils.py | 21 ++++++---- 3 files changed, 61 insertions(+), 14 deletions(-) diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py index d8d97465d8..914d7bff56 100644 --- a/common/library/module_utils/local_repo/container_repo_utils.py +++ 
b/common/library/module_utils/local_repo/container_repo_utils.py @@ -109,13 +109,47 @@ def sync_container_repository(repo_name, remote_name, package_content, logger): bool: True if the synchronization is successful, False otherwise. """ try: + logger.info(f"Getting repository version before sync for {repo_name}") + verify_command = pulp_container_commands["show_container_repo"] % repo_name + verify_result_before = execute_command(verify_command, logger, type_json=True) + + version_before = None + if verify_result_before and isinstance(verify_result_before, dict) and "stdout" in verify_result_before: + repo_data_before = verify_result_before["stdout"] + if isinstance(repo_data_before, dict): + version_before = repo_data_before.get("latest_version_href") + logger.info(f"Repository version before sync: {version_before}") + command = pulp_container_commands["sync_container_repository"] % (repo_name, remote_name) result = execute_command(command,logger) if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): + logger.error(f"Sync command failed for repository {repo_name}") return False - else: - result = create_container_distribution(repo_name,package_content,logger) - return result + + logger.info(f"Validating sync result for repository {repo_name}") + verify_result_after = execute_command(verify_command, logger, type_json=True) + + if verify_result_after and isinstance(verify_result_after, dict) and "stdout" in verify_result_after: + repo_data_after = verify_result_after["stdout"] + if isinstance(repo_data_after, dict): + version_after = repo_data_after.get("latest_version_href") + logger.info(f"Repository version after sync: {version_after}") + + if not version_after or version_after.endswith("/versions/0/"): + logger.error(f"Sync completed but no content was downloaded for {repo_name}. 
" + f"The specified image tag likely does not exist in the upstream registry.") + return False + + if version_before and version_after and version_before == version_after: + logger.error(f"Sync completed but repository version did not change for {repo_name}. " + f"Version remained at {version_after}. " + f"The specified image tag likely does not exist in the remote registry.") + return False + + logger.info(f"Sync validation successful: repository {repo_name} version changed from {version_before} to {version_after}") + + result = create_container_distribution(repo_name,package_content,logger) + return result except Exception as e: logger.error(f"Failed to synchronize repository {repo_name} with remote {remote_name}. Error: {e}") return False diff --git a/common/library/module_utils/local_repo/process_parallel.py b/common/library/module_utils/local_repo/process_parallel.py index cfc3beb920..74a24504b7 100644 --- a/common/library/module_utils/local_repo/process_parallel.py +++ b/common/library/module_utils/local_repo/process_parallel.py @@ -201,6 +201,13 @@ def execute_task(task, determine_function, user_data, version_variables, arc, with log_lock: logger.info(f"### {execute_task.__name__} start ###") # Log task start + # Build package display name with tag for images + package_display = task.get("package", "") + if task.get("type") == "image" and "tag" in task: + package_display = f"{package_display}:{task['tag']}" + elif task.get("type") == "image" and "digest" in task: + package_display = f"{package_display}:{task['digest']}" + # Determine the function and its arguments using the provided `determine_function` function, args = determine_function(task, repo_store_path, csv_file_path, user_data, version_variables, arc, user_registries, docker_username, docker_password) @@ -217,7 +224,7 @@ def execute_task(task, determine_function, user_data, version_variables, arc, ) return { "task": task, - "package": task.get("package", ""), # Extract package name if available + 
"package": package_display, "status": "TIMEOUT", "output": "", "error": f"Timeout reached after {elapsed_time:.2f}s" @@ -240,7 +247,7 @@ def execute_task(task, determine_function, user_data, version_variables, arc, return { "task": task, - "package": task.get("package", ""), + "package": package_display, "status": result.upper(), "output": result, "error": "" @@ -251,12 +258,11 @@ def execute_task(task, determine_function, user_data, version_variables, arc, logger.error(f"Task failed: {str(e)}") return { "task": task, - "package": task.get("package", ""), + "package": package_display, "status": "FAILED", "output": "", "error": str(e) # Include the error message } - def worker_process(task, determine_function, user_data, version_variables, arc, repo_store_path, csv_file_path, log_dir, result_queue, user_registries, docker_username, docker_password, timeout): diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index 6c78c51f3f..f1840be158 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -712,16 +712,21 @@ def get_new_packages_not_in_status(json_path, csv_path, subgroup_list,logger): raise for pkg in all_packages: - if pkg["type"] == "image": - pkg_prefix = pkg.get("package", "").strip() - prefix_found = any(name.startswith(f"{pkg_prefix}:") for name in names) - if not prefix_found: - new_packages.append(pkg) + # Check exact package:tag or package:digest combination + pkg_base = pkg.get("package", "").strip() + pkg_identifier = pkg_base + + if "tag" in pkg: + pkg_identifier += f":{pkg['tag']}" + elif "digest" in pkg: + pkg_identifier += f":{pkg['digest']}" + + if pkg_identifier not in names: + new_packages.append(pkg) else: if pkg.get("package") not in names: new_packages.append(pkg) - logger.info("New packages list: %s", new_packages) logger.info("Finished get_new_packages_not_in_status()") @@ -828,7 
+833,9 @@ def remove_duplicates_from_trans(trans): type_ = item.get("type") if type_ == "image": - key = (item.get("package"), item.get("tag")) + # Use digest if present, otherwise use tag + identifier = item.get("digest") or item.get("tag") + key = (item.get("package"), identifier) elif type_ == "pip_module": key = item.get("package") From f3e4050eaf03b058d4582f25533ea406808886d1 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Fri, 6 Feb 2026 08:22:33 +0530 Subject: [PATCH 039/172] updated omnia.sh for upgrade --- omnia.sh | 248 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 202 insertions(+), 46 deletions(-) diff --git a/omnia.sh b/omnia.sh index c997d2ff97..358cde2162 100755 --- a/omnia.sh +++ b/omnia.sh @@ -52,6 +52,36 @@ is_local_ip() { fi } +OMNIA_BASE_DIR="/opt/omnia" +OMNIA_INPUT_DIR="/opt/omnia/input" +OMNIA_BACKUPS_DIR="/opt/omnia/backups" +OMNIA_METADATA_DIR="/opt/omnia/.data" +OMNIA_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml" + +update_metadata_upgrade_backup_dir() { + local backup_dir="$1" + + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running" + return 1 + fi + + podman exec -u root omnia_core bash -c " + set -e + if [ ! -f '$OMNIA_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^upgrade_backup_dir:' '$OMNIA_METADATA_FILE'; then + sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$OMNIA_METADATA_FILE' + else + echo 'upgrade_backup_dir: ${backup_dir}' >> '$OMNIA_METADATA_FILE' + fi + " +} + + + check_internal_nfs_export() { nfs_server_ip=$1 nfs_server_share_path=$2 @@ -757,9 +787,9 @@ EOF # Create the .data directory if it does not exist. # This is where the oim_metadata.yml file is stored. 
echo -e "${GREEN} Creating the .data directory if it does not exist.${NC}" - mkdir -p "$omnia_path/omnia/.data" + mkdir -p "$OMNIA_METADATA_DIR" - oim_metadata_file="$omnia_path/omnia/.data/oim_metadata.yml" + oim_metadata_file="$OMNIA_METADATA_FILE" if [ ! -f "$oim_metadata_file" ]; then echo -e "${GREEN} Creating oim_metadata file${NC}" @@ -811,7 +841,7 @@ EOF if ! podman ps --format '{{.Names}}' | grep -qw "$container_name"; then echo -e "${RED}Error: $container_name container failed to start.${NC}" - rm -rf "$omnia_path/omnia/.data/oim_metadata.yml" + rm -rf "$OMNIA_METADATA_FILE" exit 1 fi @@ -832,17 +862,17 @@ post_setup_config() { chmod 757 "$omnia_path/omnia/tmp/.ansible/tmp" # Create the input directory if it does not exist. echo -e "${GREEN} Creating the input directory if it does not exist.${NC}" - mkdir -p "$omnia_path/omnia/input/" + mkdir -p "$OMNIA_INPUT_DIR/" # Create the default.yml file if it does not exist. # This file contains the name of the project. - if [ ! -f "$omnia_path/omnia/input/default.yml" ]; then + if [ ! -f "$OMNIA_INPUT_DIR/default.yml" ]; then echo -e "${BLUE} Creating default.yml file.${NC}" { echo "# This file defines the project name." echo "# The name of the project should be set in a directory under input." 
echo "project_name: project_default" - } >> "$omnia_path/omnia/input/default.yml" + } >> "$OMNIA_INPUT_DIR/default.yml" fi # Copy input files from /omnia to /opt/omnia/project_default/ inside omnia_core container @@ -925,16 +955,17 @@ start_container_session() { } show_help() { - echo "Usage: $0 [--install | --uninstall | --version | --help]" + echo "Usage: $0 [--install | --uninstall | --upgrade | --version | --help]" echo " -i, --install Install and start the Omnia core container" echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" + echo " --upgrade Upgrade the Omnia core container from image tag 1.0 to 1.1" echo " -v, --version Display Omnia version information" echo " -h, --help More information about usage" } install_omnia_core() { local omnia_core_tag="1.1" - local omnia_core_registry="docker.io/dellhpcomniaaisolution" + local omnia_core_registry="" # Check if local omnia_core:1.1 exists if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then @@ -945,44 +976,20 @@ install_omnia_core() { # Tag it as 1.1 for consistency podman tag omnia_core:latest omnia_core:${omnia_core_tag} else - # Try pulling from Docker Hub with retry logic - echo -e "${BLUE}Omnia core image not found locally. Attempting to pull from Docker Hub...${NC}" - pull_success=false - max_retries=3 - retry_count=0 - - while [ $retry_count -lt $max_retries ]; do - retry_count=$((retry_count + 1)) - echo -e "${BLUE}Attempt $retry_count of $max_retries...${NC}" - - if podman pull ${omnia_core_registry}/omnia_core:${omnia_core_tag} 2>/dev/null; then - echo -e "${GREEN}✓ Successfully pulled omnia_core:${omnia_core_tag} from Docker Hub.${NC}" - # Tag it without registry prefix for local use - podman tag ${omnia_core_registry}/omnia_core:${omnia_core_tag} omnia_core:${omnia_core_tag} - pull_success=true - break - else - if [ $retry_count -lt $max_retries ]; then - echo -e "${YELLOW}Pull failed. 
Retrying in 5 seconds...${NC}" - sleep 5 - fi - fi - done - - if [ "$pull_success" = false ]; then - echo -e "${RED}ERROR: Failed to pull omnia_core image after $max_retries attempts.${NC}" - echo "" - echo -e "${YELLOW}To resolve this, please follow these steps:${NC}" - echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container" - echo -e "2. Navigate to the repository directory:" - echo -e " cd omnia-artifactory" - echo -e "3. Build the core image locally:" - echo -e " ./build_images.sh core omnia_branch=" - echo -e "4. After building the image, re-run this script:" - echo -e " ./omnia.sh --install" - exit 1 - fi + echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}" + echo -e "${YELLOW}Omnia no longer pulls images from Docker Hub. Build/load the image locally and retry.${NC}" + echo "" + echo -e "${YELLOW}One way to build the image locally:${NC}" + echo -e "1. Clone the Omnia Artifactory repository:" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container" + echo -e "2. Navigate to the repository directory:" + echo -e " cd omnia-artifactory" + echo -e "3. Build the core image locally (loads into local Podman by default):" + echo -e " ./build_images.sh core omnia_branch=" + echo "" + echo -e "${YELLOW}Then re-run:${NC}" + echo -e " ./omnia.sh --install" + exit 1 fi # Check if any other containers with 'omnia' in their name are running @@ -1139,6 +1146,152 @@ display_version() { exit 0 } +phase1_validate() { + local current_image + local core_config + local previous_omnia_version + local shared_path + + echo "[INFO] [ORCHESTRATOR] Phase 1: Pre-Upgrade Validation" + + if [ "$(id -u)" -ne 0 ]; then + if ! sudo -n true >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: run as root or configure passwordless sudo" + return 1 + fi + fi + + if ! 
podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running" + return 1 + fi + + core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml' 2>/dev/null) + if [ -z "$core_config" ]; then + echo "[ERROR] [ORCHESTRATOR] Unable to read oim_metadata.yml from omnia_core container" + return 1 + fi + + previous_omnia_version=$(echo "$core_config" | grep "^omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r') + if [ -z "$previous_omnia_version" ]; then + echo "[ERROR] [ORCHESTRATOR] omnia_version not found in oim_metadata.yml" + return 1 + fi + + if [ "$previous_omnia_version" != "2.0.0.0" ]; then + echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version" + return 1 + fi + + shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -z "$shared_path" ]; then + echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml" + return 1 + fi + + omnia_path="$shared_path" + + if [ ! -d "$omnia_path" ]; then + echo "[ERROR] [ORCHESTRATOR] Shared path from metadata does not exist on host: $omnia_path" + return 1 + fi + + if [ ! -w "$omnia_path" ]; then + echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on shared path: $omnia_path" + return 1 + fi + + current_image=$(podman inspect omnia_core --format '{{.ImageName}}' 2>/dev/null) + if [ -z "$current_image" ]; then + echo "[ERROR] [ORCHESTRATOR] Unable to inspect omnia_core container image" + return 1 + fi + + if ! echo "$current_image" | grep -qE '(:|@)1\.0(\b|$)'; then + echo "[ERROR] [ORCHESTRATOR] Container version mismatch: expected 1.0, got: $current_image" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0.0.0)" + + + if ! 
podman inspect "omnia_core:1.1" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" + echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed" + return 0 +} + +phase2_approval() { + local backup_base default_backup_dir + + echo "[INFO] [ORCHESTRATOR] Phase 2: Approval Gate" + echo "============================================" + echo "OMNIA UPGRADE SUMMARY" + echo "============================================" + echo "Current Container Tag: 1.0" + echo "Target Container Tag: 1.1" + echo "Current Omnia Release: 2.0.0.0" + echo "Target Omnia Release: 2.1.0.0" + echo "New Features:" + echo " - Add and remove node for slurm cluster" + echo " - Additional Package Installation" + echo "============================================" + + default_backup_dir="$OMNIA_BACKUPS_DIR/upgrade" + backup_base="$default_backup_dir" + + echo "[INFO] [ORCHESTRATOR] Backup destination: $backup_base" + + if ! update_metadata_upgrade_backup_dir "$backup_base"; then + echo "[ERROR] [ORCHESTRATOR] Failed to update upgrade backup directory in metadata" + return 1 + fi + + read -p "Proceed with upgrade? (y/N): " confirm + if [ "$confirm" != "y" ] && [ "$confirm" != "Y" ]; then + echo "[INFO] [ORCHESTRATOR] Upgrade cancelled by user" + return 1 + fi + + OMNIA_UPGRADE_BACKUP_PATH="$backup_base" + export OMNIA_UPGRADE_BACKUP_PATH + + echo "[INFO] [ORCHESTRATOR] Phase 2: Approval granted" + return 0 +} + +upgrade_omnia_core() { + local lock_file="/var/lock/omnia_core_upgrade.lock" + + if [ -e "$lock_file" ]; then + echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. 
Another upgrade may be running.${NC}" + exit 1 + fi + + mkdir -p "$(dirname "$lock_file")" 2>/dev/null || true + echo "$$" > "$lock_file" || { + echo -e "${RED}ERROR: Failed to create lock file: $lock_file${NC}" + exit 1 + } + trap 'rm -f "$lock_file"' EXIT + + if ! phase1_validate; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" + exit 1 + fi + + if ! phase2_approval; then + exit 0 + fi + + echo "[INFO] [ORCHESTRATOR] Upgrade tasks for backup and container swap are deferred to a follow-up PR" + exit 0 +} + # Main function to check if omnia_core container is already running. # If yes, ask the user if they want to enter the container or reinstall. # If no, set it up. @@ -1150,6 +1303,9 @@ main() { --uninstall|-u) cleanup_omnia_core ;; + --upgrade) + upgrade_omnia_core + ;; --version|-v) display_version ;; From 9bb967a19cc7dec8a879a269461b74b5b015569d Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 6 Feb 2026 05:02:47 +0000 Subject: [PATCH 040/172] login-nodes directory creation --- discovery/roles/slurm_config/tasks/create_slurm_dir.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 9ce43dcd6a..35fe7910b0 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -114,7 +114,7 @@ - "{{ (ctld_list + cmpt_list + login_list + compiler_login_list) | product(common_dir) }}" - "{{ ctld_list | product(ctld_dir) }}" - "{{ dbd_list | product(db_dir) }}" - - "{{ cmpt_list | product(cmpt_dir) }}" + - "{{ ( cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}" loop_control: loop_var: product From c519d6e81a9c40ef008e516c143f25e032cbe90d Mon Sep 17 00:00:00 2001 From: balajikumaran-c-s Date: Fri, 6 Feb 2026 08:19:00 +0000 Subject: [PATCH 041/172] Add pulp and openchami image pull prereqs --- prepare_oim/prepare_oim.yml | 11 ++++++ 
.../openchami/tasks/deployment_prereq.yml | 30 ++++++++++++++++ .../deploy_containers/openchami/vars/main.yml | 36 +++++++++++++++++++ .../pulp/tasks/deployment_prereq.yml | 14 ++++++++ .../deploy_containers/pulp/vars/main.yml | 4 +++ 5 files changed, 95 insertions(+) create mode 100644 prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index 49bead531f..a78d21e8d9 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -97,6 +97,17 @@ name: deploy_containers/openchami # noqa:role-name[path] tasks_from: verify_openchami.yml +- name: OpenCHAMI deployment prereq + hosts: oim + connection: ssh + gather_facts: false + tags: openchami + tasks: + - name: Pull OpenCHAMI images + ansible.builtin.include_role: + name: deploy_containers/openchami # noqa:role-name[path] + tasks_from: deployment_prereq.yml + - name: Deploy the openchami container hosts: localhost connection: local diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml new file mode 100644 index 0000000000..109bc725f3 --- /dev/null +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml @@ -0,0 +1,30 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Pull OpenCHAMI images using Podman + ansible.builtin.command: + cmd: "podman pull {{ item }}" + loop: "{{ openchami_images }}" + register: pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: pull_result.rc == 0 + changed_when: false + +- name: Fail if any OpenCHAMI image pull failed + ansible.builtin.fail: + msg: "Failed to pull OpenCHAMI image: {{ item.item }}. Error: {{ item.stderr }}" + loop: "{{ pull_result.results }}" + when: item.rc != 0 diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 6d0848e0af..2d7db2ca85 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -36,5 +36,41 @@ data_oci_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/oci" data_s3_dir: "{{ oim_shared_path }}/omnia/openchami/s3/data/s3" s3_work_dir: "{{ oim_shared_path }}/omnia/openchami/s3" +# Usage: deploy_openchami.yml - pull openchami images +pull_image_retries: 5 +pull_image_delay: 10 + +# OpenCHAMI image tags +openchami_local_ca_tag: "v0.2.2" +openchami_opaal_tag: "v0.3.10" +openchami_smd_tag: "v2.18.0" +openchami_bss_tag: "v1.32.0" +openchami_cloud_init_tag: "v1.2.3" +openchami_coredhcp_tag: "v0.3.0" +# Third-party image tags for OpenCHAMI +minio_tag: "latest" +postgres_tag: "11.5-alpine" +hydra_tag: "v2.3" +haproxy_tag: "latest" +registry_tag: "latest" +curl_tag: "latest" +acme_tag: "3.1.1" + +# OpenCHAMI images list for podman pull on OIM +openchami_images: + - "ghcr.io/openchami/local-ca:{{ openchami_local_ca_tag }}" + - "ghcr.io/openchami/opaal:{{ openchami_opaal_tag }}" + - "ghcr.io/openchami/smd:{{ openchami_smd_tag }}" + - "ghcr.io/openchami/bss:{{ openchami_bss_tag }}" + - "ghcr.io/openchami/cloud-init:{{ openchami_cloud_init_tag }}" + - "ghcr.io/openchami/coredhcp:{{ openchami_coredhcp_tag }}" + - "docker.io/minio/minio:{{ minio_tag }}" + - 
"docker.io/library/postgres:{{ postgres_tag }}" + - "docker.io/oryd/hydra:{{ hydra_tag }}" + - "cgr.dev/chainguard/haproxy:{{ haproxy_tag }}" + - "docker.io/library/registry:{{ registry_tag }}" + - "cgr.dev/chainguard/curl:{{ curl_tag }}" + - "docker.io/neilpang/acme.sh:{{ acme_tag }}" + # Usage: verify_openchami.yml cluster_env_key: "{{ oim_node_name | upper }}_ACCESS_TOKEN" diff --git a/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml b/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml index 4ae77823a0..09ec52e6a4 100644 --- a/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml +++ b/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml @@ -38,6 +38,20 @@ when: hostname_enabled no_log: true +- name: Pull Pulp image using Podman + ansible.builtin.command: + cmd: "podman pull {{ pulp_image }}" + register: pulp_pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: pulp_pull_result is not failed + changed_when: false + +- name: Fail if Pulp image pull failed + ansible.builtin.fail: + msg: "Failed to pull Pulp image: {{ pulp_image }}. 
Error: {{ pulp_pull_result.stderr }}" + when: pulp_pull_result.rc != 0 + - name: Invoke Pulp Container Deployment Tasks for HTTP ansible.builtin.include_tasks: deploy_pulp_container_http.yml when: not pulp_protocol_https diff --git a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml index 5613c13055..26dbec2dae 100644 --- a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml @@ -27,6 +27,10 @@ pulp_protocol_https: true # Tag is fixed for the Pulp container image as of 10-06-2025 pulp_image: "docker.io/pulp/pulp:3.80" +# Usage: deployment_prereq.yml - pull image retries +pull_image_retries: 5 +pull_image_delay: 10 + arg_list: - "-e PULP_WORKERS=10" - "-e PULP_API_WORKERS=10" From 799439c2ee673a769928c7ac7acad1f7cba46373 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Fri, 6 Feb 2026 14:25:38 +0530 Subject: [PATCH 042/172] Fix issue when slurm cluster not active --- .../slurm_config/tasks/check_ctld_running.yml | 70 ++++++++++--------- discovery/roles/slurm_config/tasks/confs.yml | 13 +++- .../slurm_config/tasks/create_slurm_dir.yml | 2 +- discovery/roles/slurm_config/vars/main.yml | 1 - 4 files changed, 49 insertions(+), 37 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 5f89e051b8..dacd879bf7 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -- name: Initialize ctld_state dict - ansible.builtin.set_fact: - ctld_state: "{{ ctld_state | default({}) | combine({item: false}) }}" - - name: Check if remote host is reachable via SSH ansible.builtin.wait_for: host: "{{ item }}" @@ -24,38 +20,44 @@ state: started delegate_to: localhost register: ssh_check + ignore_errors: true -- name: Check if slurmctld is running on remote host - ansible.builtin.service_facts: - delegate_to: "{{ item }}" - register: service_facts +- name: Block when ssh_check is success when: ssh_check is success + block: + - name: Initialize ctld_state dict + ansible.builtin.set_fact: + ctld_state: "{{ ctld_state | default({}) | combine({item: false}) }}" -- name: Update ctld_state if slurmctld is running - ansible.builtin.set_fact: - ctld_state: "{{ ctld_state | combine({item: true}) }}" - when: - - ssh_check is success - - service_facts is success - - ansible_facts.services['slurmctld.service'] is defined - - ansible_facts.services['slurmctld.service'].state == 'running' + - name: Check if slurmctld is running on remote host + ansible.builtin.service_facts: + delegate_to: "{{ item }}" + register: service_facts -- name: Update /etc/hosts with controller hostname and IP - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '^{{ ip.value }}\s+{{ ip.key }}' - line: "{{ ip.value }} {{ ip.key }}" - state: present - loop: "{{ ip_name_map | dict2items }}" - loop_control: - loop_var: ip - delegate_to: "{{ item }}" - when: ssh_check is success + - name: Update ctld_state if slurmctld is running + ansible.builtin.set_fact: + ctld_state: "{{ ctld_state | combine({item: true}) }}" + when: + - service_facts is success + - ansible_facts.services['slurmctld.service'] is defined + - ansible_facts.services['slurmctld.service'].state == 'running' + + - name: Update /etc/hosts with controller hostname and IP + ansible.builtin.lineinfile: + path: /etc/hosts + regexp: '^{{ ip.value }}\s+{{ ip.key }}' + line: "{{ ip.value }} {{ ip.key }}" + state: 
present + loop: "{{ ip_name_map | dict2items }}" + loop_control: + loop_var: ip + delegate_to: "{{ item }}" -- name: Trigger the scontrol reconfigure - ansible.builtin.command: scontrol reconfigure - changed_when: scontrol_reconfig.rc == 0 - failed_when: false - register: scontrol_reconfig - delegate_to: "{{ item }}" - when: ctld_state[item] is true + - name: Trigger the scontrol reconfigure + ansible.builtin.command: scontrol reconfigure + changed_when: scontrol_reconfig.rc == 0 + failed_when: false + register: scontrol_reconfig + delegate_to: "{{ item }}" + when: + - ctld_state[item] is true diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 33315709cc..1ff30acf34 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -86,12 +86,19 @@ loop_var: existing_conf_set register: prepared_conf_lists +# All the updates to the confs follow after this point before merge - name: Prepend ClusterName and SlurmctldHost to slurm conf sources ansible.builtin.set_fact: # TODO: Change order if needed conf_merge_dict: "{{ conf_merge_dict - | combine({'slurm': [{'ClusterName': cluster_name, 'SlurmctldHost': ctld_list}] + conf_merge_dict['slurm']}) }}" + | combine({'slurm': [{'ClusterName': cluster_name, 'AccountingStorageHost': dbd_list[0], 'SlurmctldHost': ctld_list}] + conf_merge_dict['slurm']}) }}" when: "'slurm' in conf_merge_dict" +- name: Slurm dbd - DbdHost and StorageHost + ansible.builtin.set_fact: + conf_merge_dict: "{{ conf_merge_dict + | combine({'slurmdbd': [{'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}] + conf_merge_dict['slurmdbd']}) }}" + when: "'slurmdbd' in conf_merge_dict" + - name: Merge the confs slurm_conf: op: merge @@ -141,6 +148,10 @@ loop_control: loop_var: product +- name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS + ansible.builtin.set_fact: + conf_server: "--conf-server {{ ctld_list | map('regex_replace', 
'$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}" + - name: Create backup directory with timestamp ansible.builtin.file: path: "{{ backup_dir }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 35fe7910b0..81a08adfca 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -114,7 +114,7 @@ - "{{ (ctld_list + cmpt_list + login_list + compiler_login_list) | product(common_dir) }}" - "{{ ctld_list | product(ctld_dir) }}" - "{{ dbd_list | product(db_dir) }}" - - "{{ ( cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}" + - "{{ (cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}" loop_control: loop_var: product diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 3a8c43ad93..9722725a88 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -90,7 +90,6 @@ common_mode: "0755" slurm_dbd_mode: "0600" slurm_db_cnf_mode: "0600" dbd_slurm_conf: - AccountingStorageHost: "{{ dbd_list[0] }}" AccountingStoragePort: "{{ slurm_dbd_port }}" AccountingStorageType: accounting_storage/slurmdbd partition_params: From 61dea066b0fec79c1a7dcd91c9805cf0cfc993dc Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Fri, 6 Feb 2026 14:57:17 +0530 Subject: [PATCH 043/172] improve container images validation Signed-off-by: Katakam-Rakesh --- .../local_repo/container_repo_utils.py | 78 +++++++++++++++++-- .../module_utils/local_repo/download_image.py | 4 +- 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py index 914d7bff56..3b8eb29662 100644 --- a/common/library/module_utils/local_repo/container_repo_utils.py +++ 
b/common/library/module_utils/local_repo/container_repo_utils.py @@ -98,13 +98,15 @@ def create_container_distribution(repo_name,package_content,logger): logger.error(f"Error creating distribution {repo_name}: {e}") return False -def sync_container_repository(repo_name, remote_name, package_content, logger): +def sync_container_repository(repo_name, remote_name, package_content, logger, tag=None): """ Synchronizes and distribute container repository with a remote. Args: repo_name (str): The name of the repository. remote_name (str): The name of the remote. package_content (str): Upstream name. + logger: Logger instance. + tag (str, optional): The tag to validate in repository content. Returns: bool: True if the synchronization is successful, False otherwise. """ @@ -141,10 +143,76 @@ def sync_container_repository(repo_name, remote_name, package_content, logger): return False if version_before and version_after and version_before == version_after: - logger.error(f"Sync completed but repository version did not change for {repo_name}. " - f"Version remained at {version_after}. " - f"The specified image tag likely does not exist in the remote registry.") - return False + # Check if tag actually exists using precise Pulp commands + try: + # Step 1: Get distribution to find repository href + dist_command = f"pulp container distribution show --name {repo_name}" + dist_result = execute_command(dist_command, logger, type_json=True) + + if not dist_result or not isinstance(dist_result, dict) or "stdout" not in dist_result: + logger.error(f"Failed to get distribution info for {repo_name}. Assuming tag doesn't exist.") + return False + + dist_data = dist_result["stdout"] + if not isinstance(dist_data, dict) or "repository" not in dist_data: + logger.error(f"Invalid distribution data for {repo_name}. 
Assuming tag doesn't exist.") + return False + + repo_href = dist_data["repository"] + logger.info(f"Found repository href: {repo_href}") + + # Step 2: Get repository version href + repo_command = f"pulp container repository show --href {repo_href}" + repo_result = execute_command(repo_command, logger, type_json=True) + + if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result: + logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.") + return False + + repo_data = repo_result["stdout"] + if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data: + logger.error(f"Invalid repository data for {repo_href}. Assuming tag doesn't exist.") + return False + + repo_ver_href = repo_data["latest_version_href"] + logger.info(f"Found repository version href: {repo_ver_href}") + + # Step 3: Check if tag exists in content + tags_command = f"pulp show --href '/pulp/api/v3/content/container/tags/?repository_version={repo_ver_href}'" + tags_result = execute_command(tags_command, logger, type_json=True) + + if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result: + logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.") + return False + + tags_data = tags_result["stdout"] + if not isinstance(tags_data, dict) or "results" not in tags_data: + logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.") + return False + + tags = tags_data["results"] + tag_exists = False + + # Use the tag parameter if provided, otherwise fall back to checking package_content + tag_to_check = tag if tag else package_content + + for tag_item in tags: + if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check: + tag_exists = True + break + + if tag_exists: + logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. 
No sync needed - image is already available.") + else: + logger.error(f"Sync completed but repository version did not change for {repo_name}. " + f"Version remained at {version_after}. " + f"Tag '{tag_to_check}' does not exist in Pulp repository content. " + f"This indicates the tag likely does not exist in the upstream registry.") + return False + + except Exception as e: + logger.error(f"Error checking repository tag existence: {e}. Assuming tag doesn't exist.") + return False logger.info(f"Sync validation successful: repository {repo_name} version changed from {version_before} to {version_after}") diff --git a/common/library/module_utils/local_repo/download_image.py b/common/library/module_utils/local_repo/download_image.py index ffc5518177..98a1cb5b66 100644 --- a/common/library/module_utils/local_repo/download_image.py +++ b/common/library/module_utils/local_repo/download_image.py @@ -345,8 +345,10 @@ def process_image(package, status_file_path, version_variables, raise Exception(f"Failed to create remote: {remote_name}") # Sync and distribute + # Pass tag_val if it exists (for tag-based images), otherwise None (for digest-based images) + tag_to_pass = tag_val if "tag" in package else None result = sync_container_repository( - repository_name, remote_name, package_content, logger + repository_name, remote_name, package_content, logger, tag=tag_to_pass ) if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): raise Exception(f"Failed to sync repository: {repository_name}") From 24acd8ce980222ef66ad1a660a5db48875c5fc6b Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 6 Feb 2026 15:29:29 +0530 Subject: [PATCH 044/172] Upgrade of template handling logic and high_availability_config.yml Addition of upgrade logic for high_availability_config.yml and template handling logic --- .../import_input_parameters/tasks/main.yml | 11 +- .../tasks/precheck_backup_location.yml | 25 ++++ .../tasks/restore_input_files.yml | 25 ++++ 
.../tasks/restore_pxe_mapping_file.yml | 49 -------- .../tasks/restore_single_input_file.yml | 54 +++++++++ .../tasks/restore_software_config.yml | 60 --------- .../transform_high_availability_config.yml | 114 ++++++++++++++++++ .../tasks/transform_network_spec.yml | 89 +++----------- .../templates/high_availability_config.j2 | 27 +++++ .../import_input_parameters/vars/main.yml | 52 +++++++- upgrade/upgrade_oim.yml | 1 + 11 files changed, 323 insertions(+), 184 deletions(-) create mode 100644 upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_input_files.yml delete mode 100644 upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml delete mode 100644 upgrade/roles/import_input_parameters/tasks/restore_software_config.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml create mode 100644 upgrade/roles/import_input_parameters/templates/high_availability_config.j2 diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml index af45a1de1b..7687f852bb 100644 --- a/upgrade/roles/import_input_parameters/tasks/main.yml +++ b/upgrade/roles/import_input_parameters/tasks/main.yml @@ -13,11 +13,14 @@ # limitations under the License. 
--- +- name: Validate backup location for upgrade input processing + ansible.builtin.include_tasks: precheck_backup_location.yml + - name: Transform network_spec.yml from Omnia 2.0 to 2.1 ansible.builtin.include_tasks: transform_network_spec.yml -- name: Restore software_config.json from backup - ansible.builtin.include_tasks: restore_software_config.yml +- name: Transform high_availability_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_high_availability_config.yml -- name: Restore pxe_mapping_file.csv from backup - ansible.builtin.include_tasks: restore_pxe_mapping_file.yml +- name: Restore input files from backup + ansible.builtin.include_tasks: restore_input_files.yml diff --git a/upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml new file mode 100644 index 0000000000..fe058f83a9 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/precheck_backup_location.yml @@ -0,0 +1,25 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Validate backup_location is provided + ansible.builtin.fail: + msg: "{{ msg_backup_location_missing }}" + when: backup_location is not defined or (backup_location | string | trim) == "" + +- name: Ensure backup directory exists + ansible.builtin.file: + path: "{{ backup_location }}" + state: directory + mode: "{{ backup_dir_mode }}" diff --git a/upgrade/roles/import_input_parameters/tasks/restore_input_files.yml b/upgrade/roles/import_input_parameters/tasks/restore_input_files.yml new file mode 100644 index 0000000000..3dd6d45206 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_input_files.yml @@ -0,0 +1,25 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Validate restore_input_files is defined + ansible.builtin.set_fact: + restore_input_files_effective: "{{ restore_input_files | default([]) }}" + +- name: Restore input files from backup (overwrite target) + ansible.builtin.include_tasks: restore_single_input_file.yml + loop: "{{ restore_input_files_effective }}" + loop_control: + loop_var: restore_item + when: (restore_input_files_effective | length) > 0 diff --git a/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml b/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml deleted file mode 100644 index f468359305..0000000000 --- a/upgrade/roles/import_input_parameters/tasks/restore_pxe_mapping_file.yml +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Validate backup_location is provided - ansible.builtin.fail: - msg: "backup_location must be provided to restore pxe_mapping_file.csv" - when: backup_location is not defined or (backup_location | string | trim) == "" - -- name: Ensure backup directory exists - ansible.builtin.file: - path: "{{ backup_location }}" - state: directory - mode: '0755' - -- name: Check if backup pxe_mapping_file.csv exists - ansible.builtin.stat: - path: "{{ backup_location }}/pxe_mapping_file.csv" - register: backup_pxe_mapping_stat - -- name: Fail if backup pxe_mapping_file.csv is not present - ansible.builtin.fail: - msg: "Backup pxe_mapping_file.csv is not present at {{ backup_location }}/pxe_mapping_file.csv" - when: not backup_pxe_mapping_stat.stat.exists - -- name: Overwrite pxe_mapping_file.csv in input directory from backup - ansible.builtin.copy: - src: "{{ backup_location }}/pxe_mapping_file.csv" - dest: "{{ omnia_input_dir }}/pxe_mapping_file.csv" - mode: '0644' - remote_src: true - -- name: Display pxe_mapping_file.csv restore summary - ansible.builtin.debug: - msg: | - pxe_mapping_file.csv restored from backup. - Backup preserved at: {{ backup_location }}/pxe_mapping_file.csv - Restored to: {{ omnia_input_dir }}/pxe_mapping_file.csv diff --git a/upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml b/upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml new file mode 100644 index 0000000000..f55d14bd3e --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_single_input_file.yml @@ -0,0 +1,54 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Validate restore item fields + ansible.builtin.fail: + msg: "{{ msg_restore_item_name_missing }}" + when: restore_item.name is not defined or (restore_item.name | string | trim) == "" + +- name: Check if backup file exists + ansible.builtin.stat: + path: "{{ backup_location }}/{{ restore_item.name }}" + register: restore_backup_stat + +- name: Fail if backup file is not present + ansible.builtin.fail: + msg: "{{ msg_backup_file_missing }}" + when: not restore_backup_stat.stat.exists + +- name: Overwrite input file from backup + ansible.builtin.copy: + src: "{{ backup_location }}/{{ restore_item.name }}" + dest: "{{ input_project_dir }}/{{ restore_item.name }}" + mode: "{{ restore_item.mode | default(default_file_mode) }}" + remote_src: true + +- name: Validate restored file (optional) + ansible.builtin.command: + cmd: "{{ restore_item.validate_cmd }}" + register: restore_validate + changed_when: false + when: restore_item.validate_cmd is defined and (restore_item.validate_cmd | string | trim) != "" + +- name: Fail if restored file validation fails + ansible.builtin.fail: + msg: "{{ msg_validation_failed }}" + when: + - restore_item.validate_cmd is defined and (restore_item.validate_cmd | string | trim) != "" + - restore_validate.rc != 0 + +- name: Display restore summary + ansible.builtin.debug: + msg: "{{ msg_restore_summary }}" diff --git a/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml b/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml deleted file mode 100644 index 9891023702..0000000000 
--- a/upgrade/roles/import_input_parameters/tasks/restore_software_config.yml +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Validate backup_location is provided - ansible.builtin.fail: - msg: "backup_location must be provided to restore software_config.json" - when: backup_location is not defined or (backup_location | string | trim) == "" - -- name: Ensure backup directory exists - ansible.builtin.file: - path: "{{ backup_location }}" - state: directory - mode: '0755' - -- name: Check if backup software_config.json exists - ansible.builtin.stat: - path: "{{ backup_location }}/software_config.json" - register: backup_software_config_stat - -- name: Fail if backup software_config.json is not present - ansible.builtin.fail: - msg: "Backup software_config.json is not present at {{ backup_location }}/software_config.json" - when: not backup_software_config_stat.stat.exists - -- name: Overwrite software_config.json in input directory from backup - ansible.builtin.copy: - src: "{{ backup_location }}/software_config.json" - dest: "{{ omnia_input_dir }}/software_config.json" - mode: '0644' - remote_src: true - -- name: Validate JSON syntax of software_config.json - ansible.builtin.command: - cmd: python3 -m json.tool "{{ omnia_input_dir }}/software_config.json" - register: software_config_json_validation - changed_when: false - -- name: Fail if 
software_config.json JSON validation fails - ansible.builtin.fail: - msg: "JSON validation failed after restoring software_config.json" - when: software_config_json_validation.rc != 0 - -- name: Display software_config.json restore summary - ansible.builtin.debug: - msg: | - software_config.json restored from backup. - Backup preserved at: {{ backup_location }}/software_config.json - Restored to: {{ omnia_input_dir }}/software_config.json diff --git a/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml new file mode 100644 index 0000000000..494dfda41a --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml @@ -0,0 +1,114 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check if backup high_availability_config.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/high_availability_config.yml" + register: backup_ha_config_stat + +- name: Fail if backup high_availability_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_backup_ha_config_missing }}" + when: not backup_ha_config_stat.stat.exists + +- name: Check if high_availability_config.yml exists + ansible.builtin.stat: + path: "{{ input_project_dir }}/high_availability_config.yml" + register: ha_config_stat + +- name: Fail if high_availability_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_ha_config_missing }}" + when: not ha_config_stat.stat.exists + +- name: Read backup high_availability_config.yml (source of truth) + ansible.builtin.slurp: + src: "{{ backup_location }}/high_availability_config.yml" + register: backup_ha_config_slurp + +- name: Parse backup high_availability_config.yml + ansible.builtin.set_fact: + backup_ha_config: "{{ backup_ha_config_slurp.content | b64decode | from_yaml }}" + +- name: Normalize service_k8s_cluster_ha to a list + ansible.builtin.set_fact: + ha_service_k8s_cluster_ha: >- + {{ + ( + [backup_ha_config.service_k8s_cluster_ha] + if (backup_ha_config.service_k8s_cluster_ha is mapping) + else (backup_ha_config.service_k8s_cluster_ha | default([])) + ) + }} + +- name: Collect HA entries missing virtual_ip_address + ansible.builtin.set_fact: + ha_entries_missing_vip: >- + {{ + (ha_service_k8s_cluster_ha | default([])) + | select('mapping') + | selectattr('virtual_ip_address', 'undefined') + | map(attribute='cluster_name') + | list + }} + +- name: Collect HA entries with empty virtual_ip_address + ansible.builtin.set_fact: + ha_entries_empty_vip: >- + {{ + (ha_service_k8s_cluster_ha | default([])) + | select('mapping') + | selectattr('virtual_ip_address', 'defined') + | selectattr('virtual_ip_address', 'match', '^\\s*$') + | map(attribute='cluster_name') + | list + }} + +- name: Fail if 
virtual_ip_address is missing + ansible.builtin.fail: + msg: "{{ msg_ha_virtual_ip_missing }}" + when: + - (ha_service_k8s_cluster_ha | default([]) | length) == 0 + or ((ha_entries_missing_vip | default([]) | length) > 0) + or ((ha_entries_empty_vip | default([]) | length) > 0) + +- name: Write high_availability_config.yml in Omnia 2.1 format + ansible.builtin.template: + src: high_availability_config.j2 + dest: "{{ input_project_dir }}/high_availability_config.yml" + mode: "{{ default_file_mode }}" + vars: + ha_service_k8s_cluster_ha: "{{ ha_service_k8s_cluster_ha }}" + +- name: Validate YAML syntax of transformed high_availability_config.yml + ansible.builtin.command: + cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/high_availability_config.yml','r'))" + register: ha_yaml_validation + changed_when: false + +- name: Fail if YAML validation fails + ansible.builtin.fail: + msg: "{{ msg_yaml_validation_failed }}" + when: + - ha_yaml_validation.rc != 0 + +- name: Display backup path (no-op when skipped) + ansible.builtin.debug: + msg: "{{ msg_using_backup_ha_config }}" + when: true + +- name: Display transformation summary + ansible.builtin.debug: + msg: "{{ msg_ha_config_transform_summary }}" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml index 051bbfb13c..d4b3a92e29 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml @@ -13,17 +13,6 @@ # limitations under the License. 
--- -- name: Validate backup_location is provided - ansible.builtin.fail: - msg: "backup_location must be provided to run network_spec.yml upgrade" - when: backup_location is not defined or (backup_location | string | trim) == "" - -- name: Ensure backup directory exists - ansible.builtin.file: - path: "{{ backup_location }}" - state: directory - mode: '0755' - - name: Check if backup network_spec.yml exists ansible.builtin.stat: path: "{{ backup_location }}/network_spec.yml" @@ -31,55 +20,27 @@ - name: Fail if backup network_spec.yml is not present ansible.builtin.fail: - msg: "Backup network_spec.yml is not present at {{ backup_location }}/network_spec.yml" + msg: "{{ msg_backup_network_spec_missing }}" when: not backup_network_spec_stat.stat.exists - name: Check if network_spec.yml exists ansible.builtin.stat: - path: "{{ omnia_input_dir }}/network_spec.yml" + path: "{{ input_project_dir }}/network_spec.yml" register: network_spec_stat - name: Fail if network_spec.yml is not present ansible.builtin.fail: - msg: "network_spec.yml is not present at {{ omnia_input_dir }}/network_spec.yml" + msg: "{{ msg_network_spec_missing }}" when: not network_spec_stat.stat.exists -- name: Read existing network_spec.yml - ansible.builtin.slurp: - src: "{{ omnia_input_dir }}/network_spec.yml" - register: network_spec_slurp - when: network_spec_stat.stat.exists - -- name: Parse existing network_spec.yml - ansible.builtin.set_fact: - network_spec_existing: "{{ network_spec_slurp.content | b64decode | from_yaml }}" - when: network_spec_stat.stat.exists - -- name: Check if network_spec.yml is already in Omnia 2.1 format - ansible.builtin.set_fact: - network_spec_already_21: >- - {{ - (network_spec_existing.schema_version | default('') | string) == '2.1' - and (network_spec_existing.Networks is defined) - and ((network_spec_existing.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) > 0) - }} - when: network_spec_stat.stat.exists - -- name: Skip 
transformation when network_spec.yml is already in 2.1 format - ansible.builtin.debug: - msg: "network_spec.yml is already in Omnia 2.1 format. Skipping transformation." - when: network_spec_already_21 | default(false) | bool - -- name: Read backup network_spec.yml (Omnia 2.0 source) +- name: Read backup network_spec.yml (source of truth) ansible.builtin.slurp: src: "{{ backup_location }}/network_spec.yml" register: backup_network_spec_slurp - when: not (network_spec_already_21 | default(false) | bool) - name: Parse backup network_spec.yml ansible.builtin.set_fact: backup_network_spec: "{{ backup_network_spec_slurp.content | b64decode | from_yaml }}" - when: not (network_spec_already_21 | default(false) | bool) - name: Extract admin_network and ib_network from backup file ansible.builtin.set_fact: @@ -114,79 +75,69 @@ ) }} when: - - not (network_spec_already_21 | default(false) | bool) + - true - name: Render network_spec.yml in Omnia 2.1 format ansible.builtin.template: src: network_spec.j2 - dest: "{{ omnia_input_dir }}/network_spec.yml" - mode: '0644' + dest: "{{ input_project_dir }}/network_spec.yml" + mode: "{{ default_file_mode }}" vars: admin_network_netmask_bits: "{{ admin_network.netmask_bits | default('24') }}" - when: not (network_spec_already_21 | default(false) | bool) + when: true - name: Read transformed network_spec.yml ansible.builtin.slurp: - src: "{{ omnia_input_dir }}/network_spec.yml" + src: "{{ input_project_dir }}/network_spec.yml" register: network_spec_21_slurp - when: not (network_spec_already_21 | default(false) | bool) + when: true - name: Parse transformed network_spec.yml ansible.builtin.set_fact: network_spec_21: "{{ network_spec_21_slurp.content | b64decode | from_yaml }}" - when: not (network_spec_already_21 | default(false) | bool) + when: true - name: Validate YAML syntax of transformed network_spec.yml ansible.builtin.command: - cmd: python3 -c "import yaml; yaml.safe_load(open('{{ omnia_input_dir }}/network_spec.yml','r'))" + 
cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/network_spec.yml','r'))" register: network_spec_yaml_validation changed_when: false - when: not (network_spec_already_21 | default(false) | bool) + when: true - name: Fail if YAML validation fails ansible.builtin.fail: - msg: "YAML validation failed after transforming network_spec.yml" + msg: "{{ msg_yaml_validation_failed }}" when: - - not (network_spec_already_21 | default(false) | bool) - network_spec_yaml_validation.rc != 0 - name: Ensure ib_network.netmask_bits matches admin_network.netmask_bits ansible.builtin.fail: - msg: "ib_network.netmask_bits must match admin_network.netmask_bits in Omnia 2.1" + msg: "{{ msg_ib_netmask_mismatch }}" when: - - not (network_spec_already_21 | default(false) | bool) - >- (ib_network.netmask_bits | default(admin_network.netmask_bits | default('24')) | string) != (admin_network.netmask_bits | default('24') | string) - name: Display backup path (no-op when skipped) ansible.builtin.debug: - msg: "Using backup as input source: {{ backup_location }}/network_spec.yml (backup is not modified)" - when: not (network_spec_already_21 | default(false) | bool) + msg: "{{ msg_using_backup_network_spec }}" + when: true - name: Validate mandatory ib_network is present in transformed output ansible.builtin.fail: - msg: "ib_network is mandatory in Omnia 2.1 network_spec.yml" + msg: "{{ msg_ib_network_missing }}" when: - - not (network_spec_already_21 | default(false) | bool) - >- (network_spec_21.Networks is not defined) or ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) == 0) - name: Validate mandatory ib_network.subnet is present in transformed output ansible.builtin.fail: - msg: "ib_network.subnet is mandatory in Omnia 2.1 network_spec.yml" + msg: "{{ msg_ib_subnet_missing }}" when: - - not (network_spec_already_21 | default(false) | bool) - >- ((network_spec_21.Networks | select('mapping') | 
selectattr('ib_network', 'defined') | map(attribute='ib_network') | first | default({})).subnet | default('') | string | trim) == '' - name: Display transformation summary ansible.builtin.debug: - msg: | - network_spec.yml upgraded to Omnia 2.1 format. - Backup preserved at: {{ backup_location }}/network_spec.yml - Key changes: - - Added mandatory ib_network section - - primary_oim_bmc_ip treated as optional - - ib_network.netmask_bits aligned with admin_network.netmask_bits + msg: "{{ msg_network_spec_transform_summary }}" diff --git a/upgrade/roles/import_input_parameters/templates/high_availability_config.j2 b/upgrade/roles/import_input_parameters/templates/high_availability_config.j2 new file mode 100644 index 0000000000..b116d962fe --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/high_availability_config.j2 @@ -0,0 +1,27 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. 
+# *********************************************************************** + +# *********************************************************************** +# High Availability (HA) Configuration for Kubernetes (K8s) Service Node(List) +# - cluster_name is required field it should match one of the values defined in omnia_config.yml where deployment is set to true. +# - enable_k8s_ha: Indicates whether to enable HA for the Kubernetes (K8s) service node. Set to 'true' to enable, 'false' to disable. +# - virtual_ip_address: The virtual IP address for the K8s service node setup. +# *********************************************************************** + +{{ {'service_k8s_cluster_ha': ha_service_k8s_cluster_ha} | to_nice_yaml(indent=2) }} diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index c44a5bbb87..d7281bdc0b 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -13,6 +13,54 @@ # limitations under the License. 
--- -omnia_input_dir: /opt/omnia/input/project_default +backup_location: /opt/omnia/backups/upgrade -backup_location: /opt/omnia/backups/upgrade \ No newline at end of file +backup_dir_mode: '0755' +default_file_mode: '0644' + +msg_backup_location_missing: "backup_location must be provided" +msg_restore_item_name_missing: "restore_item must define 'name'" +msg_validation_failed: "Validation failed for {{ restore_item.name }}" +msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" +msg_backup_network_spec_missing: "Backup network_spec.yml missing" +msg_network_spec_missing: "network_spec.yml missing" +msg_network_spec_already_21: "network_spec.yml already in 2.1 format - overwriting" +msg_backup_ha_config_missing: "Backup high_availability_config.yml missing" +msg_ha_config_missing: "high_availability_config.yml missing" +msg_ha_config_already_21: "high_availability_config.yml already in 2.1 format - overwriting" +msg_ha_virtual_ip_missing: "service_k8s_cluster_ha.virtual_ip_address is mandatory" +msg_yaml_validation_failed: "YAML validation failed" + +msg_ib_netmask_mismatch: "ib_network.netmask_bits must match admin_network.netmask_bits" +msg_ib_network_missing: "ib_network is mandatory" +msg_ib_subnet_missing: "ib_network.subnet is mandatory" +msg_using_backup_network_spec: "Using backup network_spec.yml (backup not modified)" +msg_using_backup_ha_config: "Using backup high_availability_config.yml (backup not modified)" + +msg_restore_summary: | + {{ restore_item.name }} restored from backup. + Backup: {{ backup_location }}/{{ restore_item.name }} + Target: {{ input_project_dir }}/{{ restore_item.name }} + +msg_network_spec_transform_summary: | + network_spec.yml upgraded to 2.1 format. 
+ Backup preserved at: {{ backup_location }}/network_spec.yml + Changes: + - Added mandatory ib_network + - Made primary_oim_bmc_ip optional + - Aligned ib_network.netmask_bits with admin_network.netmask_bits + +msg_ha_config_transform_summary: | + high_availability_config.yml upgraded to 2.1 format. + Backup preserved at: {{ backup_location }}/high_availability_config.yml + Changes: + - Ensured service_k8s_cluster_ha is a list + - Ensured virtual_ip_address is present + +restore_input_files: + - name: software_config.json + mode: '0644' + validate_cmd: "python3 -m json.tool '{{ input_project_dir }}/software_config.json'" + - name: pxe_mapping_file.csv + mode: '0644' + validate_cmd: "" \ No newline at end of file diff --git a/upgrade/upgrade_oim.yml b/upgrade/upgrade_oim.yml index 3e91f1a479..aa6e6fb5fc 100644 --- a/upgrade/upgrade_oim.yml +++ b/upgrade/upgrade_oim.yml @@ -17,4 +17,5 @@ hosts: localhost connection: local roles: + - role: ../utils/roles/include_input_dir - role: upgrade_oim From c6866538c83789353087abdce749c19d188c3076 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 6 Feb 2026 15:45:11 +0530 Subject: [PATCH 045/172] Update main.yml --- .../import_input_parameters/vars/main.yml | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index d7281bdc0b..93b328e279 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -18,30 +18,38 @@ backup_location: /opt/omnia/backups/upgrade backup_dir_mode: '0755' default_file_mode: '0644' +# Precheck backup location messages msg_backup_location_missing: "backup_location must be provided" + +# Restore input files messages msg_restore_item_name_missing: "restore_item must define 'name'" msg_validation_failed: "Validation failed for {{ restore_item.name }}" msg_backup_file_missing: "Backup file missing: 
{{ restore_item.name }}" + +# Network spec transformation messages msg_backup_network_spec_missing: "Backup network_spec.yml missing" msg_network_spec_missing: "network_spec.yml missing" msg_network_spec_already_21: "network_spec.yml already in 2.1 format - overwriting" -msg_backup_ha_config_missing: "Backup high_availability_config.yml missing" -msg_ha_config_missing: "high_availability_config.yml missing" -msg_ha_config_already_21: "high_availability_config.yml already in 2.1 format - overwriting" -msg_ha_virtual_ip_missing: "service_k8s_cluster_ha.virtual_ip_address is mandatory" msg_yaml_validation_failed: "YAML validation failed" - msg_ib_netmask_mismatch: "ib_network.netmask_bits must match admin_network.netmask_bits" msg_ib_network_missing: "ib_network is mandatory" msg_ib_subnet_missing: "ib_network.subnet is mandatory" msg_using_backup_network_spec: "Using backup network_spec.yml (backup not modified)" + +# High availability config transformation messages +msg_backup_ha_config_missing: "Backup high_availability_config.yml missing" +msg_ha_config_missing: "high_availability_config.yml missing" +msg_ha_config_already_21: "high_availability_config.yml already in 2.1 format - overwriting" +msg_ha_virtual_ip_missing: "service_k8s_cluster_ha.virtual_ip_address is mandatory" msg_using_backup_ha_config: "Using backup high_availability_config.yml (backup not modified)" +### Restore summary messages msg_restore_summary: | {{ restore_item.name }} restored from backup. Backup: {{ backup_location }}/{{ restore_item.name }} Target: {{ input_project_dir }}/{{ restore_item.name }} +# Restore summary message for network spec transformation msg_network_spec_transform_summary: | network_spec.yml upgraded to 2.1 format. 
Backup preserved at: {{ backup_location }}/network_spec.yml @@ -50,6 +58,7 @@ msg_network_spec_transform_summary: | - Made primary_oim_bmc_ip optional - Aligned ib_network.netmask_bits with admin_network.netmask_bits +# Restore summary message for high availability config transformation msg_ha_config_transform_summary: | high_availability_config.yml upgraded to 2.1 format. Backup preserved at: {{ backup_location }}/high_availability_config.yml @@ -57,6 +66,19 @@ msg_ha_config_transform_summary: | - Ensured service_k8s_cluster_ha is a list - Ensured virtual_ip_address is present +# === Input files to restore from backup === +# Add input files here that should be copied from backup_location to input_project_dir +# Each entry should have: +# - name: filename (required) +# - mode: file permissions (optional, defaults to default_file_mode) +# - validate_cmd: validation command (optional, runs after restore) +# +# Examples of files to add: +# - Static configuration files that don't need transformation +# - Files that are the same format in 2.0 and 2.1 +# - Files where you want to preserve the backup values exactly +# +# DO NOT add files that require transformation (network_spec.yml, high_availability_config.yml) restore_input_files: - name: software_config.json mode: '0644' From 2e3f4b4f377b167deb2e5409a3320ddfd8b617a8 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 6 Feb 2026 16:14:59 +0530 Subject: [PATCH 046/172] Update network_spec.j2 --- upgrade/roles/import_input_parameters/templates/network_spec.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2 index 773a11446c..d9e41ba469 100644 --- a/upgrade/roles/import_input_parameters/templates/network_spec.j2 +++ b/upgrade/roles/import_input_parameters/templates/network_spec.j2 @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 3c4c28636497d0dcc4250cd0f924ff6976538291 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 6 Feb 2026 16:24:13 +0530 Subject: [PATCH 047/172] Update main.yml --- upgrade/roles/import_input_parameters/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 93b328e279..a87c855751 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -13,7 +13,7 @@ # limitations under the License. --- -backup_location: /opt/omnia/backups/upgrade +backup_location: /opt/omnia/backups/upgrade/input backup_dir_mode: '0755' default_file_mode: '0644' From e13132034aeb18eb55a27794cd93ffd7124171d0 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 6 Feb 2026 11:07:32 +0000 Subject: [PATCH 048/172] slurm backup and rollback feature --- utils/roles/slurm_cleanup/defaults/main.yml | 5 + utils/roles/slurm_cleanup/tasks/main.yml | 73 +++ .../slurm_config_backup/defaults/main.yml | 4 + .../roles/slurm_config_backup/tasks/main.yml | 116 +++++ .../slurm_config_rollback/defaults/main.yml | 5 + .../slurm_config_rollback/tasks/main.yml | 427 ++++++++++++++++++ utils/slurm_config_util.yml | 26 ++ 7 files changed, 656 insertions(+) create mode 100644 utils/roles/slurm_cleanup/defaults/main.yml create mode 100644 utils/roles/slurm_cleanup/tasks/main.yml create mode 100644 utils/roles/slurm_config_backup/defaults/main.yml create mode 100644 utils/roles/slurm_config_backup/tasks/main.yml create mode 100644 utils/roles/slurm_config_rollback/defaults/main.yml create mode 100644 utils/roles/slurm_config_rollback/tasks/main.yml create mode 100644 utils/slurm_config_util.yml diff --git 
a/utils/roles/slurm_cleanup/defaults/main.yml b/utils/roles/slurm_cleanup/defaults/main.yml new file mode 100644 index 0000000000..f54396449f --- /dev/null +++ b/utils/roles/slurm_cleanup/defaults/main.yml @@ -0,0 +1,5 @@ +--- + +slurm_share_dir_name: slurm +slurm_cleanup_pre_backup_default: 'y' +slurm_cleanup_confirm_token: 'YES' diff --git a/utils/roles/slurm_cleanup/tasks/main.yml b/utils/roles/slurm_cleanup/tasks/main.yml new file mode 100644 index 0000000000..5c59cae2d0 --- /dev/null +++ b/utils/roles/slurm_cleanup/tasks/main.yml @@ -0,0 +1,73 @@ +--- + +- name: Include variable file omnia_config.yml + ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" + tags: slurm_cleanup + +- name: Include storage vars + ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml" + tags: slurm_cleanup + +- name: Set facts for slurm + ansible.builtin.set_fact: + nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" + tags: slurm_cleanup + +- name: Read the slurm mount point + ansible.builtin.set_fact: + share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}" + tags: slurm_cleanup + +- name: Set slurm_config_path + ansible.builtin.set_fact: + slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}" + tags: slurm_cleanup + +- name: Prompt for pre-cleanup backup + ansible.builtin.pause: + prompt: "Before cleanup, take a config backup? (y/n)" + register: pre_cleanup_backup + tags: slurm_cleanup + +- name: Set pre-cleanup backup choice + ansible.builtin.set_fact: + pre_cleanup_backup_choice: "{{ pre_cleanup_backup.user_input | default('') | trim | lower }}" + tags: slurm_cleanup + +- name: Fail if pre-cleanup backup choice is empty + ansible.builtin.fail: + msg: "No input provided for pre-cleanup backup prompt. Cleanup aborted." 
+ when: pre_cleanup_backup_choice | length == 0 + tags: slurm_cleanup + +- name: Validate pre-cleanup backup choice + ansible.builtin.fail: + msg: "Invalid input '{{ pre_cleanup_backup.user_input | default('') }}'. Enter 'y' or 'n'." + when: pre_cleanup_backup_choice not in ['y', 'yes', 'n', 'no'] + tags: slurm_cleanup + +- name: Run config backup before cleanup + ansible.builtin.include_role: + name: slurm_config_backup + apply: + tags: slurm_cleanup + when: pre_cleanup_backup_choice in ['y', 'yes'] + tags: slurm_cleanup + +- name: Confirm cleanup + ansible.builtin.pause: + prompt: "This will delete {{ slurm_config_path }}. Type {{ slurm_cleanup_confirm_token }} to continue" + register: cleanup_confirm + tags: slurm_cleanup + +- name: Fail if cleanup not confirmed + ansible.builtin.fail: + msg: "Cleanup aborted" + when: cleanup_confirm.user_input != slurm_cleanup_confirm_token + tags: slurm_cleanup + +- name: Delete slurm share directory + ansible.builtin.file: + path: "{{ slurm_config_path }}" + state: absent + tags: slurm_cleanup diff --git a/utils/roles/slurm_config_backup/defaults/main.yml b/utils/roles/slurm_config_backup/defaults/main.yml new file mode 100644 index 0000000000..b631a205d0 --- /dev/null +++ b/utils/roles/slurm_config_backup/defaults/main.yml @@ -0,0 +1,4 @@ +--- + +slurm_share_dir_name: slurm +slurm_backups_dir_name: slurm_backups diff --git a/utils/roles/slurm_config_backup/tasks/main.yml b/utils/roles/slurm_config_backup/tasks/main.yml new file mode 100644 index 0000000000..4871ab705b --- /dev/null +++ b/utils/roles/slurm_config_backup/tasks/main.yml @@ -0,0 +1,116 @@ +--- + +- name: Include variable file omnia_config.yml + ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" + +- name: Include storage vars + ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml" + +- name: Set facts for slurm + ansible.builtin.set_fact: + nfs_storage_name: "{{ 
slurm_cluster[0].nfs_storage_name }}" + +- name: Read the slurm mount point + ansible.builtin.set_fact: + share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}" + +- name: Display resolved slurm share path + ansible.builtin.debug: + msg: "Resolved share_path={{ share_path }} (nfs_storage_name={{ nfs_storage_name }})" + +- name: Slurp remote YAML file + ansible.builtin.slurp: + src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" + register: slurped_yaml + +- name: Parse YAML into vars + ansible.builtin.set_fact: + node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}" + +- name: Read the node name group + ansible.builtin.set_fact: + name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}" + +- name: Group the functional_groups + ansible.builtin.set_fact: + tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}" + +- name: Re-organize the groups + ansible.builtin.set_fact: + grouped_nodes: "{{ grouped_nodes | default({}) | combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}" + loop: "{{ tmp_grouped_nodes }}" + +- name: Assign slurm lists + ansible.builtin.set_fact: + ctld_list: "{{ grouped_nodes | dict2items + | selectattr('key', 'match', '^' ~ 'slurm_control_node_') + | map(attribute='value') | list | flatten }}" + +- name: Fail if Slurm controller list is empty + ansible.builtin.fail: + msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun." 
+ when: ctld_list | length == 0 + +- name: Set slurm_config_path + ansible.builtin.set_fact: + slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}" + +- name: Display resolved slurm config path + ansible.builtin.debug: + msg: "Resolved slurm_config_path={{ slurm_config_path }}" + +- name: Prompt for backup base name + ansible.builtin.pause: + prompt: "Enter backup base name (leave empty for timestamp-only)" + register: backup_base_name_input + +- name: Set backup id + ansible.builtin.set_fact: + backup_timestamp: "{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" + backup_base_name: "{{ backup_base_name_input.user_input | default('') }}" + +- name: Set backup directory + ansible.builtin.set_fact: + slurm_backups_root: "{{ share_path }}/{{ slurm_backups_dir_name }}" + backup_id: "{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, backup_timestamp) }}" + backup_dir: "{{ share_path }}/{{ slurm_backups_dir_name }}/{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, backup_timestamp) }}" + +- name: Ensure slurm backups root exists + ansible.builtin.file: + path: "{{ slurm_backups_root }}" + state: directory + mode: '0755' + +- name: Display slurm backups root + ansible.builtin.debug: + msg: "Resolved slurm_backups_root={{ slurm_backups_root }}" + +- name: Create backup directory + ansible.builtin.file: + path: "{{ backup_dir }}" + state: directory + mode: '0755' + +- name: Create backup config directories + ansible.builtin.file: + path: "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}" + state: directory + mode: '0755' + loop: + - etc/slurm + - etc/munge + - etc/my.cnf.d + +- name: Backup controller config directories + ansible.builtin.command: >- + cp -a "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/." 
"{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}/" + loop: + - etc/slurm + - etc/munge + - etc/my.cnf.d + changed_when: true + failed_when: false + +- name: Display backup location + ansible.builtin.debug: + msg: "Slurm config backup created at: {{ backup_dir }}/{{ ctld_list[0] }}" diff --git a/utils/roles/slurm_config_rollback/defaults/main.yml b/utils/roles/slurm_config_rollback/defaults/main.yml new file mode 100644 index 0000000000..601e25cd18 --- /dev/null +++ b/utils/roles/slurm_config_rollback/defaults/main.yml @@ -0,0 +1,5 @@ +--- + +slurm_share_dir_name: slurm +slurm_backups_dir_name: slurm_backups +slurm_rollback_backup_list_limit_default: 20 diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml new file mode 100644 index 0000000000..e9822de876 --- /dev/null +++ b/utils/roles/slurm_config_rollback/tasks/main.yml @@ -0,0 +1,427 @@ +--- + +- name: Include variable file omnia_config.yml + ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" + tags: config_rollback + +- name: Include storage vars + ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml" + tags: config_rollback + +- name: Set facts for slurm + ansible.builtin.set_fact: + nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" + tags: config_rollback + +- name: Read the slurm mount point + ansible.builtin.set_fact: + share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}" + tags: config_rollback + +- name: Slurp remote YAML file + ansible.builtin.slurp: + src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" + register: slurped_yaml + tags: config_rollback + +- name: Parse YAML into vars + ansible.builtin.set_fact: + node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}" + tags: config_rollback + +- name: Get name and IP mapping 1 + 
ansible.builtin.set_fact: + tmp_ip_name_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='interfaces') }}" + tags: config_rollback + +- name: Get name and IP mapping 2 + ansible.builtin.set_fact: + ip_name_map: "{{ ip_name_map | default({}) | combine({item.key: item.value[0]['ip_addrs'][0]['ip_addr']}) }}" + loop: "{{ tmp_ip_name_map | dict2items }}" + tags: config_rollback + +- name: Read the node name group + ansible.builtin.set_fact: + name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}" + tags: config_rollback + +- name: Group the functional_groups + ansible.builtin.set_fact: + tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}" + tags: config_rollback + +- name: Re-organize the groups + ansible.builtin.set_fact: + grouped_nodes: "{{ grouped_nodes | default({}) | combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}" + loop: "{{ tmp_grouped_nodes }}" + tags: config_rollback + +- name: Assign slurm lists + ansible.builtin.set_fact: + ctld_list: "{{ grouped_nodes | dict2items + | selectattr('key', 'match', '^' ~ 'slurm_control_node_') + | map(attribute='value') | list | flatten }}" + tags: config_rollback + +- name: Fail if Slurm controller list is empty + ansible.builtin.fail: + msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun." 
+ when: ctld_list | length == 0 + tags: config_rollback + +- name: Set slurm controller IP + ansible.builtin.set_fact: + controller_ip: "{{ ip_name_map[ctld_list | first] }}" + when: ctld_list | length > 0 + tags: config_rollback + +- name: Add slurm controller as dynamic host + ansible.builtin.add_host: + name: slurm_controller + ansible_host: "{{ controller_ip }}" + ansible_user: root + ansible_port: 22 + when: controller_ip is defined + tags: config_rollback + +- name: Set slurm paths + ansible.builtin.set_fact: + slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}" + slurm_backups_root: "{{ share_path }}/{{ slurm_backups_dir_name }}" + tags: config_rollback + +- name: Find available backups + ansible.builtin.find: + paths: "{{ slurm_backups_root }}" + file_type: directory + depth: 1 + register: backup_dirs + tags: config_rollback + +- name: Fail if no backups found + ansible.builtin.fail: + msg: "No backups found in {{ slurm_backups_root }}" + when: backup_dirs.files | length == 0 + tags: config_rollback + +- name: Set rollback backup list limit + ansible.builtin.set_fact: + rollback_backup_list_limit_effective: "{{ lookup('vars', 'rollback_backup_list_limit', default=slurm_rollback_backup_list_limit_default) | int }}" + tags: config_rollback + +- name: Build backup choices + ansible.builtin.set_fact: + backup_choices: >- + {{ + ( + backup_dirs.files + | sort(attribute='mtime', reverse=true) + | map(attribute='path') + | list + )[:(rollback_backup_list_limit_effective | int)] + }} + total_backup_count: "{{ backup_dirs.files | length }}" + tags: config_rollback + +- name: Notify if backup list is truncated + ansible.builtin.debug: + msg: "Showing latest {{ rollback_backup_list_limit_effective }} backups out of {{ total_backup_count }}. Increase rollback_backup_list_limit to show more." 
+ when: (total_backup_count | int) > (rollback_backup_list_limit_effective | int) + tags: config_rollback + +- name: Display backup list order + ansible.builtin.debug: + msg: "Backup list is sorted latest first." + tags: config_rollback + +- name: Show backup choices + ansible.builtin.debug: + msg: "{{ backup_choice_index + 1 }}: {{ item | basename }}" + loop: "{{ backup_choices }}" + loop_control: + index_var: backup_choice_index + tags: config_rollback + +- name: Prompt user to select backup number + ansible.builtin.pause: + prompt: "Enter the backup number to rollback to" + register: backup_choice_input + tags: config_rollback + +- name: Set backup choice index + ansible.builtin.set_fact: + backup_choice_index: "{{ backup_choice_input.user_input | default('') | trim }}" + tags: config_rollback + +- name: Fail if backup selection is empty + ansible.builtin.fail: + msg: "No backup number selected. Rollback aborted." + when: backup_choice_index | length == 0 + tags: config_rollback + +- name: Validate backup choice input is within range + ansible.builtin.fail: + msg: "Invalid selection '{{ backup_choice_input.user_input | default('') }}'. Enter a number between 1 and {{ backup_choices | length }}." 
+ when: + - (backup_choice_index | int) < 1 or (backup_choice_index | int) > (backup_choices | length) + tags: config_rollback + +- name: Set selected backup + ansible.builtin.set_fact: + selected_backup_dir: "{{ backup_choices[(backup_choice_index | int) - 1] }}" + tags: config_rollback + +- name: Set selected backup controller root + ansible.builtin.set_fact: + selected_backup_ctld_root: "{{ selected_backup_dir }}/{{ ctld_list[0] }}" + tags: config_rollback + +- name: Check slurm.conf exists in selected backup + ansible.builtin.stat: + path: "{{ selected_backup_ctld_root }}/etc/slurm/slurm.conf" + register: slurm_conf_stat + tags: config_rollback + +- name: Fail if slurm.conf missing in backup + ansible.builtin.fail: + msg: "Selected backup is missing {{ ctld_list[0] }}/etc/slurm/slurm.conf" + when: not slurm_conf_stat.stat.exists + tags: config_rollback + +- name: Check key slurm conf files existence in selected backup + ansible.builtin.stat: + path: "{{ selected_backup_ctld_root }}/etc/slurm/{{ item }}" + loop: + - slurmdbd.conf + - cgroup.conf + - gres.conf + register: slurm_conf_files_stats + tags: config_rollback + +- name: Compute missing slurm conf files in selected backup + ansible.builtin.set_fact: + missing_slurm_conf_files: "{{ slurm_conf_files_stats.results | rejectattr('stat.exists') | map(attribute='item') | list }}" + tags: config_rollback + +- name: Warn if slurm conf files are missing in selected backup + ansible.builtin.debug: + msg: "WARNING: Missing files in selected backup under etc/slurm: {{ missing_slurm_conf_files }}" + when: missing_slurm_conf_files | length > 0 + tags: config_rollback + +- name: Prompt to continue if slurm conf files are missing + ansible.builtin.pause: + prompt: "Some slurm config files are missing in the selected backup. Continue anyway? 
(y/N)" + register: continue_missing_confs + when: missing_slurm_conf_files | length > 0 + tags: config_rollback + +- name: Fail if user does not want to continue with missing slurm conf files + ansible.builtin.fail: + msg: "Rollback aborted" + when: + - missing_slurm_conf_files | length > 0 + - continue_missing_confs.user_input | default('N') | lower != 'y' + tags: config_rollback + +- name: Check munge.key exists in selected backup + ansible.builtin.stat: + path: "{{ selected_backup_ctld_root }}/etc/munge/munge.key" + register: munge_key_stat + tags: config_rollback + +- name: Warn if munge.key is missing in selected backup + ansible.builtin.debug: + msg: "WARNING: munge.key is missing in selected backup under etc/munge." + when: not munge_key_stat.stat.exists + tags: config_rollback + +- name: Prompt to continue if munge.key is missing + ansible.builtin.pause: + prompt: "munge.key is missing in the selected backup. Continue anyway? (y/N)" + register: continue_missing_munge_key + when: not munge_key_stat.stat.exists + tags: config_rollback + +- name: Fail if user does not want to continue without munge.key + ansible.builtin.fail: + msg: "Rollback aborted" + when: + - not munge_key_stat.stat.exists + - continue_missing_munge_key.user_input | default('N') | lower != 'y' + tags: config_rollback + +- name: Check backup directories + ansible.builtin.stat: + path: "{{ selected_backup_ctld_root }}/{{ item }}" + loop: + - etc/slurm + - etc/munge + - etc/my.cnf.d + register: backup_dir_stats + tags: config_rollback + +- name: Compute missing backup directories + ansible.builtin.set_fact: + missing_backup_dirs: "{{ backup_dir_stats.results | rejectattr('stat.exists') | map(attribute='item') | list }}" + tags: config_rollback + +- name: Warn if backup directories missing + ansible.builtin.debug: + msg: "WARNING: Missing directories in backup: {{ missing_backup_dirs }}" + when: missing_backup_dirs | length > 0 + tags: config_rollback + +- name: Prompt to continue if backup 
directories missing + ansible.builtin.pause: + prompt: "Some directories are missing in the backup. Continue anyway? (y/N)" + register: continue_missing + when: missing_backup_dirs | length > 0 + tags: config_rollback + +- name: Fail if user does not want to continue + ansible.builtin.fail: + msg: "Rollback aborted" + when: + - missing_backup_dirs | length > 0 + - continue_missing.user_input | default('N') | lower != 'y' + tags: config_rollback + +- name: Prompt for safety backup before rollback + ansible.builtin.pause: + prompt: "Create a safety backup of current state before rollback? (y/n)" + register: pre_rollback_backup + tags: config_rollback + +- name: Set pre-rollback backup choice + ansible.builtin.set_fact: + pre_rollback_backup_choice: "{{ pre_rollback_backup.user_input | default('') | trim | lower }}" + tags: config_rollback + +- name: Fail if pre-rollback backup choice is empty + ansible.builtin.fail: + msg: "No input provided for safety backup prompt. Rollback aborted." + when: pre_rollback_backup_choice | length == 0 + tags: config_rollback + +- name: Validate pre-rollback backup choice + ansible.builtin.fail: + msg: "Invalid input '{{ pre_rollback_backup.user_input | default('') }}'. Enter 'y' or 'n'." 
+ when: pre_rollback_backup_choice not in ['y', 'yes', 'n', 'no'] + tags: config_rollback + +- name: Run safety backup before rollback + ansible.builtin.include_role: + name: slurm_config_backup + apply: + tags: config_rollback + when: pre_rollback_backup_choice in ['y', 'yes'] + tags: config_rollback + +- name: Stat slurmdbd.conf before restore + ansible.builtin.stat: + path: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/slurmdbd.conf" + checksum_algorithm: sha1 + register: slurmdbd_before + tags: config_rollback + +- name: Restore config directories + ansible.builtin.command: >- + rsync -a "{{ selected_backup_ctld_root }}/{{ item }}/" "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/" + loop: + - etc/slurm + - etc/munge + - etc/my.cnf.d + changed_when: true + failed_when: false + tags: config_rollback + +- name: Check slurmdbd.conf permissions after restore + ansible.builtin.stat: + path: /etc/slurm/slurmdbd.conf + delegate_to: slurm_controller + register: slurmdbd_conf_perm_stat + tags: config_rollback + +- name: Fix slurmdbd.conf permissions after restore + ansible.builtin.file: + path: /etc/slurm/slurmdbd.conf + mode: '0600' + delegate_to: slurm_controller + when: slurmdbd_conf_perm_stat.stat.exists + tags: config_rollback + +- name: Check munge.key permissions after restore + ansible.builtin.stat: + path: /etc/munge/munge.key + delegate_to: slurm_controller + register: munge_key_perm_stat + tags: config_rollback + +- name: Fix munge.key permissions after restore + ansible.builtin.file: + path: /etc/munge/munge.key + mode: '0400' + delegate_to: slurm_controller + when: munge_key_perm_stat.stat.exists + tags: config_rollback + +- name: Stat slurmdbd.conf after restore + ansible.builtin.stat: + path: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/slurmdbd.conf" + checksum_algorithm: sha1 + register: slurmdbd_after + tags: config_rollback + +- name: Check slurmctld is active before reconfigure + ansible.builtin.command: systemctl is-active 
slurmctld + delegate_to: slurm_controller + register: slurmctld_active + changed_when: false + failed_when: false + tags: config_rollback + +- name: Fail if slurmctld is not active + ansible.builtin.fail: + msg: "slurmctld is not active on the controller. Rollback applied on disk, but cannot reconfigure until slurmctld is running. Verify munge and slurmctld services and restart slurmctld, then re-run rollback or run 'scontrol reconfigure' on the controller." + when: slurmctld_active.stdout | default('') | trim != 'active' + tags: config_rollback + +- name: Run scontrol reconfigure + tags: config_rollback + block: + - name: Execute scontrol reconfigure + ansible.builtin.command: scontrol reconfigure + delegate_to: slurm_controller + register: reconfigure_out + changed_when: true + failed_when: reconfigure_out.rc != 0 + rescue: + - name: Display scontrol reconfigure error + ansible.builtin.debug: + msg: "scontrol reconfigure failed. stdout={{ reconfigure_out.stdout | default('') }} stderr={{ reconfigure_out.stderr | default('') }}" + + - name: Fail with rollback guidance + ansible.builtin.fail: + msg: "Rollback applied on disk, but scontrol reconfigure failed. Recommended action: rollback to the safety backup created before this rollback (if you chose to create it)." + +- name: Prompt to restart slurmdbd if slurmdbd.conf changed + ansible.builtin.pause: + prompt: "slurmdbd.conf has changed. Restart slurmdbd now? 
(Y/n)" + register: restart_slurmdbd_prompt + when: + - slurmdbd_before.stat.exists + - slurmdbd_after.stat.exists + - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum + tags: config_rollback + +- name: Restart slurmdbd + ansible.builtin.command: systemctl restart slurmdbd + delegate_to: slurm_controller + when: + - slurmdbd_before.stat.exists + - slurmdbd_after.stat.exists + - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum + - restart_slurmdbd_prompt.user_input | default('Y') | lower != 'n' + changed_when: true + tags: config_rollback diff --git a/utils/slurm_config_util.yml b/utils/slurm_config_util.yml new file mode 100644 index 0000000000..7cb5249ccd --- /dev/null +++ b/utils/slurm_config_util.yml @@ -0,0 +1,26 @@ +--- + +- name: Include input project directory + when: not project_dir_status | default(false) | bool + ansible.builtin.import_playbook: include_input_dir.yml + vars: + omnia_metadata_support: true + tags: always + +- name: Create oim group + ansible.builtin.import_playbook: create_container_group.yml + vars: + oim_group: true + tags: always + +- name: Slurm config utilities + hosts: oim + connection: ssh + gather_facts: true + roles: + - role: slurm_config_backup + tags: config_backup + - role: slurm_cleanup + tags: slurm_cleanup + - role: slurm_config_rollback + tags: config_rollback From b7071143ae8eb6f05be29c10efc53cfe78ec35dc Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 6 Feb 2026 16:43:28 +0530 Subject: [PATCH 049/172] Fixed ansible lint issues --- .../tasks/transform_network_spec.yml | 16 +++++++++++++++- .../roles/import_input_parameters/vars/main.yml | 3 ++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml index d4b3a92e29..17e742d22f 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml +++ 
b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml @@ -131,12 +131,26 @@ (network_spec_21.Networks is not defined) or ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | list | length) == 0) +- name: Extract ib_network subnet from transformed output + ansible.builtin.set_fact: + ib_network_subnet: >- + {{ + ( + network_spec_21.Networks + | select('mapping') + | selectattr('ib_network', 'defined') + | map(attribute='ib_network') + | first + | default({}) + ).subnet | default('') + }} + - name: Validate mandatory ib_network.subnet is present in transformed output ansible.builtin.fail: msg: "{{ msg_ib_subnet_missing }}" when: - >- - ((network_spec_21.Networks | select('mapping') | selectattr('ib_network', 'defined') | map(attribute='ib_network') | first | default({})).subnet | default('') | string | trim) == '' + (ib_network_subnet | string | trim) == '' - name: Display transformation summary ansible.builtin.debug: diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index a87c855751..89fddebea3 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -85,4 +85,5 @@ restore_input_files: validate_cmd: "python3 -m json.tool '{{ input_project_dir }}/software_config.json'" - name: pxe_mapping_file.csv mode: '0644' - validate_cmd: "" \ No newline at end of file + validate_cmd: "" + \ No newline at end of file From fa4662a758aeb85f8ee7e3da40994e393ba2f86a Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 6 Feb 2026 16:53:38 +0530 Subject: [PATCH 050/172] Update main.yml --- upgrade/roles/import_input_parameters/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 89fddebea3..126f158e6e 100644 --- 
a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -86,4 +86,4 @@ restore_input_files: - name: pxe_mapping_file.csv mode: '0644' validate_cmd: "" - \ No newline at end of file + From bae4c11877ab2af39ce9feb5694e72d99000ee75 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Fri, 6 Feb 2026 17:30:25 +0530 Subject: [PATCH 051/172] Input validation with type check for basic types --- .../common_utils/slurm_conf_utils.py | 71 ++++++++++++++++++- .../validation_flows/common_validation.py | 7 +- .../slurm_config/tasks/check_ctld_running.yml | 1 + discovery/roles/slurm_config/tasks/confs.yml | 22 ------ discovery/roles/slurm_config/vars/main.yml | 2 - 5 files changed, 75 insertions(+), 28 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 8deb85febb..faf1b54ff0 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -525,8 +525,9 @@ class SlurmParserEnum(str, Enum): "Link": S_P_STRING, # Communication link IDs "Links": S_P_CSV, # Communication link IDs "MultipleFiles": S_P_CSV, # list of GRES device files - "Name": S_P_STRING, # Gres name - "Type": S_P_STRING # Gres type (e.g. model name) + "Type": S_P_STRING, # Gres type (e.g. 
model name) + "Name": S_P_ARRAY, # Gres name + "NodeName": S_P_ARRAY } all_confs = { @@ -546,6 +547,72 @@ class SlurmParserEnum(str, Enum): _HOSTLIST_RE = re.compile( r'^(?P[^\[\]]*)\[(?P[^\[\]]+)\](?P.*)$') +def validate_config_types(conf_dict, conf_name): + """Validate configuration keys and value types based on SlurmParserEnum.""" + current_conf = all_confs.get(conf_name, {}) + invalid_keys = set(conf_dict.keys()).difference(set(current_conf.keys())) + type_errors = [] + + for key, value in conf_dict.items(): + if key in current_conf: + expected_type_enum = current_conf[key] + expected_type = expected_type_enum.value + error = None + + if expected_type == "int": + if not isinstance(value, int): + try: + int(str(value)) + except (ValueError, TypeError): + error = f"Expected integer, got {type(value).__name__}" + + elif expected_type == "float": + if not isinstance(value, (int, float)): + try: + float(str(value)) + except (ValueError, TypeError): + error = f"Expected float, got {type(value).__name__}" + + elif expected_type == "bool": + if not isinstance(value, bool): + if str(value).lower() not in ['yes', 'no', 'true', 'false', '0', '1']: + error = f"Expected boolean, got {type(value).__name__}" + + elif expected_type == "str": + if not isinstance(value, str): + error = f"Expected string, got {type(value).__name__}" + + elif expected_type == "csv": + if not isinstance(value, str): + error = f"Expected CSV string, got {type(value).__name__}" + + elif expected_type == "list": + if not isinstance(value, list): + error = f"Expected list, got {type(value).__name__}" + + elif expected_type == "array": + if not isinstance(value, list): + error = f"Expected array (list), got {type(value).__name__}" + elif value and not all(isinstance(item, dict) for item in value): + error = "Expected array of dicts, got mixed types" + + elif expected_type == "object": + if not isinstance(value, (dict, object)): + error = f"Expected object, got {type(value).__name__}" + + if error: + 
type_errors.append({ + "error_key": "omnia_config.yml", + "error_msg": f"{conf_name}.conf: '{key}': {error} -> '{value}'", + "error_value": "slurm_cluster config_sources" + }) + + return { + 'invalid_keys': list(invalid_keys), + 'type_errors': type_errors, + 'valid': len(invalid_keys) == 0 and len(type_errors) == 0 + } + def get_invalid_keys(conf_dict, conf_name): """Get invalid configuration keys by comparing against expected keys.""" current_conf = all_confs.get(conf_name, {}) diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 52fea1ced5..2eafc3884d 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -42,7 +42,8 @@ ) from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import ( parse_slurm_conf, - get_invalid_keys + get_invalid_keys, + validate_config_types ) file_names = config.files @@ -1072,7 +1073,9 @@ def validate_omnia_config( else: # path and also exists conf_dict = parse_slurm_conf(v, k, False) # module.exit_json(failed=True, result=conf_dict) - invalid_keys = get_invalid_keys(conf_dict, k) + # invalid_keys = get_invalid_keys(conf_dict, k) + type_errors = validate_config_types(conf_dict, k) + module.exit_json(failed=True, result=type_errors) if invalid_keys: errors.append( create_error_msg(input_file_path, "slurm_cluster config_sources", diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index dacd879bf7..52984c2afb 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -21,6 +21,7 @@ delegate_to: localhost register: ssh_check ignore_errors: true + ignore_unreachable: true - name: Block when ssh_check is 
success when: ssh_check is success diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 1ff30acf34..fdf461f88c 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -152,28 +152,6 @@ ansible.builtin.set_fact: conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}" -- name: Create backup directory with timestamp - ansible.builtin.file: - path: "{{ backup_dir }}" - state: directory - mode: '0755' - owner: "{{ slurm_user }}" - group: "{{ slurm_user_group }}" - when: ctld_list - -- name: Backup existing SLURM configuration files with timestamp - ansible.builtin.copy: - src: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - dest: "{{ backup_dir }}/{{ item.item.key }}.conf" - remote_src: true - mode: preserve - loop: "{{ merged_conf.results }}" - when: - - ctld_list - - item.item.key in conf_files - register: backup_results - failed_when: false - - name: Write merged .conf ansible.builtin.copy: content: "{{ item.ini_lines | join('\n') }}\n" diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 9722725a88..939e3ac204 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -128,5 +128,3 @@ offline_path_aarch64: dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" ssh_private_key_path: /root/.ssh/oim_rsa - -backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" From 69b722625b1fa162c1f7cb5ae5b982df19182781 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 6 Feb 2026 17:30:43 +0530 Subject: [PATCH 052/172] Update main.yml --- upgrade/roles/import_input_parameters/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 126f158e6e..b208b154ca 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -13,7 +13,7 @@ # limitations under the License. --- -backup_location: /opt/omnia/backups/upgrade/input +backup_location: /opt/omnia/backups/upgrade/input/project_default backup_dir_mode: '0755' default_file_mode: '0644' From a2b49d8ed0f87edfbaa555b112708440c615066d Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 6 Feb 2026 17:33:33 +0530 Subject: [PATCH 053/172] Update main.yml --- upgrade/roles/import_input_parameters/vars/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index b208b154ca..bc4ca7430a 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -86,4 +86,3 @@ restore_input_files: - name: pxe_mapping_file.csv mode: '0644' validate_cmd: "" - From b096afb38145eef09fe01bc967463dfa18cbbe35 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Fri, 6 Feb 2026 17:58:09 +0530 Subject: [PATCH 054/172] Update omnia.sh --- omnia.sh | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/omnia.sh b/omnia.sh index 358cde2162..b2da6b6024 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1212,8 +1212,9 @@ phase1_validate() { return 1 fi - echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0.0.0)" + echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0)" + if ! 
podman inspect "omnia_core:1.1" >/dev/null 2>&1; then echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" @@ -1264,8 +1265,61 @@ phase2_approval() { return 0 } +phase3_backup_creation() { + local backup_base="$1" + + echo "[INFO] [ORCHESTRATOR] Phase 3: Backup Creation" + + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Cannot create backup because omnia_core is not running" + return 1 + fi + + if [ -z "$backup_base" ]; then + echo "[ERROR] [ORCHESTRATOR] Backup destination is empty" + return 1 + fi + + if ! podman exec -u root omnia_core bash -c " + set -e + rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs' + mkdir -p '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs' + + if [ -f '$OMNIA_INPUT_DIR/default.yml' ]; then + cp -a '$OMNIA_INPUT_DIR/default.yml' '${backup_base%/}/input/' + fi + + if [ -d '$OMNIA_INPUT_DIR/project_default' ]; then + cp -a '$OMNIA_INPUT_DIR/project_default' '${backup_base%/}/input/' + fi + + if [ ! -f '$OMNIA_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2 + exit 1 + fi + cp -a '$OMNIA_METADATA_FILE' '${backup_base%/}/metadata/oim_metadata.yml' + "; then + echo "[ERROR] [ORCHESTRATOR] Backup failed; cleaning up partial backup" + podman exec -u root omnia_core bash -c "rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'" >/dev/null 2>&1 || true + return 1 + fi + + if [ -f "/etc/containers/systemd/omnia_core.container" ]; then + if ! 
podman cp "/etc/containers/systemd/omnia_core.container" "omnia_core:${backup_base%/}/configs/omnia_core.container" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Failed to backup quadlet container file" + podman exec -u root omnia_core bash -c "rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'" >/dev/null 2>&1 || true + return 1 + fi + fi + + echo "[INFO] [ORCHESTRATOR] Backup created at: $backup_base" + echo "[INFO] [ORCHESTRATOR] Phase 3: Backup completed" + return 0 +} + upgrade_omnia_core() { local lock_file="/var/lock/omnia_core_upgrade.lock" + local backup_base if [ -e "$lock_file" ]; then echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. Another upgrade may be running.${NC}" @@ -1288,7 +1342,18 @@ upgrade_omnia_core() { exit 0 fi - echo "[INFO] [ORCHESTRATOR] Upgrade tasks for backup and container swap are deferred to a follow-up PR" + backup_base="$OMNIA_UPGRADE_BACKUP_PATH" + if [ -z "$backup_base" ]; then + echo "[ERROR] [ORCHESTRATOR] Backup path is empty" + exit 1 + fi + + if ! 
phase3_backup_creation "$backup_base"; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 3" + exit 1 + fi + + echo "[INFO] [ORCHESTRATOR] Upgrade tasks for container swap are deferred to a follow-up PR" exit 0 } From ea05124537adc8152b5fcfff60a2ad1f98a99c41 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Fri, 6 Feb 2026 18:06:15 +0530 Subject: [PATCH 055/172] Update omnia.sh --- omnia.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omnia.sh b/omnia.sh index b2da6b6024..0239ebdf3f 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1212,7 +1212,7 @@ phase1_validate() { return 1 fi - echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0)" + echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0.0.0)" From ab1b7cd147fcecddf935caf3e6343f74963d0c0b Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Fri, 6 Feb 2026 18:06:31 +0530 Subject: [PATCH 056/172] add warning for user_registry and remove user_registry from input validation Signed-off-by: Katakam-Rakesh --- .../validation_flows/local_repo_validation.py | 44 ---------------- local_repo/local_repo.yml | 2 +- .../check_additional_packages_images.yml | 50 +++++++++++++++++++ .../tasks/check_images_per_arch.yml | 43 ++++++++++++++++ local_repo/roles/validation/tasks/main.yml | 3 ++ local_repo/roles/validation/vars/main.yml | 3 ++ 6 files changed, 100 insertions(+), 45 deletions(-) create mode 100644 local_repo/roles/validation/tasks/check_additional_packages_images.yml create mode 100644 local_repo/roles/validation/tasks/check_images_per_arch.yml diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index efeda63c8a..bcec9f4197 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ 
-129,50 +129,6 @@ def validate_local_repo_config(input_file_path, data, software_config_file_path = create_file_path(input_file_path, file_names["software_config"]) software_config_json = load_json(software_config_file_path) - # Check if additional_packages is enabled and contains image packages - additional_packages_enabled = any(sw.get("name") == "additional_packages" for sw in software_config_json.get("softwares", [])) - if additional_packages_enabled: - # Get arch values from additional_packages entry in software_config.json - additional_packages_archs = [] - for software in software_config_json.get("softwares", []): - if software.get("name") == "additional_packages": - arch_list = software.get("arch", []) - additional_packages_archs = arch_list # Get all archs - break - - # Check each arch specific additional_packages.json - has_image_packages = False - for additional_packages_arch in additional_packages_archs: - additional_packages_path = create_file_path( - input_file_path, - f"config/{additional_packages_arch}/{software_config_json['cluster_os_type']}/{software_config_json['cluster_os_version']}/additional_packages.json" - ) - - if os.path.exists(additional_packages_path): - additional_packages_data = load_json(additional_packages_path) - has_image_packages = False - - # Check all sections for image packages - for section_name, section_data in additional_packages_data.items(): - if isinstance(section_data, dict) and "cluster" in section_data: - cluster_packages = section_data.get("cluster", []) - - for package in cluster_packages: - if package.get("type") == "image": - has_image_packages = True - break - - if has_image_packages: - break - - # If any architecture has image packages, user_registry must be defined and not empty - if has_image_packages and user_registry is None: - errors.append(create_error_msg( - local_repo_yml, - "user_registry", - "user_registry must be defined when additional_packages.json contains packages of type 'image'" - )) - # Extra 
validation: custom_slurm must have _slurm_custom in user_repo_url_ for sw in software_config_json["softwares"]: if sw["name"] == "slurm_custom": diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml index cb394fa845..3a743c3f47 100644 --- a/local_repo/local_repo.yml +++ b/local_repo/local_repo.yml @@ -114,7 +114,7 @@ connection: ssh gather_facts: false tasks: - - name: Read network_spec vars + - name: Validate Pulp Container and Endpoint ansible.builtin.include_role: name: pulp_validation diff --git a/local_repo/roles/validation/tasks/check_additional_packages_images.yml b/local_repo/roles/validation/tasks/check_additional_packages_images.yml new file mode 100644 index 0000000000..3b5663095b --- /dev/null +++ b/local_repo/roles/validation/tasks/check_additional_packages_images.yml @@ -0,0 +1,50 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Load local_repo_config.yml + ansible.builtin.include_vars: + file: "{{ local_repo_config_file }}" + name: local_repo_config + +- name: Check if additional_packages is enabled in software_config + ansible.builtin.set_fact: + additional_packages_enabled: "{{ software | selectattr('name', 'equalto', 'additional_packages') | list | length > 0 }}" + +- name: Get additional_packages architectures + ansible.builtin.set_fact: + additional_packages_archs: "{{ (software | selectattr('name', 'equalto', 'additional_packages') | first).arch | default([]) }}" + when: additional_packages_enabled + +- name: Check for image packages in additional_packages.json + when: additional_packages_enabled + block: + - name: Initialize image found flag + ansible.builtin.set_fact: + has_image_packages: false + + - name: Check each architecture for image packages + ansible.builtin.include_tasks: check_images_per_arch.yml + loop: "{{ additional_packages_archs }}" + loop_control: + loop_var: arch_item + when: additional_packages_archs is defined + + - name: Display warning if images found in additional_packages.json but user_registry not defined + ansible.builtin.pause: + prompt: "{{ additional_packages_image_warning_msg }}" + seconds: "{{ warning_wait_time_warning }}" + when: + - has_image_packages | bool + - local_repo_config.user_registry is not defined or local_repo_config.user_registry is none or local_repo_config.user_registry | length == 0 diff --git a/local_repo/roles/validation/tasks/check_images_per_arch.yml b/local_repo/roles/validation/tasks/check_images_per_arch.yml new file mode 100644 index 0000000000..aa20840e3e --- /dev/null +++ b/local_repo/roles/validation/tasks/check_images_per_arch.yml @@ -0,0 +1,43 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Set additional_packages.json path for {{ arch_item }} + ansible.builtin.set_fact: + additional_packages_path: "{{ project_input_path }}/config/{{ arch_item }}/{{ cluster_os_type }}/{{ cluster_os_version }}/additional_packages.json" + +- name: Check if additional_packages.json exists for {{ arch_item }} + ansible.builtin.stat: + path: "{{ additional_packages_path }}" + register: additional_packages_file + +- name: Load and check additional_packages.json for {{ arch_item }} + when: additional_packages_file.stat.exists + block: + - name: Load additional_packages.json + ansible.builtin.include_vars: + file: "{{ additional_packages_path }}" + name: additional_packages_data + + - name: Check for image type packages in additional_packages + ansible.builtin.set_fact: + has_image_packages: true + when: > + additional_packages_data | dict2items | + selectattr('value.cluster', 'defined') | + map(attribute='value.cluster') | + flatten | + selectattr('type', 'defined') | + selectattr('type', 'equalto', 'image') | + list | length > 0 diff --git a/local_repo/roles/validation/tasks/main.yml b/local_repo/roles/validation/tasks/main.yml index ea9c61aeb5..41f584dd15 100644 --- a/local_repo/roles/validation/tasks/main.yml +++ b/local_repo/roles/validation/tasks/main.yml @@ -22,6 +22,9 @@ - name: Validate software_config.json ansible.builtin.include_tasks: validate_software_config_json.yml +- name: Check for images in additional_packages + ansible.builtin.include_tasks: check_additional_packages_images.yml + - name: Validate metadata 
ansible.builtin.include_tasks: validate_metadata.yml diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml index ec343cb3ef..08a082ded7 100644 --- a/local_repo/roles/validation/vars/main.yml +++ b/local_repo/roles/validation/vars/main.yml @@ -146,6 +146,9 @@ user_registry_fail_host_cert_path_msg: "Failed. Each item in user_registry shoul time_out: 30 user_registry_msg: "Above user registries is/are not reachable. Please make sure the user registry is accessible from the Omnia Infrastructure Manager." # noqa: yaml[line-length] cert_path_failure_msg: "Certificate file path {{ item.item.cert_path }} does not exist on the Omnia Infrastructure Manager for host {{ item.item.host }}. Please verify that correct cert_path is given in {{ project_input_path }}/local_repo_config.yml" # noqa: yaml[line-length] +additional_packages_image_warning_msg: | + WARNING: additional_packages.json contains packages of type 'image', but 'user_registry' is not defined in local_repo_config.yml. + Please specify 'user_registry' in local_repo_config.yml if these images are coming from a user registry. # Usage: validate_user_repo_url.yml user_repo_url_fail_msg: "Failed. Please ensure user_repo_url is proper and should not have jinja variables. 
From fc6f0e1653656d12e21c745bd89adbf55066de34 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Fri, 6 Feb 2026 18:26:30 +0530 Subject: [PATCH 057/172] Error messages formatted --- .../common_utils/slurm_conf_utils.py | 2 +- .../validation_flows/common_validation.py | 28 +++++++++---------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index faf1b54ff0..22b38d7ad3 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -604,7 +604,7 @@ def validate_config_types(conf_dict, conf_name): type_errors.append({ "error_key": "omnia_config.yml", "error_msg": f"{conf_name}.conf: '{key}': {error} -> '{value}'", - "error_value": "slurm_cluster config_sources" + "error_value": "slurm_cluster->config_sources" }) return { diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 2eafc3884d..8c850effbf 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1065,29 +1065,27 @@ def validate_omnia_config( cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] for cfg_path_dict in cnfg_src: for k,v in cfg_path_dict.items(): + conf_dict = None if isinstance(v, str): if not os.path.exists(v): errors.append( create_error_msg(input_file_path, "slurm_cluster config_sources", f"provided conf path for {k} - {v} does not exist")) - else: # path and also exists + continue + else: # path exists conf_dict = parse_slurm_conf(v, k, False) - # module.exit_json(failed=True, result=conf_dict) - # invalid_keys = 
get_invalid_keys(conf_dict, k) - type_errors = validate_config_types(conf_dict, k) - module.exit_json(failed=True, result=type_errors) - if invalid_keys: - errors.append( - create_error_msg(input_file_path, "slurm_cluster config_sources", - f"invalid keys found in {k} - {invalid_keys}")) else: - invalid_keys = get_invalid_keys(v, k) - if invalid_keys: + conf_dict = v + + # Validate config types once for both cases + if conf_dict: + validation_result = validate_config_types(conf_dict, k) + if validation_result['type_errors']: + errors.extend(validation_result['type_errors']) + if validation_result['invalid_keys']: errors.append( - create_error_msg(input_file_path, "slurm_cluster config_sources", - f"invalid keys found in {k} - {invalid_keys}")) - - + create_error_msg('omnia_config.yml', "slurm_cluster->config_sources", + f"{k}.conf invalid keys found - {','.join(validation_result['invalid_keys'])}")) return errors def check_is_service_cluster_functional_groups_defined( From c54886f03c21d5b266c1d983892b241e605e2fba Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Fri, 6 Feb 2026 18:29:01 +0530 Subject: [PATCH 058/172] lint fix Signed-off-by: Katakam-Rakesh --- .../module_utils/local_repo/software_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index f1840be158..61b0eb31b4 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -625,14 +625,14 @@ def parse_json_data(file_path, package_types,logger, failed_list=None, subgroup_ for item in value: # For every image, check if it is present in Pulp if is_additional_packages and item.get("type") == "image": - logger.info("Calling function to check %s existence in Pulp", item) - tag_missing_entry = check_additional_image_in_pulp(item, logger) - logger.info("tag_missing_entry: %s", 
tag_missing_entry) - if tag_missing_entry == {}: - continue - if tag_missing_entry: - filtered_list.append(tag_missing_entry) + logger.info("Calling function to check %s existence in Pulp", item) + tag_missing_entry = check_additional_image_in_pulp(item, logger) + logger.info("tag_missing_entry: %s", tag_missing_entry) + if tag_missing_entry == {}: continue + if tag_missing_entry: + filtered_list.append(tag_missing_entry) + continue # Get package name pkg_name = item.get("package") From b1965a4074be43f85c103eb360e2926fe25ff471 Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Fri, 6 Feb 2026 18:40:16 +0530 Subject: [PATCH 059/172] lint fix Signed-off-by: Katakam-Rakesh --- .../local_repo/process_parallel.py | 2 +- .../module_utils/local_repo/software_utils.py | 27 +++++++++---------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/common/library/module_utils/local_repo/process_parallel.py b/common/library/module_utils/local_repo/process_parallel.py index 74a24504b7..2c55098c98 100644 --- a/common/library/module_utils/local_repo/process_parallel.py +++ b/common/library/module_utils/local_repo/process_parallel.py @@ -96,7 +96,7 @@ def load_docker_credentials(vault_yml_path, vault_password_file): if response.status_code == 200: return docker_username, docker_password - + if response.status_code == 429: raise RuntimeError("Docker Hub rate limit exceeded. 
Please try again later.") diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index 61b0eb31b4..0924452d32 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -230,7 +230,7 @@ def parse_repo_urls(repo_config, local_repo_config_path, logger.info(f"Processing repository URLs for architectures: {archs_to_process}") for arch in archs_to_process: - + # Always ensure these are lists rhel_repo_entry[arch] = list(local_yaml.get(f"rhel_os_url_{arch}") or []) repo_entries[arch] = list(local_yaml.get(f"omnia_repo_url_rhel_{arch}") or []) @@ -338,8 +338,8 @@ def parse_repo_urls(repo_config, local_repo_config_path, seen_urls = set() for arch, entries in repo_entries.items(): if not entries: - logger.info(f"No OMNIA repository entries found for {arch}") - continue + logger.info(f"No OMNIA repository entries found for {arch}") + continue for repo in entries: name = repo.get("name", "unknown") @@ -455,7 +455,7 @@ def get_subgroup_dict(user_data,logger): for item in user_data.get(software_name, [])] subgroup_dict[software_name] = subgroups if isinstance( user_data.get(software_name), list) else [sw['name']] - + logger.info("Completed get_subgroup_dict(). 
Found %d software entries.", len(software_names)) logger.info("Final subgroup_dict: %s", subgroup_dict) @@ -479,17 +479,17 @@ def get_csv_software(file_name): """ csv_software = [] - + if not os.path.isfile(file_name): return csv_software - + with open(file_name, mode='r') as csv_file: reader = csv.DictReader(csv_file) csv_software = [row.get(CSV_COLUMNS["column1"], "").strip() for row in reader] return csv_software - + def get_failed_software(file_path): """ @@ -702,7 +702,6 @@ def get_new_packages_not_in_status(json_path, csv_path, subgroup_list,logger): raise names = [row['name'] for row in status_csv_content] - # Read all packages from JSON try: all_packages = parse_json_data(json_path, PACKAGE_TYPES, logger,None, subgroup_list) @@ -710,18 +709,18 @@ def get_new_packages_not_in_status(json_path, csv_path, subgroup_list,logger): except Exception as e: logger.error("Failed to parse JSON file '%s': %s", json_path, e) raise - + for pkg in all_packages: if pkg["type"] == "image": # Check exact package:tag or package:digest combination pkg_base = pkg.get("package", "").strip() pkg_identifier = pkg_base - + if "tag" in pkg: pkg_identifier += f":{pkg['tag']}" elif "digest" in pkg: pkg_identifier += f":{pkg['digest']}" - + if pkg_identifier not in names: new_packages.append(pkg) else: @@ -753,7 +752,7 @@ def process_software(software, fresh_installation, json_path, csv_path, subgroup failed_packages = None logger.info("Fresh installation detected — skipping failed package check.") else: - try: + try: failed_packages = None if fresh_installation else get_failed_software(csv_path) logger.info("Failed packages: %s", failed_packages) except Exception as e: @@ -771,7 +770,7 @@ def process_software(software, fresh_installation, json_path, csv_path, subgroup raise else: logger.info("No failed RPM packages found for: %s", software) - + # Parse main JSON data try: combined = parse_json_data( @@ -803,7 +802,7 @@ def get_software_names_and_arch(json_data, arch): sw_arch = 
sw_arch_dict[sw["name"]] if arch in sw_arch: result.append(sw["name"]) - + return result def remove_duplicates_from_trans(trans): From 810a8fd0d6798866d2143f16bf7cd9aabede529b Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Fri, 6 Feb 2026 19:27:33 +0530 Subject: [PATCH 060/172] skip ib network configuration if mellanox card is not present Signed-off-by: Katakam-Rakesh --- .../templates/doca-ofed/configure-ib-network.sh.j2 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 index 1cb95d6f9b..249b90b6a5 100644 --- a/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 +++ b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 @@ -1,6 +1,12 @@ #!/bin/bash set -euo pipefail +# Check if Mellanox hardware is present +if ! lspci | grep -i 'mellanox'; then + echo "No Mellanox RDMA hardware detected. Skipping IB network configuration." + exit 0 +fi + ADMIN_NIC_IP="{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}" NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" From c90040682c776bb5bfd73d220d88a2517b8cf0dd Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 6 Feb 2026 14:03:54 +0000 Subject: [PATCH 061/172] rollback feature update --- utils/roles/slurm_config_rollback/tasks/main.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml index e9822de876..f3aa65a3ae 100644 --- a/utils/roles/slurm_config_rollback/tasks/main.yml +++ b/utils/roles/slurm_config_rollback/tasks/main.yml @@ -405,10 +405,9 @@ ansible.builtin.fail: msg: "Rollback applied on disk, but scontrol reconfigure failed. 
Recommended action: rollback to the safety backup created before this rollback (if you chose to create it)." -- name: Prompt to restart slurmdbd if slurmdbd.conf changed - ansible.builtin.pause: - prompt: "slurmdbd.conf has changed. Restart slurmdbd now? (Y/n)" - register: restart_slurmdbd_prompt +- name: Notify slurmdbd.conf changed + ansible.builtin.debug: + msg: "Detected slurmdbd.conf change after rollback; restarting slurmdbd." when: - slurmdbd_before.stat.exists - slurmdbd_after.stat.exists @@ -422,6 +421,5 @@ - slurmdbd_before.stat.exists - slurmdbd_after.stat.exists - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum - - restart_slurmdbd_prompt.user_input | default('Y') | lower != 'n' changed_when: true tags: config_rollback From dff8462a07b301db5004c7a886a795d8fc947fd1 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 6 Feb 2026 14:29:25 +0000 Subject: [PATCH 062/172] slurmdbd service before scontrol reconfig --- .../roles/slurm_config_backup/tasks/main.yml | 8 +++- .../slurm_config_rollback/tasks/main.yml | 45 ++++++++++--------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/utils/roles/slurm_config_backup/tasks/main.yml b/utils/roles/slurm_config_backup/tasks/main.yml index 4871ab705b..4d01014180 100644 --- a/utils/roles/slurm_config_backup/tasks/main.yml +++ b/utils/roles/slurm_config_backup/tasks/main.yml @@ -69,11 +69,15 @@ backup_timestamp: "{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" backup_base_name: "{{ backup_base_name_input.user_input | default('') }}" +- name: Set backup name suffix + ansible.builtin.set_fact: + backup_name_suffix: "{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, backup_timestamp) }}" + - name: Set backup directory ansible.builtin.set_fact: slurm_backups_root: "{{ share_path }}/{{ slurm_backups_dir_name }}" - backup_id: "{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, 
backup_timestamp) }}" - backup_dir: "{{ share_path }}/{{ slurm_backups_dir_name }}/{{ (backup_base_name | length > 0) | ternary(backup_base_name ~ '_' ~ backup_timestamp, backup_timestamp) }}" + backup_id: "{{ backup_name_suffix }}" + backup_dir: "{{ share_path }}/{{ slurm_backups_dir_name }}/{{ backup_name_suffix }}" - name: Ensure slurm backups root exists ansible.builtin.file: diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml index f3aa65a3ae..0610313e32 100644 --- a/utils/roles/slurm_config_rollback/tasks/main.yml +++ b/utils/roles/slurm_config_rollback/tasks/main.yml @@ -326,8 +326,11 @@ tags: config_rollback - name: Restore config directories - ansible.builtin.command: >- - rsync -a "{{ selected_backup_ctld_root }}/{{ item }}/" "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/" + ansible.builtin.copy: + src: "{{ selected_backup_ctld_root }}/{{ item }}/" + dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/" + remote_src: true + directory_mode: '0755' loop: - etc/slurm - etc/munge @@ -373,6 +376,25 @@ register: slurmdbd_after tags: config_rollback +- name: Notify slurmdbd.conf changed + ansible.builtin.debug: + msg: "Detected slurmdbd.conf change after rollback; restarting slurmdbd." 
+ when: + - slurmdbd_before.stat.exists + - slurmdbd_after.stat.exists + - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum + tags: config_rollback + +- name: Restart slurmdbd + ansible.builtin.command: systemctl restart slurmdbd + delegate_to: slurm_controller + when: + - slurmdbd_before.stat.exists + - slurmdbd_after.stat.exists + - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum + changed_when: true + tags: config_rollback + - name: Check slurmctld is active before reconfigure ansible.builtin.command: systemctl is-active slurmctld delegate_to: slurm_controller @@ -404,22 +426,3 @@ - name: Fail with rollback guidance ansible.builtin.fail: msg: "Rollback applied on disk, but scontrol reconfigure failed. Recommended action: rollback to the safety backup created before this rollback (if you chose to create it)." - -- name: Notify slurmdbd.conf changed - ansible.builtin.debug: - msg: "Detected slurmdbd.conf change after rollback; restarting slurmdbd." - when: - - slurmdbd_before.stat.exists - - slurmdbd_after.stat.exists - - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum - tags: config_rollback - -- name: Restart slurmdbd - ansible.builtin.command: systemctl restart slurmdbd - delegate_to: slurm_controller - when: - - slurmdbd_before.stat.exists - - slurmdbd_after.stat.exists - - slurmdbd_before.stat.checksum != slurmdbd_after.stat.checksum - changed_when: true - tags: config_rollback From a7595104f839fb93eb9d1bbf34567082cfe0e854 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 6 Feb 2026 15:00:46 +0000 Subject: [PATCH 063/172] lint issue for permission --- utils/roles/slurm_config_rollback/tasks/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml index 0610313e32..ee873529fd 100644 --- a/utils/roles/slurm_config_rollback/tasks/main.yml +++ b/utils/roles/slurm_config_rollback/tasks/main.yml @@ -330,6 +330,7 @@ 
src: "{{ selected_backup_ctld_root }}/{{ item }}/" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/" remote_src: true + mode: '0644' directory_mode: '0755' loop: - etc/slurm From 677182e63ad5822e68da3ae930131cb7f73ba2bf Mon Sep 17 00:00:00 2001 From: balajikumaran-c-s Date: Fri, 6 Feb 2026 21:05:13 +0530 Subject: [PATCH 064/172] Fix host/container path handling for Omnia metadata and input --- omnia.sh | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/omnia.sh b/omnia.sh index 0239ebdf3f..f05c9ebe84 100755 --- a/omnia.sh +++ b/omnia.sh @@ -52,11 +52,15 @@ is_local_ip() { fi } -OMNIA_BASE_DIR="/opt/omnia" -OMNIA_INPUT_DIR="/opt/omnia/input" -OMNIA_BACKUPS_DIR="/opt/omnia/backups" -OMNIA_METADATA_DIR="/opt/omnia/.data" -OMNIA_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml" +# Container-side paths (used inside podman exec commands) +CONTAINER_INPUT_DIR="/opt/omnia/input" +CONTAINER_BACKUPS_DIR="/opt/omnia/backups" +CONTAINER_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml" + +# Host-side paths (initialized dynamically after omnia_path is set) +OMNIA_INPUT_DIR="" +OMNIA_METADATA_DIR="" +OMNIA_METADATA_FILE="" update_metadata_upgrade_backup_dir() { local backup_dir="$1" @@ -68,14 +72,14 @@ update_metadata_upgrade_backup_dir() { podman exec -u root omnia_core bash -c " set -e - if [ ! -f '$OMNIA_METADATA_FILE' ]; then - echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2 + if [ ! 
-f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 exit 1 fi - if grep -q '^upgrade_backup_dir:' '$OMNIA_METADATA_FILE'; then - sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$OMNIA_METADATA_FILE' + if grep -q '^upgrade_backup_dir:' '$CONTAINER_METADATA_FILE'; then + sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$CONTAINER_METADATA_FILE' else - echo 'upgrade_backup_dir: ${backup_dir}' >> '$OMNIA_METADATA_FILE' + echo 'upgrade_backup_dir: ${backup_dir}' >> '$CONTAINER_METADATA_FILE' fi " } @@ -560,6 +564,11 @@ init_container_config() { # Create the pulp_ha directory if it does not exist. echo -e "${GREEN} Creating the pulp HA directory if it does not exist.${NC}" mkdir -p "$omnia_path/omnia/pulp/pulp_ha" + + # Initialize host-side path variables based on user-provided omnia_path + OMNIA_INPUT_DIR="$omnia_path/omnia/input" + OMNIA_METADATA_DIR="$omnia_path/omnia/.data" + OMNIA_METADATA_FILE="$omnia_path/omnia/.data/oim_metadata.yml" } @@ -617,6 +626,11 @@ fetch_config() { else echo -e "${GREEN} Successfully fetched data from metadata file.${NC}" fi + + # Initialize host-side path variables based on fetched omnia_path + OMNIA_INPUT_DIR="$omnia_path/omnia/input" + OMNIA_METADATA_DIR="$omnia_path/omnia/.data" + OMNIA_METADATA_FILE="$omnia_path/omnia/.data/oim_metadata.yml" } # Validates the OIM (Omnia Infrastructure Manager) by checking if the hostname is @@ -1242,7 +1256,7 @@ phase2_approval() { echo " - Additional Package Installation" echo "============================================" - default_backup_dir="$OMNIA_BACKUPS_DIR/upgrade" + default_backup_dir="$CONTAINER_BACKUPS_DIR/upgrade" backup_base="$default_backup_dir" echo "[INFO] [ORCHESTRATOR] Backup destination: $backup_base" @@ -1285,19 +1299,19 @@ phase3_backup_creation() { rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs' mkdir -p 
'${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs' - if [ -f '$OMNIA_INPUT_DIR/default.yml' ]; then - cp -a '$OMNIA_INPUT_DIR/default.yml' '${backup_base%/}/input/' + if [ -f '$CONTAINER_INPUT_DIR/default.yml' ]; then + cp -a '$CONTAINER_INPUT_DIR/default.yml' '${backup_base%/}/input/' fi - if [ -d '$OMNIA_INPUT_DIR/project_default' ]; then - cp -a '$OMNIA_INPUT_DIR/project_default' '${backup_base%/}/input/' + if [ -d '$CONTAINER_INPUT_DIR/project_default' ]; then + cp -a '$CONTAINER_INPUT_DIR/project_default' '${backup_base%/}/input/' fi - if [ ! -f '$OMNIA_METADATA_FILE' ]; then - echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2 + if [ ! -f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 exit 1 fi - cp -a '$OMNIA_METADATA_FILE' '${backup_base%/}/metadata/oim_metadata.yml' + cp -a '$CONTAINER_METADATA_FILE' '${backup_base%/}/metadata/oim_metadata.yml' "; then echo "[ERROR] [ORCHESTRATOR] Backup failed; cleaning up partial backup" podman exec -u root omnia_core bash -c "rm -rf '${backup_base%/}/input' '${backup_base%/}/metadata' '${backup_base%/}/configs'" >/dev/null 2>&1 || true From 1345236d2ea7e5369f7ab10ee5dbf78f9d4e7343 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 6 Feb 2026 15:41:36 +0000 Subject: [PATCH 065/172] lint issue for systemd --- .../slurm_config_rollback/tasks/main.yml | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml index ee873529fd..1d3b23ec68 100644 --- a/utils/roles/slurm_config_rollback/tasks/main.yml +++ b/utils/roles/slurm_config_rollback/tasks/main.yml @@ -326,12 +326,11 @@ tags: config_rollback - name: Restore config directories - ansible.builtin.copy: + ansible.posix.synchronize: src: "{{ selected_backup_ctld_root }}/{{ item }}/" dest: "{{ slurm_config_path 
}}/{{ ctld_list[0] }}/{{ item }}/" - remote_src: true - mode: '0644' - directory_mode: '0755' + archive: true + recursive: true loop: - etc/slurm - etc/munge @@ -387,7 +386,9 @@ tags: config_rollback - name: Restart slurmdbd - ansible.builtin.command: systemctl restart slurmdbd + ansible.builtin.systemd: + name: slurmdbd + state: restarted delegate_to: slurm_controller when: - slurmdbd_before.stat.exists @@ -396,18 +397,20 @@ changed_when: true tags: config_rollback -- name: Check slurmctld is active before reconfigure - ansible.builtin.command: systemctl is-active slurmctld +- name: Gather service facts on controller + ansible.builtin.service_facts: delegate_to: slurm_controller - register: slurmctld_active - changed_when: false - failed_when: false + tags: config_rollback + +- name: Set slurmctld state + ansible.builtin.set_fact: + slurmctld_state: "{{ ansible_facts.services['slurmctld.service'].state | default('unknown') }}" tags: config_rollback - name: Fail if slurmctld is not active ansible.builtin.fail: msg: "slurmctld is not active on the controller. Rollback applied on disk, but cannot reconfigure until slurmctld is running. Verify munge and slurmctld services and restart slurmctld, then re-run rollback or run 'scontrol reconfigure' on the controller." 
- when: slurmctld_active.stdout | default('') | trim != 'active' + when: slurmctld_state != 'running' tags: config_rollback - name: Run scontrol reconfigure From 901fe1dcd51df17ee7bf1d5018c28b44b6eb3a78 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 6 Feb 2026 16:11:38 +0000 Subject: [PATCH 066/172] lint long line --- utils/roles/slurm_config_rollback/tasks/main.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml index 1d3b23ec68..19282515b3 100644 --- a/utils/roles/slurm_config_rollback/tasks/main.yml +++ b/utils/roles/slurm_config_rollback/tasks/main.yml @@ -409,7 +409,11 @@ - name: Fail if slurmctld is not active ansible.builtin.fail: - msg: "slurmctld is not active on the controller. Rollback applied on disk, but cannot reconfigure until slurmctld is running. Verify munge and slurmctld services and restart slurmctld, then re-run rollback or run 'scontrol reconfigure' on the controller." + msg: >- + slurmctld is not active on the controller. Rollback applied on disk, but cannot + reconfigure until slurmctld is running. Verify munge and slurmctld services and + restart slurmctld, then re-run rollback or run 'scontrol reconfigure' on the + controller. when: slurmctld_state != 'running' tags: config_rollback @@ -429,4 +433,7 @@ - name: Fail with rollback guidance ansible.builtin.fail: - msg: "Rollback applied on disk, but scontrol reconfigure failed. Recommended action: rollback to the safety backup created before this rollback (if you chose to create it)." + msg: >- + Rollback applied on disk, but scontrol reconfigure failed. Recommended action: + rollback to the safety backup created before this rollback (if you chose to + create it). 
From dd63ae0c6c4a06aabb616927f1e4271681cfd910 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 6 Feb 2026 16:26:49 +0000 Subject: [PATCH 067/172] copy module usage --- utils/roles/slurm_config_rollback/tasks/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml index 19282515b3..0a66c096b0 100644 --- a/utils/roles/slurm_config_rollback/tasks/main.yml +++ b/utils/roles/slurm_config_rollback/tasks/main.yml @@ -326,11 +326,11 @@ tags: config_rollback - name: Restore config directories - ansible.posix.synchronize: + ansible.builtin.copy: src: "{{ selected_backup_ctld_root }}/{{ item }}/" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/" - archive: true - recursive: true + remote_src: true + mode: preserve loop: - etc/slurm - etc/munge From 2ddc2c306188cb024d0da3e06c77ddc3a5922a7a Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Sat, 7 Feb 2026 00:25:18 +0530 Subject: [PATCH 068/172] Just Added complex gres conf !!! --- .../common_utils/slurm_conf_utils.py | 63 ++++++++++++++----- .../validation_flows/common_validation.py | 4 +- common/library/modules/slurm_conf.py | 2 +- 3 files changed, 48 insertions(+), 21 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 22b38d7ad3..d152363616 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -13,6 +13,7 @@ # limitations under the License. 
# These are the slurm options for version - 25.11 +import json import re import os from enum import Enum @@ -67,6 +68,7 @@ class SlurmParserEnum(str, Enum): nodename_options = { + "NodeName": S_P_STRING, "BcastAddr": S_P_STRING, "Boards": S_P_UINT16, "CoreSpecCount": S_P_UINT16, @@ -99,12 +101,14 @@ class SlurmParserEnum(str, Enum): nodeset_options = { + "NodeSet": S_P_STRING, "Feature": S_P_STRING, "Nodes": S_P_STRING } partition_options = { + "Partition": S_P_STRING, "AllocNodes": S_P_CSV, "AllowAccounts": S_P_CSV, "AllowGroups": S_P_CSV, @@ -514,7 +518,7 @@ class SlurmParserEnum(str, Enum): } # From https://github.com/SchedMD/slurm/blob/slurm-s/src/interfaces/gres.c#L101C40-L116C2 -gres_options = { +_gres_options = { "AutoDetect": S_P_STRING, "Count": S_P_STRING, # Number of Gres available "CPUs": S_P_STRING, # CPUs to bind to Gres resource @@ -525,11 +529,26 @@ class SlurmParserEnum(str, Enum): "Link": S_P_STRING, # Communication link IDs "Links": S_P_CSV, # Communication link IDs "MultipleFiles": S_P_CSV, # list of GRES device files - "Type": S_P_STRING, # Gres type (e.g. 
model name) - "Name": S_P_ARRAY, # Gres name - "NodeName": S_P_ARRAY + "Type": S_P_STRING } +gres_options = _gres_options.copy() +gres_options.update({ + "Name": S_P_ARRAY, + "NodeName": S_P_ARRAY +}) + +gres_nodename_options = _gres_options.copy() +gres_nodename_options.update({ + "NodeName": S_P_STRING, + "Name": S_P_STRING +}) + +gres_name_options = _gres_options.copy() +gres_name_options.update({ + "Name": S_P_STRING +}) + all_confs = { "slurm": slurm_options, "slurmdbd": slurmdbd_options, @@ -538,19 +557,23 @@ class SlurmParserEnum(str, Enum): "gres": gres_options, # TOD: GRES can have different combinations, NodeName and Name # https://slurm.schedmd.com/gres.conf.html#SECTION_EXAMPLES - "PartitionName": partition_options, - "NodeName": nodename_options, - "DownNodes": downnodes_options, - "NodeSet": nodeset_options + "slurm->PartitionName": partition_options, + "slurm->NodeName": nodename_options, + "slurm->DownNodes": downnodes_options, + "slurm->NodeSet": nodeset_options, + "gres->Name": gres_name_options, + "gres->NodeName": gres_nodename_options } _HOSTLIST_RE = re.compile( r'^(?P[^\[\]]*)\[(?P[^\[\]]+)\](?P.*)$') -def validate_config_types(conf_dict, conf_name): +def validate_config_types(conf_dict, conf_name, module): """Validate configuration keys and value types based on SlurmParserEnum.""" current_conf = all_confs.get(conf_name, {}) - invalid_keys = set(conf_dict.keys()).difference(set(current_conf.keys())) + module.warn(f"current_conf: {current_conf}") + module.warn(f"conf_dict: {conf_dict}") + invalid_keys = list(set(conf_dict.keys()).difference(set(current_conf.keys()))) type_errors = [] for key, value in conf_dict.items(): @@ -593,24 +616,30 @@ def validate_config_types(conf_dict, conf_name): elif expected_type == "array": if not isinstance(value, list): error = f"Expected array (list), got {type(value).__name__}" - elif value and not all(isinstance(item, dict) for item in value): - error = "Expected array of dicts, got mixed types" - + elif 
value: + if not all(isinstance(item, dict) for item in value): + error = "Expected array of dicts, got mixed types" + else: + # Recursively validate each dict item in the array + for item in value: + item_result = validate_config_types(item, f"{conf_name}->{key}", module) + module.warn(f"item: {item}") + module.warn(json.dumps(item_result)) + type_errors.extend(item_result['type_errors']) + invalid_keys.extend(item_result['invalid_keys']) elif expected_type == "object": if not isinstance(value, (dict, object)): error = f"Expected object, got {type(value).__name__}" if error: - type_errors.append({ + type_errors.append({ # format for error message in input validator "error_key": "omnia_config.yml", "error_msg": f"{conf_name}.conf: '{key}': {error} -> '{value}'", "error_value": "slurm_cluster->config_sources" }) - return { 'invalid_keys': list(invalid_keys), - 'type_errors': type_errors, - 'valid': len(invalid_keys) == 0 and len(type_errors) == 0 + 'type_errors': type_errors } def get_invalid_keys(conf_dict, conf_name): diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 8c850effbf..af87ce7339 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1076,10 +1076,8 @@ def validate_omnia_config( conf_dict = parse_slurm_conf(v, k, False) else: conf_dict = v - - # Validate config types once for both cases if conf_dict: - validation_result = validate_config_types(conf_dict, k) + validation_result = validate_config_types(conf_dict, k, module) if validation_result['type_errors']: errors.extend(validation_result['type_errors']) if validation_result['invalid_keys']: diff --git a/common/library/modules/slurm_conf.py b/common/library/modules/slurm_conf.py index a782cb1f79..4866077242 100644 --- 
a/common/library/modules/slurm_conf.py +++ b/common/library/modules/slurm_conf.py @@ -173,7 +173,7 @@ def slurm_conf_dict_merge(conf_dict_list, conf_name): existing_dict = merged_dict.get(ky, {}) inner_dict = existing_dict.get(item.get(ky), {}) # Get the sub-options for this array type (e.g., nodename_options, partition_options) - sub_options = all_confs.get(ky, {}) + sub_options = all_confs.get(f"{conf_name}->{ky}", {}) # Merge item into inner_dict, handling CSV fields specially for k, v in item.items(): if sub_options.get(k) == SlurmParserEnum.S_P_CSV and k in inner_dict: From 2affffc05e5b7c3f636f3443a902b967a546d59a Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Sat, 7 Feb 2026 00:27:28 +0530 Subject: [PATCH 069/172] Removed debugging lines module --- .../input_validation/common_utils/slurm_conf_utils.py | 11 ----------- .../validation_flows/common_validation.py | 1 - 2 files changed, 12 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index d152363616..3f6e2fac30 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -571,8 +571,6 @@ class SlurmParserEnum(str, Enum): def validate_config_types(conf_dict, conf_name, module): """Validate configuration keys and value types based on SlurmParserEnum.""" current_conf = all_confs.get(conf_name, {}) - module.warn(f"current_conf: {current_conf}") - module.warn(f"conf_dict: {conf_dict}") invalid_keys = list(set(conf_dict.keys()).difference(set(current_conf.keys()))) type_errors = [] @@ -623,8 +621,6 @@ def validate_config_types(conf_dict, conf_name, module): # Recursively validate each dict item in the array for item in value: item_result = validate_config_types(item, f"{conf_name}->{key}", module) - module.warn(f"item: {item}") - 
module.warn(json.dumps(item_result)) type_errors.extend(item_result['type_errors']) invalid_keys.extend(item_result['invalid_keys']) elif expected_type == "object": @@ -642,13 +638,6 @@ def validate_config_types(conf_dict, conf_name, module): 'type_errors': type_errors } -def get_invalid_keys(conf_dict, conf_name): - """Get invalid configuration keys by comparing against expected keys.""" - current_conf = all_confs.get(conf_name, {}) - # get difference between conf_dict keys and current_conf keys - diff = set(conf_dict.keys()).difference(set(current_conf.keys())) - return list(diff) - def parse_slurm_conf(file_path, conf_name, validate): """Parses the slurm.conf file and returns it as a dictionary.""" current_conf = all_confs.get(conf_name, {}) diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index af87ce7339..ae4e693b9e 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -42,7 +42,6 @@ ) from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import ( parse_slurm_conf, - get_invalid_keys, validate_config_types ) From 07f5a888ad97fca4a8f5eb4e8c689f5e7e61f2a7 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Sat, 7 Feb 2026 20:58:43 +0530 Subject: [PATCH 070/172] pylint fix --- .../common_utils/slurm_conf_utils.py | 72 ++++++++++++------- common/library/modules/slurm_conf.py | 12 ++-- 2 files changed, 54 insertions(+), 30 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 3f6e2fac30..401109640b 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ 
b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -13,12 +13,12 @@ # limitations under the License. # These are the slurm options for version - 25.11 -import json import re import os from enum import Enum from collections import OrderedDict + class SlurmParserEnum(str, Enum): """Enumeration of Slurm configuration parameter types for parsing and validation.""" @@ -62,6 +62,7 @@ class SlurmParserEnum(str, Enum): downnodes_options = { + "DownNodes": S_P_STRING, "Reason": S_P_STRING, "State": S_P_STRING, } @@ -157,7 +158,8 @@ class SlurmParserEnum(str, Enum): "TRESBillingWeights": S_P_CSV } -# From https://github.com/SchedMD/slurm/blob/slurm-/src/common/read_config.c +# From +# https://github.com/SchedMD/slurm/blob/slurm-/src/common/read_config.c slurm_options = { "AccountingStorageBackupHost": S_P_STRING, "AccountingStorageEnforce": S_P_CSV, @@ -402,7 +404,8 @@ class SlurmParserEnum(str, Enum): "SlurmctldHost": S_P_LIST } -# From https://github.com/SchedMD/slurm/blob/slurm-/src/slurmdbd/read_config.c +# From +# https://github.com/SchedMD/slurm/blob/slurm-/src/slurmdbd/read_config.c slurmdbd_options = { "AllowNoDefAcct": S_P_BOOLEAN, "AllResourcesAbsolute": S_P_BOOLEAN, @@ -473,7 +476,8 @@ class SlurmParserEnum(str, Enum): "TrackSlurmctldDown": S_P_BOOLEAN } -# From https://github.com/SchedMD/slurm/blob/slurm-/src/interfaces/cgroup.c#L332 +# From +# https://github.com/SchedMD/slurm/blob/slurm-/src/interfaces/cgroup.c#L332 cgroup_options = { "CgroupAutomount": S_P_BOOLEAN, "CgroupMountpoint": S_P_STRING, @@ -500,7 +504,8 @@ class SlurmParserEnum(str, Enum): "SystemdTimeout": S_P_UINT64 } -# From https://github.com/SchedMD/slurm/blob/slurm-/src/plugins/mpi/pmix/mpi_pmix.c#L83 +# From +# https://github.com/SchedMD/slurm/blob/slurm-/src/plugins/mpi/pmix/mpi_pmix.c#L83 mpi_options = { "PMIxCliTmpDirBase": S_P_STRING, "PMIxCollFence": S_P_STRING, @@ -517,7 +522,8 @@ class SlurmParserEnum(str, Enum): "PMIxTlsUCX": S_P_CSV } -# From 
https://github.com/SchedMD/slurm/blob/slurm-s/src/interfaces/gres.c#L101C40-L116C2 +# From +# https://github.com/SchedMD/slurm/blob/slurm-s/src/interfaces/gres.c#L101C40-L116C2 _gres_options = { "AutoDetect": S_P_STRING, "Count": S_P_STRING, # Number of Gres available @@ -568,16 +574,18 @@ class SlurmParserEnum(str, Enum): _HOSTLIST_RE = re.compile( r'^(?P[^\[\]]*)\[(?P[^\[\]]+)\](?P.*)$') + def validate_config_types(conf_dict, conf_name, module): """Validate configuration keys and value types based on SlurmParserEnum.""" current_conf = all_confs.get(conf_name, {}) - invalid_keys = list(set(conf_dict.keys()).difference(set(current_conf.keys()))) + invalid_keys = list( + set(conf_dict.keys()).difference(set(current_conf.keys()))) type_errors = [] - + for key, value in conf_dict.items(): if key in current_conf: expected_type_enum = current_conf[key] - expected_type = expected_type_enum.value + expected_type = expected_type_enum.value error = None if expected_type == "int": @@ -586,41 +594,44 @@ def validate_config_types(conf_dict, conf_name, module): int(str(value)) except (ValueError, TypeError): error = f"Expected integer, got {type(value).__name__}" - + elif expected_type == "float": if not isinstance(value, (int, float)): try: float(str(value)) except (ValueError, TypeError): error = f"Expected float, got {type(value).__name__}" - + elif expected_type == "bool": if not isinstance(value, bool): - if str(value).lower() not in ['yes', 'no', 'true', 'false', '0', '1']: + if str(value).lower() not in [ + 'yes', 'no', 'true', 'false', '0', '1']: error = f"Expected boolean, got {type(value).__name__}" - + elif expected_type == "str": if not isinstance(value, str): error = f"Expected string, got {type(value).__name__}" - + elif expected_type == "csv": if not isinstance(value, str): error = f"Expected CSV string, got {type(value).__name__}" - + elif expected_type == "list": if not isinstance(value, list): error = f"Expected list, got {type(value).__name__}" - + elif 
expected_type == "array": if not isinstance(value, list): - error = f"Expected array (list), got {type(value).__name__}" + error = f"Expected array (list), got { + type(value).__name__}" elif value: if not all(isinstance(item, dict) for item in value): error = "Expected array of dicts, got mixed types" else: # Recursively validate each dict item in the array for item in value: - item_result = validate_config_types(item, f"{conf_name}->{key}", module) + item_result = validate_config_types( + item, f"{conf_name}->{key}", module) type_errors.extend(item_result['type_errors']) invalid_keys.extend(item_result['invalid_keys']) elif expected_type == "object": @@ -628,16 +639,17 @@ def validate_config_types(conf_dict, conf_name, module): error = f"Expected object, got {type(value).__name__}" if error: - type_errors.append({ # format for error message in input validator + type_errors.append({ # format for error message in input validator "error_key": "omnia_config.yml", "error_msg": f"{conf_name}.conf: '{key}': {error} -> '{value}'", "error_value": "slurm_cluster->config_sources" - }) + }) return { 'invalid_keys': list(invalid_keys), 'type_errors': type_errors } + def parse_slurm_conf(file_path, conf_name, validate): """Parses the slurm.conf file and returns it as a dictionary.""" current_conf = all_confs.get(conf_name, {}) @@ -661,21 +673,31 @@ def parse_slurm_conf(file_path, conf_name, validate): tmp_dict[key.strip()] = value.strip() skey = list(tmp_dict.keys())[0] if validate and skey not in current_conf: - raise ValueError(f"Invalid key while parsing {file_path}: {skey}") + raise ValueError( + f"Invalid key while parsing {file_path}: {skey}") if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY: slurm_dict[list(tmp_dict.keys())[0]] = list( slurm_dict.get(list(tmp_dict.keys())[0], [])) + [tmp_dict] elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV: - existing_values = [v.strip() for v in slurm_dict.get(skey, "").split(',') if v.strip()] - new_values = [v.strip() 
for v in tmp_dict[skey].split(',') if v.strip()] - slurm_dict[skey] = ",".join(list(dict.fromkeys(existing_values + new_values))) + existing_values = [ + v.strip() for v in slurm_dict.get( + skey, "").split(',') if v.strip()] + new_values = [v.strip() + for v in tmp_dict[skey].split(',') if v.strip()] + slurm_dict[skey] = ",".join( + list( + dict.fromkeys( + existing_values + + new_values))) elif current_conf.get(skey) == SlurmParserEnum.S_P_LIST: - slurm_dict[skey] = list(slurm_dict.get(skey, [])) + list(tmp_dict.values()) + slurm_dict[skey] = list(slurm_dict.get( + skey, [])) + list(tmp_dict.values()) else: slurm_dict.update(tmp_dict) return slurm_dict + def expand_hostlist(expr): """ Expand simple Slurm-style hostlist expressions, e.g.: diff --git a/common/library/modules/slurm_conf.py b/common/library/modules/slurm_conf.py index 4866077242..dcacbcae2f 100644 --- a/common/library/modules/slurm_conf.py +++ b/common/library/modules/slurm_conf.py @@ -161,7 +161,7 @@ def read_dict2ini(conf_dict): return data -def slurm_conf_dict_merge(conf_dict_list, conf_name): +def slurm_conf_dict_merge(conf_dict_list, conf_name, replace): """Merge multiple Slurm configuration dictionaries into a single dictionary.""" merged_dict = OrderedDict() current_conf = all_confs.get(conf_name, {}) @@ -176,7 +176,7 @@ def slurm_conf_dict_merge(conf_dict_list, conf_name): sub_options = all_confs.get(f"{conf_name}->{ky}", {}) # Merge item into inner_dict, handling CSV fields specially for k, v in item.items(): - if sub_options.get(k) == SlurmParserEnum.S_P_CSV and k in inner_dict: + if sub_options.get(k) == SlurmParserEnum.S_P_CSV and k in inner_dict and not replace: # Merge CSV values existing_values = [val.strip() for val in inner_dict[k].split(',') if val.strip()] new_values = [val.strip() for val in v.split(',') if val.strip()] @@ -193,7 +193,7 @@ def slurm_conf_dict_merge(conf_dict_list, conf_name): else: new_items = [vl] merged_dict[ky] = list(dict.fromkeys(existing_list + new_items)) - 
elif current_conf.get(ky) == SlurmParserEnum.S_P_CSV: + elif current_conf.get(ky) == SlurmParserEnum.S_P_CSV and not replace: existing_values = [v.strip() for v in merged_dict.get(ky, "").split(',') if v.strip()] new_values = [v.strip() for v in vl.split(',') if v.strip()] merged_dict[ky] = ",".join(list(dict.fromkeys(existing_values + new_values))) @@ -215,7 +215,8 @@ def run_module(): "conf_map": {'type': 'dict', 'default': {}}, "conf_sources": {'type': 'list', 'elements': 'raw', 'default': []}, "conf_name": {'type': 'str', 'default': 'slurm'}, - "validate": {'type': 'bool', 'default': False} + "validate": {'type': 'bool', 'default': False}, + "replace": {'type': 'bool', 'default': False} } result = {"changed": False, "failed": False} @@ -230,6 +231,7 @@ def run_module(): try: conf_name = module.params['conf_name'] validate = module.params['validate'] + replace = module.params['replace'] # Parse the slurm.conf file if module.params['op'] == 'parse': s_dict = parse_slurm_conf(module.params['path'], conf_name, validate) @@ -249,7 +251,7 @@ def run_module(): conf_dict_list.append(OrderedDict(s_dict)) else: raise TypeError(f"Invalid type for conf_source: {type(conf_source)}") - merged_dict = slurm_conf_dict_merge(conf_dict_list, conf_name) + merged_dict = slurm_conf_dict_merge(conf_dict_list, conf_name, replace) result['conf_dict'] = merged_dict result['ini_lines'] = read_dict2ini(merged_dict) except (FileNotFoundError, ValueError, TypeError, AttributeError) as e: From 3a7481e8ead97e8a6d33745a5c01ce35b59790fe Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Sat, 7 Feb 2026 21:12:44 +0530 Subject: [PATCH 071/172] Validation keys now in module utils, now removed --- .../roles/slurm_config/defaults/main.yml | 226 +----------------- 1 file changed, 1 insertion(+), 225 deletions(-) diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index 03ea48760c..a8fbc8e9c8 100644 --- 
a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -25,231 +25,7 @@ default_corespersocket: 1 share_prefix: "/" conf_path_items: {} conf_dict_items: {} -# This validates just the keys and not the values, as native support for this schema is not available from slurm -__conf_keys: - slurm: -# updated from version 22.08 using cmd -> scontrol show config - - AccountingStorageBackupHost - - AccountingStorageEnforce - - AccountingStorageHost - - AccountingStorageExternalHost - - AccountingStorageParameters - - AccountingStoragePort - - AccountingStorageTRES - - AccountingStorageType - - AccountingStorageUser - - AccountingStoreFlags - - AcctGatherEnergyType - - AcctGatherFilesystemType - - AcctGatherInterconnectType - - AcctGatherNodeFreq - - AcctGatherProfileType - - AllowSpecResourcesUsage - - AuthAltTypes - - AuthAltParameters - - AuthInfo - - AuthType - - BatchStartTimeout - - BcastExclude - - BcastParameters - - BurstBufferType - - CliFilterPlugins - # - ClusterName # This will be set from the input "omnia_config.yml" - - CommunicationParameters - - CompleteWait - - CoreSpecPlugin - - CpuFreqDef - - CpuFreqGovernors - - CredType - - DebugFlags - - DefMemPerNode - - DependencyParameters - - DisableRootJobs - - EioTimeout - - EnforcePartLimits - - Epilog - - EpilogMsgTime - - EpilogSlurmctld - - ExtSensorsType - - ExtSensorsFreq - - FederationParameters - - FirstJobId - - GetEnvTimeout - - GresTypes - - GpuFreqDef - - GroupUpdateForce - - GroupUpdateTime - - HealthCheckInterval - - HealthCheckNodeState - - HealthCheckProgram - - InactiveLimit - - InteractiveStepOptions - - JobAcctGatherFrequency - - JobAcctGatherType - - JobAcctGatherParams - - JobCompHost - - JobCompLoc - - JobCompPort - - JobCompType - - JobCompUser - - JobContainerType - - JobCredentialPrivateKey - - JobCredentialPublicCertificate - - JobDefaults - - JobFileAppend - - JobRequeue - - JobSubmitPlugins - - KillOnBadExit - - KillWait - - 
LaunchParameters - - LaunchType - - Licenses - - LogTimeFormat - - MailDomain - - MailProg - - MaxArraySize - - MaxDBDMsgs - - MaxJobCount - - MaxJobId - - MaxMemPerNode - - MaxNodeCount - - MaxStepCount - - MaxTasksPerNode - - MCSPlugin - - MCSParameters - - MessageTimeout - - MinJobAge - - MpiDefault - - MpiParams - - NodeFeaturesPlugins - - OverTimeLimit - - PluginDir - - PlugStackConfig - - PowerParameters - - PowerPlugin - - PreemptType - - PreemptExemptTime - - PrEpParameters - - PrEpPlugins - - PriorityParameters - - PrioritySiteFactorParameters - - PrioritySiteFactorPlugin - - PriorityType - - PrivateData - - ProctrackType - - Prolog - - PrologEpilogTimeout - - PrologSlurmctld - - PrologFlags - - PropagatePrioProcess - - PropagateResourceLimits - - PropagateResourceLimitsExcept - - RebootProgram - - ReconfigFlags - - RequeueExit - - RequeueExitHold - - ResumeFailProgram - - ResumeProgram - - ResumeRate - - ResumeTimeout - - ResvEpilog - - ResvOverRun - - ResvProlog - - ReturnToService - - RoutePlugin - - SchedulerParameters - - SchedulerTimeSlice - - SchedulerType - - ScronParameters - - SelectType - - SelectTypeParameters - - SlurmUser - - SlurmctldAddr - - SlurmctldDebug - - SlurmctldLogFile - - SlurmctldPort - - SlurmctldSyslogDebug - - SlurmctldPrimaryOffProg - - SlurmctldPrimaryOnProg - - SlurmctldTimeout - - SlurmctldParameters - - SlurmdDebug - - SlurmdLogFile - - SlurmdParameters - - SlurmdPidFile - - SlurmdPort - - SlurmdSpoolDir - - SlurmdSyslogDebug - - SlurmdTimeout - - SlurmdUser - - SlurmSchedLogFile - - SlurmSchedLogLevel - - SlurmctldPidFile - - SlurmctldPlugstack - - SrunEpilog - - SrunPortRange - - SrunProlog - - StateSaveLocation - - SuspendExcNodes - - SuspendExcParts - - SuspendProgram - - SuspendRate - - SuspendTime - - SuspendTimeout - - SwitchParameters - - SwitchType - - TaskEpilog - - TaskPlugin - - TaskPluginParam - - TaskProlog - - TCPTimeout - - TmpFS - - TopologyParam - - TopologyPlugin - - TrackWCKey - - TreeWidth - - UsePam - 
- UnkillableStepProgram - - UnkillableStepTimeout - - VSizeFactor - - WaitTime - - X11Parameters - mpi: - - PMIxCliTmpDirBase - - PMIxCollFence - - PMIxDebug - - PMIxDirectConn - - PMIxDirectConnEarly - - PMIxDirectConnUCX - - PMIxDirectSameArch - - PMIxEnv - - PMIxFenceBarrier - - PMIxNetDevicesUCX - - PMIxShareServerTopology - - PMIxTimeout - - PMIxTlsUCX - cgroup: - - CgroupMountpoint - - CgroupPlugin - - CgroupSlice - - SystemdTimeout - - IgnoreSystemd - - IgnoreSystemdOnFailure - - EnableControllers - - EnableExtraControllers - - AllowedRAMSpace - - AllowedSwapSpace - - ConstrainCores - - ConstrainDevices - - ConstrainRAMSpace - - ConstrainSwapSpace - - MaxRAMPercent - - MaxSwapPercent - - MemorySwappiness - - MinRAMSpace - - SignalChildrenProcesses - slurmdbd: {} - gres: {} + __default_config: cgroup: # CgroupAutomount: true From bef1a76842e6702d0b0e27a913e431f72e694d3c Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Mon, 9 Feb 2026 00:31:07 +0530 Subject: [PATCH 072/172] Added all possible confs from the slurm source code with types --- .../common_utils/slurm_conf_utils.py | 239 ++++++++++++++++-- 1 file changed, 212 insertions(+), 27 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 401109640b..20d61afc98 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -61,14 +61,14 @@ class SlurmParserEnum(str, Enum): S_P_LIST = SlurmParserEnum.S_P_LIST -downnodes_options = { +slurm_downnodes_options = { "DownNodes": S_P_STRING, "Reason": S_P_STRING, "State": S_P_STRING, } -nodename_options = { +slurm_nodename_options = { "NodeName": S_P_STRING, "BcastAddr": S_P_STRING, "Boards": S_P_UINT16, @@ -101,14 +101,14 @@ class SlurmParserEnum(str, Enum): } -nodeset_options = { +slurm_nodeset_options = { 
"NodeSet": S_P_STRING, "Feature": S_P_STRING, "Nodes": S_P_STRING } -partition_options = { +slurm_partitionname_options = { "Partition": S_P_STRING, "AllocNodes": S_P_CSV, "AllowAccounts": S_P_CSV, @@ -504,24 +504,6 @@ class SlurmParserEnum(str, Enum): "SystemdTimeout": S_P_UINT64 } -# From -# https://github.com/SchedMD/slurm/blob/slurm-/src/plugins/mpi/pmix/mpi_pmix.c#L83 -mpi_options = { - "PMIxCliTmpDirBase": S_P_STRING, - "PMIxCollFence": S_P_STRING, - "PMIxDebug": S_P_UINT32, - "PMIxDirectConn": S_P_BOOLEAN, - "PMIxDirectConnEarly": S_P_BOOLEAN, - "PMIxDirectConnUCX": S_P_BOOLEAN, - "PMIxDirectSameArch": S_P_BOOLEAN, - "PMIxEnv": S_P_STRING, - "PMIxFenceBarrier": S_P_BOOLEAN, - "PMIxNetDevicesUCX": S_P_STRING, - "PMIxShareServerTopology": S_P_BOOLEAN, - "PMIxTimeout": S_P_UINT32, - "PMIxTlsUCX": S_P_CSV -} - # From # https://github.com/SchedMD/slurm/blob/slurm-s/src/interfaces/gres.c#L101C40-L116C2 _gres_options = { @@ -555,20 +537,223 @@ class SlurmParserEnum(str, Enum): "Name": S_P_STRING }) +# From +# https://github.com/SchedMD/slurm/blob/slurm-/src/plugins/mpi/pmix/mpi_pmix.c#L83 +mpi_options = { + "PMIxCliTmpDirBase": S_P_STRING, + "PMIxCollFence": S_P_STRING, + "PMIxDebug": S_P_UINT32, + "PMIxDirectConn": S_P_BOOLEAN, + "PMIxDirectConnEarly": S_P_BOOLEAN, + "PMIxDirectConnUCX": S_P_BOOLEAN, + "PMIxDirectSameArch": S_P_BOOLEAN, + "PMIxEnv": S_P_STRING, + "PMIxFenceBarrier": S_P_BOOLEAN, + "PMIxNetDevicesUCX": S_P_STRING, + "PMIxShareServerTopology": S_P_BOOLEAN, + "PMIxTimeout": S_P_UINT32, + "PMIxTlsUCX": S_P_CSV +} + +# src/common/oci_config.c +oci_options = { + "ContainerPath": S_P_STRING, + "CreateEnvFile": S_P_STRING, + "DisableHooks": S_P_STRING, + "EnvExclude": S_P_STRING, + "MountSpoolDir": S_P_STRING, + "RunTimeCreate": S_P_STRING, + "RunTimeDelete": S_P_STRING, + "RunTimeKill": S_P_STRING, + "RunTimeEnvExclude": S_P_STRING, + "RunTimeQuery": S_P_STRING, + "RunTimeRun": S_P_STRING, + "RunTimeStart": S_P_STRING, + "SrunPath": S_P_STRING, + 
"SrunArgs": S_P_LIST, + "DisableCleanup": S_P_BOOLEAN, + "StdIODebug": S_P_STRING, + "SyslogDebug": S_P_STRING, + "FileDebug": S_P_STRING, + "DebugFlags": S_P_STRING, + "IgnoreFileConfigJson": S_P_BOOLEAN +} + +# From +# src/plugins/acct_gather_*/* +acct_gather_options = { + "EnergyIPMIDriverType": S_P_UINT32, + "EnergyIPMIDisableAutoProbe": S_P_UINT32, + "EnergyIPMIDriverAddress": S_P_UINT32, + "EnergyIPMIRegisterSpacing": S_P_UINT32, + "EnergyIPMIDriverDevice": S_P_STRING, + "EnergyIPMIProtocolVersion": S_P_UINT32, + "EnergyIPMIUsername": S_P_STRING, + "EnergyIPMIPassword": S_P_STRING, + "EnergyIPMIPrivilegeLevel": S_P_UINT32, + "EnergyIPMIAuthenticationType": S_P_UINT32, + "EnergyIPMICipherSuiteId": S_P_UINT32, + "EnergyIPMISessionTimeout": S_P_UINT32, + "EnergyIPMIRetransmissionTimeout": S_P_UINT32, + "EnergyIPMIWorkaroundFlags": S_P_UINT32, + "EnergyIPMIRereadSdrCache": S_P_BOOLEAN, + "EnergyIPMIIgnoreNonInterpretableSensors": S_P_BOOLEAN, + "EnergyIPMIBridgeSensors": S_P_BOOLEAN, + "EnergyIPMIInterpretOemData": S_P_BOOLEAN, + "EnergyIPMISharedSensors": S_P_BOOLEAN, + "EnergyIPMIDiscreteReading": S_P_BOOLEAN, + "EnergyIPMIIgnoreScanningDisabled": S_P_BOOLEAN, + "EnergyIPMIAssumeBmcOwner": S_P_BOOLEAN, + "EnergyIPMIEntitySensorNames": S_P_BOOLEAN, + "EnergyIPMIFrequency": S_P_UINT32, + "EnergyIPMICalcAdjustment": S_P_BOOLEAN, + "EnergyIPMIPowerSensors": S_P_STRING, + "EnergyIPMITimeout": S_P_UINT32, + "EnergyIPMIVariable": S_P_STRING, + "ProfileHDF5Dir": S_P_STRING, + "ProfileHDF5Default": S_P_STRING, + "ProfileInfluxDBDatabase": S_P_STRING, + "ProfileInfluxDBDefault": S_P_STRING, + "ProfileInfluxDBFrequency": S_P_UINT32, + "ProfileInfluxDBHost": S_P_STRING, + "ProfileInfluxDBPass": S_P_STRING, + "ProfileInfluxDBRTPolicy": S_P_STRING, + "ProfileInfluxDBTimeout": S_P_UINT32, + "ProfileInfluxDBUser": S_P_STRING, + "InterconnectOFEDPort": S_P_UINT32, + "InfinibandOFEDPort": S_P_UINT32, + "SysfsInterfaces": S_P_STRING +} + +# 
src/plugins/burst_buffer/common/burst_buffer_common.c +burst_buffer_options = { + "AllowUsers": S_P_STRING, + "CreateBuffer": S_P_STRING, + "DefaultPool": S_P_STRING, + "DenyUsers": S_P_STRING, + "DestroyBuffer": S_P_STRING, + "Directive": S_P_STRING, + "Flags": S_P_STRING, + "GetSysState": S_P_STRING, + "GetSysStatus": S_P_STRING, + "Granularity": S_P_STRING, + "OtherTimeout": S_P_UINT32, + "PollInterval": S_P_UINT32, + "Pools": S_P_STRING, + "StageInTimeout": S_P_UINT32, + "StageOutTimeout": S_P_UINT32, + "StartStageIn": S_P_STRING, + "StartStageOut": S_P_STRING, + "StopStageIn": S_P_STRING, + "StopStageOut": S_P_STRING, + "ValidateTimeout": S_P_UINT32 +} + +# src/plugins/node_features/helpers/node_features_helpers.c +helpers_options = { + "AllowUserBoot": S_P_STRING, + "BootTime": S_P_UINT32, + "ExecTime": S_P_UINT32, + "Feature": S_P_ARRAY, + "MutuallyExclusive": S_P_LIST, + "NodeName": S_P_ARRAY +} + +helpers_nodename_options = { + "AllowUserBoot": S_P_STRING, + "BootTime": S_P_UINT32, + "ExecTime": S_P_UINT32, + "Feature": S_P_CSV, + "MutuallyExclusive": S_P_LIST +} + +helpers_feature_options = { + "Feature": S_P_CSV, + "Helper": S_P_STRING, + "Flags": S_P_STRING +} + +# src/plugins/namespace/tmpfs/read_jcconf.c +job_container_options = { + "AutoBasePath": S_P_BOOLEAN, + "InitScript": S_P_STRING, + "BasePath": S_P_ARRAY, + "EntireStepInNS": S_P_BOOLEAN, + "NodeName": S_P_ARRAY, + "Shared": S_P_BOOLEAN, + "CloneNSScript": S_P_STRING, + "CloneNSEpilog": S_P_STRING, + "CloneNSScript_Wait": S_P_UINT32, + "CloneNSEpilog_Wait": S_P_UINT32 +} + +job_container_nodename_options = { + "AutoBasePath": S_P_BOOLEAN, + "BasePath": S_P_STRING, + "Dirs": S_P_STRING, + "EntireStepInNS": S_P_BOOLEAN, + "NodeName": S_P_STRING, + "Shared": S_P_BOOLEAN, + "CloneNSScript": S_P_STRING, + "CloneNSEpilog": S_P_STRING, + "CloneNSScript_Wait": S_P_UINT32, + "CloneNSEpilog_Wait": S_P_UINT32 +} + +job_container_basename_options = { + "BasePath": S_P_STRING, + "Dirs": S_P_STRING +} + +# 
src/plugins/topology/tree/switch_record.c +topology_options = { + "SwitchName": S_P_ARRAY, + "LinkSpeed": S_P_UINT32, + "Nodes": S_P_STRING, + "Switches": S_P_STRING, + "BlockName": S_P_ARRAY, + "BlockSizes": S_P_STRING +} + +topology_switchname_options = { + "SwitchName": S_P_STRING, + "LinkSpeed": S_P_UINT32, + "Nodes": S_P_STRING, + "Switches": S_P_STRING +} + +topology_blockname_options = { + "BlockName": S_P_STRING, + "BlockSizes": S_P_STRING, + "Nodes": S_P_STRING +} + all_confs = { "slurm": slurm_options, "slurmdbd": slurmdbd_options, "cgroup": cgroup_options, "mpi": mpi_options, + "oci": oci_options, + "acct_gather": acct_gather_options, + "burst_buffer": burst_buffer_options, + "helpers": helpers_options, + "job_container": job_container_options, + "topology": topology_options, "gres": gres_options, # TOD: GRES can have different combinations, NodeName and Name # https://slurm.schedmd.com/gres.conf.html#SECTION_EXAMPLES - "slurm->PartitionName": partition_options, - "slurm->NodeName": nodename_options, - "slurm->DownNodes": downnodes_options, - "slurm->NodeSet": nodeset_options, + "slurm->PartitionName": slurm_partitionname_options, + "slurm->NodeName": slurm_nodename_options, + "slurm->DownNodes": slurm_downnodes_options, + "slurm->NodeSet": slurm_nodeset_options, "gres->Name": gres_name_options, - "gres->NodeName": gres_nodename_options + "gres->NodeName": gres_nodename_options, + "job_container->NodeName": job_container_nodename_options, + "job_container->BaseName": job_container_basename_options, + "topology->SwitchName": topology_switchname_options, + "topology->BlockName": topology_blockname_options, + "helpers->NodeName": helpers_nodename_options, + "helpers->Feature": helpers_feature_options } _HOSTLIST_RE = re.compile( From 0db953b7b0af24c617e4fdca53c069060913376a Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Mon, 9 Feb 2026 10:00:43 +0530 Subject: [PATCH 073/172] update sfm and ome details --- utils/external_kafka_connect_details.yml | 7 
+++--- utils/external_victoria_connect_details.yml | 7 +++--- .../tasks/main.yml | 2 ++ .../tasks/main.yml | 25 ++++++++++++++----- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/utils/external_kafka_connect_details.yml b/utils/external_kafka_connect_details.yml index a51a75aa3f..1f2093e54e 100644 --- a/utils/external_kafka_connect_details.yml +++ b/utils/external_kafka_connect_details.yml @@ -21,10 +21,11 @@ - name: Fail if service_kube_control_plane group is missing or empty ansible.builtin.fail: msg: >- - Inventory must define a non-empty 'service_kube_control_plane' group. - Run with '-i ' and ensure at least one host is in that group. + Inventory must define a 'service_kube_control_plane' group with exactly one host. + Provide either the service kube control plane VIP or one of the service kube control plane node IPs. + Run with '-i ' and ensure exactly one host is in that group. when: - - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) == 0 + - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) != 1 - name: Fetch external Kafka connection details hosts: service_kube_control_plane diff --git a/utils/external_victoria_connect_details.yml b/utils/external_victoria_connect_details.yml index ad4ed542df..f955bbbc78 100644 --- a/utils/external_victoria_connect_details.yml +++ b/utils/external_victoria_connect_details.yml @@ -21,10 +21,11 @@ - name: Fail if service_kube_control_plane group is missing or empty ansible.builtin.fail: msg: >- - Inventory must define a non-empty 'service_kube_control_plane' group. - Run with '-i ' and ensure at least one host is in that group. + Inventory must define a 'service_kube_control_plane' group with exactly one host. + Provide either the service kube control plane VIP or one of the service kube control plane node IPs. + Run with '-i ' and ensure exactly one host is in that group. 
when: - - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) == 0 + - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) != 1 - name: Fetch external Victoria connection details hosts: service_kube_control_plane diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml index 0c4d525a82..8964652ce1 100644 --- a/utils/roles/external_kafka_connect_details/tasks/main.yml +++ b/utils/roles/external_kafka_connect_details/tasks/main.yml @@ -183,8 +183,10 @@ 'OME note (mTLS):', ' Use ca.crt as the server certificate in OME.', ' Create a client certificate in .pfx format (provide a passphrase when prompted):', + ' cd ' ~ kafka_output_dir, ' openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt', ' Use user.pfx as the client certificate in OME.', + ' If you are using the OME UI from a different system than the OIM host, copy ca.crt and user.pfx from the OIM host to that system before selecting/uploading them in the UI.', '' ] }} diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index c44c145921..63c5301db2 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -182,6 +182,15 @@ else '' }} +- name: Build SFM hosts entry for vmselect + ansible.builtin.set_fact: + victoria_sfm_hosts_entry_vmselect: >- + {{ + 'echo "' ~ (vmselect_lb_ip.stdout | trim) ~ ' vmselect.' 
~ victoria_namespace ~ '.svc.cluster.local" >> /etc/hosts' + if (vmselect_lb_ip.stdout | trim | length) > 0 + else '' + }} + - name: Set Victoria external port fallbacks ansible.builtin.set_fact: vminsert_port: "8480" @@ -211,8 +220,9 @@ server_crt: "{{ victoria_tls_cert }}" notes: sfm: - vminsert_write_url: "https://{{ vminsert_host }}:{{ vminsert_port }}/insert/0/prometheus/api/v1/write" + vminsert_write_url: "https://vminsert.{{ victoria_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write" hosts_entry: "{{ victoria_sfm_hosts_entry }}" + hosts_entry_vmselect: "{{ victoria_sfm_hosts_entry_vmselect }}" - name: Ensure output directory exists ansible.builtin.file: @@ -242,17 +252,20 @@ 'Mode: ' ~ victoria_deployment_mode, '', 'Endpoints:', - ' vminsert write: https://' ~ vminsert_host ~ ':' ~ vminsert_port ~ '/insert/0/prometheus/api/v1/write', - ' vmselect query: https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/prometheus/api/v1/query', - ' vmselect UI: https://' ~ vmselect_host ~ ':' ~ vmselect_port ~ '/select/0/vmui', + ' vminsert write: https://vminsert.' ~ victoria_namespace ~ '.svc.cluster.local:8480/insert/0/prometheus/api/v1/write', + ' vmselect query: https://vmselect.' ~ victoria_namespace ~ '.svc.cluster.local:8481/select/0/prometheus/api/v1/query', + ' vmselect UI: https://vmselect.' ~ victoria_namespace ~ '.svc.cluster.local:8481/select/0/vmui', '', 'TLS:', ' server.crt: ' ~ victoria_tls_cert, '', 'SFM note:', - ' Use vminsert write URL for SFM: https://' ~ vminsert_host ~ ':' ~ vminsert_port ~ '/insert/0/prometheus/api/v1/write', + ' Use vminsert write URL for SFM: https://vminsert.' 
~ victoria_namespace ~ '.svc.cluster.local:8480/insert/0/prometheus/api/v1/write', + ' Add this entry to /etc/hosts on the SFM server:', + ' ' ~ (victoria_sfm_hosts_entry if (victoria_sfm_hosts_entry | length) > 0 else 'vminsert LoadBalancer IP not available; cannot generate /etc/hosts entry.'), ' Add this entry to /etc/hosts on the SFM server:', - ' ' ~ (victoria_sfm_hosts_entry if (victoria_sfm_hosts_entry | length) > 0 else 'LoadBalancer IP not available; cannot generate /etc/hosts entry.') + ' ' ~ (victoria_sfm_hosts_entry_vmselect if (victoria_sfm_hosts_entry_vmselect | length) > 0 else 'vmselect LoadBalancer IP not available; cannot generate /etc/hosts entry.'), + ' If you are using the SFM UI from a different system than the OIM host, copy server.crt from the OIM host to that system before selecting/uploading it in the UI.' ] }} delegate_to: localhost From f8bd103bd322b7da99b622fdd353e92a83958501 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Mon, 9 Feb 2026 10:08:42 +0530 Subject: [PATCH 074/172] update kafka and victoria lint issues --- .../tasks/main.yml | 3 +- .../tasks/main.yml | 45 ++++++++++++++----- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml index 8964652ce1..207d93bfe6 100644 --- a/utils/roles/external_kafka_connect_details/tasks/main.yml +++ b/utils/roles/external_kafka_connect_details/tasks/main.yml @@ -186,7 +186,8 @@ ' cd ' ~ kafka_output_dir, ' openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt', ' Use user.pfx as the client certificate in OME.', - ' If you are using the OME UI from a different system than the OIM host, copy ca.crt and user.pfx from the OIM host to that system before selecting/uploading them in the UI.', + ' If you are using the OME UI from a different system than the OIM host,', + ' copy ca.crt and user.pfx from the OIM host to that system before selecting/uploading them 
in the UI.', '' ] }} diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 63c5301db2..38b0ce3045 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -51,7 +51,7 @@ when: victoria_deployment_mode != 'cluster' - name: Get Victoria pods status - ansible.builtin.shell: >- + ansible.builtin.command: >- kubectl get pods -n {{ victoria_namespace }} -l "app in (vminsert,vmselect,vmstorage,victoriametrics)" -o wide @@ -60,7 +60,7 @@ failed_when: victoria_pods_wide.rc != 0 - name: Get Victoria pods status (json) - ansible.builtin.shell: >- + ansible.builtin.command: >- kubectl get pods -n {{ victoria_namespace }} -l "app in (vminsert,vmselect,vmstorage,victoriametrics)" -o json @@ -191,6 +191,27 @@ else '' }} +- name: Set endpoint urls and SFM note strings + ansible.builtin.set_fact: + victoria_vminsert_write_url: >- + https://vminsert.{{ victoria_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write + victoria_vmselect_query_url: >- + https://vmselect.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/prometheus/api/v1/query + victoria_vmselect_ui_url: >- + https://vmselect.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/vmui + victoria_sfm_hosts_entry_vminsert_display: >- + {{ + victoria_sfm_hosts_entry + if (victoria_sfm_hosts_entry | length) > 0 + else 'vminsert LoadBalancer IP not available; cannot generate /etc/hosts entry.' + }} + victoria_sfm_hosts_entry_vmselect_display: >- + {{ + victoria_sfm_hosts_entry_vmselect + if (victoria_sfm_hosts_entry_vmselect | length) > 0 + else 'vmselect LoadBalancer IP not available; cannot generate /etc/hosts entry.' 
+ }} + - name: Set Victoria external port fallbacks ansible.builtin.set_fact: vminsert_port: "8480" @@ -220,7 +241,7 @@ server_crt: "{{ victoria_tls_cert }}" notes: sfm: - vminsert_write_url: "https://vminsert.{{ victoria_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write" + vminsert_write_url: "{{ victoria_vminsert_write_url }}" hosts_entry: "{{ victoria_sfm_hosts_entry }}" hosts_entry_vmselect: "{{ victoria_sfm_hosts_entry_vmselect }}" @@ -252,20 +273,20 @@ 'Mode: ' ~ victoria_deployment_mode, '', 'Endpoints:', - ' vminsert write: https://vminsert.' ~ victoria_namespace ~ '.svc.cluster.local:8480/insert/0/prometheus/api/v1/write', - ' vmselect query: https://vmselect.' ~ victoria_namespace ~ '.svc.cluster.local:8481/select/0/prometheus/api/v1/query', - ' vmselect UI: https://vmselect.' ~ victoria_namespace ~ '.svc.cluster.local:8481/select/0/vmui', + ' vminsert write: ' ~ victoria_vminsert_write_url, + ' vmselect query: ' ~ victoria_vmselect_query_url, + ' vmselect UI: ' ~ victoria_vmselect_ui_url, '', 'TLS:', ' server.crt: ' ~ victoria_tls_cert, '', 'SFM note:', - ' Use vminsert write URL for SFM: https://vminsert.' ~ victoria_namespace ~ '.svc.cluster.local:8480/insert/0/prometheus/api/v1/write', - ' Add this entry to /etc/hosts on the SFM server:', - ' ' ~ (victoria_sfm_hosts_entry if (victoria_sfm_hosts_entry | length) > 0 else 'vminsert LoadBalancer IP not available; cannot generate /etc/hosts entry.'), - ' Add this entry to /etc/hosts on the SFM server:', - ' ' ~ (victoria_sfm_hosts_entry_vmselect if (victoria_sfm_hosts_entry_vmselect | length) > 0 else 'vmselect LoadBalancer IP not available; cannot generate /etc/hosts entry.'), - ' If you are using the SFM UI from a different system than the OIM host, copy server.crt from the OIM host to that system before selecting/uploading it in the UI.' 
+ ' Use vminsert write URL for SFM: ' ~ victoria_vminsert_write_url, + ' Add these entries to /etc/hosts on the SFM server:', + ' ' ~ victoria_sfm_hosts_entry_vminsert_display, + ' ' ~ victoria_sfm_hosts_entry_vmselect_display, + ' If you are using the SFM UI from a different system than the OIM host,', + ' copy server.crt from the OIM host to that system before selecting/uploading it in the UI.' ] }} delegate_to: localhost From 2e7b3cae1a7554b5a71546117dba1bf1ebe1b2f9 Mon Sep 17 00:00:00 2001 From: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> Date: Mon, 9 Feb 2026 10:30:44 +0530 Subject: [PATCH 075/172] Update copyright for container_repo_utils.py Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> --- common/library/module_utils/local_repo/container_repo_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py index 3b8eb29662..0a4abb35fb 100644 --- a/common/library/module_utils/local_repo/container_repo_utils.py +++ b/common/library/module_utils/local_repo/container_repo_utils.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From aae7e28ebde2e207e93c97852a7abfd19aebe215 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Mon, 9 Feb 2026 10:48:31 +0530 Subject: [PATCH 076/172] Create test.sh --- test.sh | 1555 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1555 insertions(+) create mode 100644 test.sh diff --git a/test.sh b/test.sh new file mode 100644 index 0000000000..cd1f8e63e7 --- /dev/null +++ b/test.sh @@ -0,0 +1,1555 @@ +#!/bin/bash + +# Copyright © 2025 Dell Inc. 
or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# This script is used to generate the Omnia core docker image. +# The image is based on Fedora and uses systemd to start all of the necessary +# services. +# +# This script prompts the user for the Omnia shared path and the root +# password. It then checks if the Omnia shared path exists. +# +# The script checks if the ssh key file exists. If it does not exist, a new ssh + +# Color Definitions +RED='\033[0;31m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color +YELLOW='\033[0;33m' +omnia_release=2.1.0.0 + +core_container_status=false +omnia_path="" +hashed_passwd="" +domain_name="" + +is_local_ip() { + local ip_to_check="$1" + + # Get all local IP addresses (excluding loopback) + local local_ips + local_ips=$(hostname -I) + + # Check if the IP matches any local IP + if echo "$local_ips" | grep -qw "$ip_to_check"; then + return 0 # IP is local + else + return 1 # IP is not local + fi +} + +OMNIA_BASE_DIR="/opt/omnia" +OMNIA_INPUT_DIR="/opt/omnia/input" +OMNIA_BACKUPS_DIR="/opt/omnia/backups" +OMNIA_METADATA_DIR="/opt/omnia/.data" +OMNIA_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml" + +update_metadata_upgrade_backup_dir() { + local backup_dir="$1" + + if ! 
podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running" + return 1 + fi + + podman exec -u root omnia_core bash -c " + set -e + if [ ! -f '$OMNIA_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^upgrade_backup_dir:' '$OMNIA_METADATA_FILE'; then + sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$OMNIA_METADATA_FILE' + else + echo 'upgrade_backup_dir: ${backup_dir}' >> '$OMNIA_METADATA_FILE' + fi + " +} + + + +check_internal_nfs_export() { + nfs_server_ip=$1 + nfs_server_share_path=$2 + + if is_local_ip "$nfs_server_ip"; then + echo "The provided NFS server IP ($nfs_server_ip) belongs to the current system." + else + echo "The provided NFS server IP ($nfs_server_ip) is NOT the current system's IP." + exit 1 + fi + + # Query the remote server for exports + exports=$(showmount -e "$nfs_server_ip" 2>/dev/null) + + if [[ $? -ne 0 ]]; then + echo -e "${RED}ERROR: Unable to contact NFS server at $nfs_server_ip. Ensure NFS and rpcbind are running, and firewall allows access.${NC}" + exit 1 + fi + + # Check if path is in the export list + if echo "$exports" | awk '{print $1}' | grep -Fxq "$nfs_server_share_path"; then + echo -e "${GREEN}Path $nfs_server_share_path is exported by $nfs_server_ip.${NC}" + else + echo -e "${RED}ERROR: Path $nfs_server_share_path is NOT exported by $nfs_server_ip.${NC}" + exit 1 + fi +} + +display_supported_use_cases() { + # Color definitions + BLUE='\033[1;34m' + YELLOW='\033[1;33m' + GREEN='\033[1;32m' + NC='\033[0m' # No Color + + # Introductory Guidance + echo -e "${BLUE} ----------------- Omnia Shared Path Configuration ---------------- ${NC}" + echo -e "${BLUE} Please choose the type of Omnia shared path in Omnia Infrastructure Manager (OIM): ${NC}" + echo -e "${BLUE} It is recommended to use a external NFS share for the Omnia shared path. 
${NC}" + echo -e "${BLUE} If you are not using NFS, make sure enough space is available on the disk. ${NC}" + echo -e "${YELLOW} Using a Extrenal NFS share is mandatory for Omnia shared path if you are planning to have high availability in OIM or require K8s service cluster. ${NC}" + echo -e "\nSupported Use Cases:\n" + + # Table content + { + echo -e "Share Option\tType\tDescription\tAdditional Info" + echo -e "${GREEN}NFS\tExternal\tExternal NFS server(outside OIM) created by user\tMust be reachable from OIM and service nodes. Mounts on OIM. Recommended for HA and hierarchical clusters.${NC}" + echo -e "NFS\tInternal\tNFS server created by user in OIM\tUsed only for flat provisioning. No HA or k8s service cluster support. No mount performed." + echo -e "Local\tDisk\tDisk storage in OIM\tUsed only for flat provisioning. No HA or hierarchical support." + } | column -t -s $'\t' +} + + +# This function is responsible for initializing the Omnia core container +# It prompts the user for the Omnia shared path and the root password. +# It checks if the Omnia shared path exists. +setup_omnia_core() { + # Validate the system environment + validate_oim + + # Initialize the container configuration + init_container_config + + # Setup the container + setup_container + + # Post container setup configuration + post_setup_config + + # Start the container + start_container_session +} + + +# This function is responsible for cleaning up the Omnia core container. +# It removes the container and performs the necessary cleanup steps. +cleanup_omnia_core() { + # Block if critical service containers exist + critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + if [ -n "$critical_running" ]; then + echo -e "${RED}Failed to intiatiate omnia_core container cleanup. 
There are other critical service containers still running:${NC}" + echo "$critical_running" + echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}" + exit 1 + fi + + echo -e "${RED} WARNING: This will remove Omnia core container and all files in Omnia Shared Path.${NC}" + echo -e "${GREEN} You can abort and take backup if you want.${NC}" + read -p " Are you sure you want to continue with the cleanup? (y/n): " confirm + if [ "$confirm" = "n" ] || [ "$confirm" = "N" ]; then + echo -e "${GREEN}Aborting.${NC}" + exit 0 + elif [ "$confirm" = "y" ] || [ "$confirm" = "Y" ]; then + + # Fetch the configuration from the Omnia core container. + fetch_config + + # Remove the container + remove_container + + # Perform the necessary cleanup steps + cleanup_config + fi +} + + +# This function is responsible for cleaning up the Omnia core container configuration. +# It removes the public key from the authorized_keys file. +# It removes the private key. +# It removes the ssh key from the known_hosts file. +# It removes the Omnia core configuration. +# +cleanup_config(){ + + # Set the path to the ssh public key. + ssh_key_file="$HOME/.ssh/oim_rsa.pub" + + # Remove the public key from the authorized_keys file. + if [ -f "$ssh_key_file" ]; then + # Remove the line from the authorized_keys file. + sed -i "\|^$(cat $ssh_key_file)$|d" $HOME/.ssh/authorized_keys + echo -e "${GREEN} Public key has been removed from authorized_keys.${NC}" + else + echo -e "${RED} Public key file not found.${NC}" + fi + + # Remove the SSH key pair. + ssh_key_file="$HOME/.ssh/oim_rsa" + ssh_key_file_pub="${ssh_key_file}.pub" + if [ -f "$ssh_key_file" ] && [ -f "$ssh_key_file_pub" ]; then + rm -f "$ssh_key_file" "$ssh_key_file_pub" + echo -e "${GREEN} SSH key pair have been removed.${NC}" + else + echo -e "${RED} SSH key file not found.${NC}" + fi + + # Remove the ssh key from the known_hosts file. 
+ echo -e "${BLUE} Removing ssh key from known_hosts file.${NC}" + ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1 + + + # Remove the host entry from the config file in .ssh folder. + ssh_config_file="$HOME/.ssh/config" + if [ -f "$ssh_config_file" ]; then + sed -i '/Host omnia_core/,+5d' "$ssh_config_file" + echo -e "${GREEN} Host entry has been removed from config file.${NC}" + else + echo -e "${RED} Config file not found.${NC}" + fi + + # Remove the Omnia core configuration. + echo -e "${BLUE} Removing Omnia core configuration.${NC}" + rm -rf $omnia_path/omnia/{hosts,input,log,pulp,provision,pcs,ssh_config,tmp,.data} + + # Unmount the NFS shared path if the share option is NFS. + if [ "$share_option" = "NFS" ] && [ "$nfs_type" = "external" ]; then + umount "$omnia_path" + if [ $? -eq 0 ]; then + echo -e "${GREEN} NFS shared path has been unmounted.${NC}" + else + echo -e "${RED} Failed to unmount NFS shared path.${NC}" + fi + # Remove the entry from /etc/fstab + fstab_file="/etc/fstab" + if [ -f "$fstab_file" ]; then + # Create a backup of the fstab file. + cp "$fstab_file" "$fstab_file.bak" + + # Remove the line from the fstab file. + sed -i "\#$omnia_path#d" "$fstab_file" + if [ $? -ne 0 ]; then + echo -e "${RED} Failed to remove the entry from /etc/fstab.${NC}" + fi + fi + fi + + echo -e "${GREEN} Omnia core configuration has been cleaned up.${NC}" +} + +# This function is responsible for removing the Omnia core container. +# +# It removes the container using the 'podman rm -f' command. +# If the container is removed successfully, it prints a success message. +# Otherwise, it prints an error message. +remove_container() { + # Block if critical service containers exist + critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + if [ -n "$critical_running" ]; then + echo -e "${RED}Failed to intiatiate omnia_core container cleanup. 
There are other critical service containers still running:${NC}" + echo "$critical_running" + echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}" + exit 1 + fi + + # Remove the container. + echo -e "${BLUE} Removing the Omnia core container.${NC}" + if systemctl stop omnia_core.service; then + echo -e "${GREEN} Omnia core container has been removed.${NC}" + # Remove the systemd generator symlinks. + echo -e "${GREEN} Cleaning up systemd generator symlinks.${NC}" + rm -f /run/systemd/generator/omnia_core.service + rm -f /run/systemd/generator/multi-user.target.wants/omnia_core.service + rm -f /run/systemd/generator/default.target.wants/omnia_core.service + + echo -e "${GREEN} Cleaning up omnia_core.container.${NC}" + rm -f /etc/containers/systemd/omnia_core.container + + # Remove the omnia_core.service file. + rm -f /etc/systemd/system/omnia_core.service + systemctl daemon-reload + systemctl reset-failed omnia_core.service + # check if service is removed + if systemctl status omnia_core.service >/dev/null 2>&1; then + echo -e "${RED} Failed to remove Omnia core service.${NC}" + else + echo -e "${GREEN} Omnia core service has been removed.${NC}" + fi + else + echo -e "${RED} Failed to remove Omnia core container.${NC}" + fi + + # Remove the container image. + # if podman rmi omnia_core; then + # echo -e "${GREEN} Omnia core image has been removed.${NC}" + # else + # echo -e "${RED} Failed to remove Omnia core image.${NC}" + # fi +} + + +# This function is responsible for initializing the Omnia core container. +# +# It prompts the user for the Omnia shared path and the root +# password. It then checks if the Omnia shared path exists. +# +# The function generates the ssh key pair and copies the private +# key to the Omnia shared path. +# +# The function also copies the ssh public key to the +# authorized_keys file. +# +# The function creates the necessary log directories. 
+init_container_config() {
+    # Interactive first-time configuration of the Omnia core container.
+    # Sets the globals share_option, omnia_path, nfs_type, nfs_server_ip,
+    # nfs_server_share_path, passwd, hashed_passwd, ssh_key_file, ssh_port and
+    # ssh_public_key. Exits with status 1 on any invalid answer or failed step.
+    local fstab_entry
+
+    share_option=""
+    # Display the supported use cases
+    display_supported_use_cases
+
+    # Display the choices for the user
+    echo -e "${BLUE} Choose the type of Omnia shared path:${NC}"
+    options=( "NFS (recommended)" "Local" )
+
+    PS3="Select the option number: "
+
+    select opt in "${options[@]}"; do
+        case $opt in
+            "NFS (recommended)")
+                share_option="NFS"
+                break
+                ;;
+            "Local")
+                share_option="Local"
+                break
+                ;;
+            *)
+                echo -e "${RED} Invalid option.${NC}"
+                continue
+        esac
+    done
+
+    case $share_option in
+        "Local")
+            # Prompt the user for the Omnia shared path.
+            echo -e "${BLUE} Please provide Omnia shared path:${NC}"
+            read -p "Omnia shared path: " omnia_path
+
+            # Check if the Omnia shared path is absolute path and path exists.
+            if [[ "$omnia_path" != /* ]] || [ ! -d "$omnia_path" ]; then
+                echo -e "${RED} Omnia shared path is not an absolute path or does not exist! Please re-run omnia.sh --install with valid Omnia shared path.${NC}"
+                exit 1
+            fi
+            ;;
+        "NFS")
+            echo -e "${BLUE} Select NFS type:${NC}"
+            select nfs_type in "External (Recommended)" "Internal"; do
+                case $nfs_type in
+                    "External (Recommended)")
+                        echo -e "${BLUE} Please provide the external NFS server IP:${NC}"
+                        read -p "External NFS server IP: " nfs_server_ip
+
+                        echo -e "${BLUE} Please provide the external NFS server share path:${NC}"
+                        read -p "External NFS share path: " nfs_server_share_path
+
+                        echo -e "${BLUE} Please provide the OIM client share path (mount target):${NC}"
+                        read -p "Omnia shared path: " omnia_path
+
+                        # Validate Omnia shared path is absolute
+                        if [[ "$omnia_path" != /* ]]; then
+                            echo -e "${RED}Omnia shared path must be an absolute path.${NC}"
+                            exit 1
+                        fi
+
+                        # The select variable is reused to hold the normalized type.
+                        nfs_type="external"
+                        break
+                        ;;
+                    "Internal")
+                        echo -e "${BLUE} Please provide the OIM server IP:${NC}"
+                        read -p "OIM server IP: " nfs_server_ip
+
+                        echo -e "${BLUE} Please provide the OIM server share path:${NC}"
+                        read -p "OIM server share path: " nfs_server_share_path
+                        echo -e "${BLUE} Checking if the OIM server share path is mounted${NC}"
+                        check_internal_nfs_export "$nfs_server_ip" "$nfs_server_share_path"
+
+                        # Note: No mounting performed here
+                        echo -e "${YELLOW}Note: Internal NFS does not support HA OIM or hierarchical cluster. Proceeding...${NC}"
+                        nfs_type="internal"
+                        omnia_path="$nfs_server_share_path"
+                        break
+                        ;;
+                    *)
+                        echo -e "${RED}Invalid option. Please choose 1 or 2.${NC}"
+                        ;;
+                esac
+            done
+            ;;
+    esac
+
+
+    # Prompt the user for the Omnia core root password.
+    echo -e "${BLUE} Please provide Omnia core root password for accessing container:${NC}"
+
+    # -r keeps backslashes intact so the character check below actually sees them.
+    read -r -p " Enter: " -s passwd
+
+    # Prompt the user for the Omnia core root password confirmation.
+    echo -e "\n${BLUE} Please confirm password:${NC}"
+    read -r -s -p " Enter: " cnf_passwd
+
+    # Check if the provided passwords match.
+    if [ "$passwd" != "$cnf_passwd" ]; then
+        echo -e "${RED} Invalid Omnia core root password, passwords do not match!${NC}"
+        exit 1
+    fi
+
+    # Check if the password contains any of the invalid characters.
+    # Inside a POSIX bracket expression a backslash is literal, so "]" cannot be
+    # escaped with "\]": it has to be the first member of the set. The previous
+    # pattern ended the set early and then required a literal "]" after it, so
+    # it only matched passwords containing a forbidden character followed by "]".
+    invalid_chars='[]\\|&;`"><*?!$(){}[]'
+    if [[ "$passwd" =~ $invalid_chars ]]; then
+        # \$ stops "$()" from being expanded as an empty command substitution.
+        echo -e "${RED} Invalid password, passwords must not contain any of these special characters: [\\|&;\`\"><*?!\$(){}[\]]${NC}"
+        exit 1
+    fi
+
+    # Install NFS client package if option NFS is selected
+    if [[ "$share_option" == "NFS" ]]; then
+        # Install NFS client package
+        echo -e "${BLUE} Installing NFS client package.${NC}"
+        dnf install -y nfs-utils nfs4-acl-tools
+
+        # Create omnia_path directory if it does not exist
+        echo -e "${BLUE} Creating omnia shared path directory if it does not exist.${NC}"
+        mkdir -p "$omnia_path"
+
+        # Mount NFS server share path in Omnia share path
+        if [[ "$nfs_type" == "external" ]]; then
+
+            if is_local_ip "$nfs_server_ip"; then
+                echo -e "${RED} Error: NFS server $nfs_server_ip is a local IP.${NC}"
+                echo -e "${RED} Please provide an external NFS server IP or re-run omnia.sh --install with valid options.${NC}"
+                exit 1
+            fi
+
+            # Validate if NFS server is reachable
+            echo -e "${BLUE} Validating if NFS server is reachable.${NC}"
+            if ! ping -c1 -W1 "$nfs_server_ip" > /dev/null; then
+                echo -e "${RED} NFS server $nfs_server_ip is not reachable.${NC}"
+                exit 1
+            fi
+
+            echo -e "${BLUE} Mounting NFS server share path in Omnia share path.${NC}"
+            if ! mount -t nfs -o nosuid,rw,sync,hard,intr,timeo=30 "$nfs_server_ip:$nfs_server_share_path" "$omnia_path"; then
+                echo -e "${RED} Failed to mount NFS. Please check the IP and path.${NC}"
+                exit 1
+            fi
+            # Validate if NFS server share path is mounted
+            echo -e "${BLUE} Validating if NFS server share path is mounted.${NC}"
+            # strip the trailing slash from nfs_server_share_path
+            nfs_server_share_path="${nfs_server_share_path%/}"
+            # -F: the IP and path are literal text, not a regular expression.
+            if grep -qsF -- "$nfs_server_ip:$nfs_server_share_path" /proc/mounts; then
+                echo -e "${GREEN} NFS server share path is mounted.${NC}"
+            else
+                echo -e "${RED} NFS server share path is not mounted. Provide valid NFS server details. ${NC}"
+                exit 1
+            fi
+            # Add NFS server share to /etc/fstab to mount on startup.
+            # Skip the append when the exact entry is already present so that
+            # repeated installs do not accumulate duplicate fstab lines.
+            fstab_entry="$nfs_server_ip:$nfs_server_share_path $omnia_path nfs nosuid,rw,sync,hard,intr"
+            if ! grep -qsxF -- "$fstab_entry" /etc/fstab; then
+                echo "$fstab_entry" >> /etc/fstab
+            fi
+        else
+            echo -e "${BLUE} Using internal NFS path without mounting.${NC}"
+        fi
+
+    fi
+
+    # Feed the password on stdin: it is not word-split or glob-expanded, cannot
+    # be mistaken for an option, and does not appear in the process list.
+    hashed_passwd=$(printf '%s\n' "$passwd" | openssl passwd -1 -stdin)
+    ssh_key_file="/root/.ssh/oim_rsa"
+    ssh_port=2222
+
+    # Generate a new ssh key pair.
+    if [ -f "$ssh_key_file" ]; then
+        echo -e "\n${BLUE} Skipping generating new ssh key pair.${NC}"
+    else
+        echo -e "\n${GREEN} Generating a new ssh key pair.${NC}"
+        ssh-keygen -t rsa -b 4096 -C "omnia_oim" -q -N '' -f /root/.ssh/oim_rsa
+        {
+            echo "Host omnia_core"
+            echo " Hostname localhost"
+            echo " Port $ssh_port"
+            echo " User root"
+            echo " IdentityFile ~/.ssh/oim_rsa"
+            echo " IdentitiesOnly yes"
+        } >> "$HOME/.ssh/config"
+    fi
+
+    # Create the ssh configuration directory if it does not exist.
+    echo -e "${GREEN} Creating the ssh configuration directory if it does not exist.${NC}"
+    mkdir -p "$omnia_path/omnia/ssh_config/.ssh"
+
+    # Copy the omnia_core ssh config to the shared path.
+    echo -e "${GREEN} Copying the omnia_core ssh config to the omnia shared path.${NC}"
+    cp "$HOME/.ssh/config" "$omnia_path/omnia/ssh_config/.ssh/config"
+
+    # Copy the oim_rsa ssh key to the shared path.
+    echo -e "${GREEN} Copying the oim_rsa ssh key to the omnia shared path.${NC}"
+    cp "$HOME/.ssh/oim_rsa" "$omnia_path/omnia/ssh_config/.ssh/oim_rsa"
+
+    # Copy the ssh private key to the omnia shared path.
+    echo -e "${GREEN} Copying the ssh private key to the omnia shared path.${NC}"
+    cp "$ssh_key_file" "$omnia_path/omnia/ssh_config/.ssh/id_rsa"
+
+    # Copy the ssh public key to the omnia shared path.
+    echo -e "${GREEN} Copying the ssh public key to the omnia shared path.${NC}"
+    cp "$ssh_key_file.pub" "$omnia_path/omnia/ssh_config/.ssh/id_rsa.pub"
+
+    # Get the ssh public key.
+    ssh_public_key="$(cat /root/.ssh/oim_rsa.pub)"
+
+    validate_nfs_server
+
+    # Add ssh public key to the authorized_keys.
+    # -F matches the key as fixed text; -s keeps grep quiet when the file does
+    # not exist yet (the else branch then creates it).
+    echo -e "${GREEN} Adding ssh public key to the authorized_keys.${NC}"
+    if grep -qsF -- "$ssh_public_key" "$HOME/.ssh/authorized_keys"; then
+        echo -e "${BLUE} Skipping adding ssh public key to the authorized_keys.${NC}"
+    else
+        echo "$ssh_public_key" >> "$HOME/.ssh/authorized_keys"
+        chmod 600 "$HOME/.ssh/authorized_keys"
+    fi
+
+    # Add ssh public key to the authorized_keys in the ssh_config directory.
+    echo -e "${GREEN} Adding ssh public key to the authorized_keys in the Omnia ssh_config directory.${NC}"
+    if [ -f "$omnia_path/omnia/ssh_config/.ssh/authorized_keys" ] && grep -qF -- "$ssh_public_key" "$omnia_path/omnia/ssh_config/.ssh/authorized_keys"; then
+        echo -e "${BLUE} Skipping adding ssh public key to the authorized_keys in the Omnia ssh_config directory.${NC}"
+    else
+        echo "$ssh_public_key" >> "$omnia_path/omnia/ssh_config/.ssh/authorized_keys"
+        chmod 600 "$omnia_path/omnia/ssh_config/.ssh/authorized_keys"
+    fi
+
+    # Create the log directory if it does not exist.
+    echo -e "${GREEN} Creating the log directory if it does not exist.${NC}"
+    mkdir -p "$omnia_path/omnia/log/core/container"
+    mkdir -p "$omnia_path/omnia/log/core/playbooks"
+
+    # Create the hosts file for cluster in $omnia_path/omnia/hosts
+    echo -e "${GREEN} Creating the hosts file for cluster.${NC}"
+    touch "$omnia_path/omnia/hosts"
+
+    # Create the pulp_ha directory if it does not exist.
+    echo -e "${GREEN} Creating the pulp HA directory if it does not exist.${NC}"
+    mkdir -p "$omnia_path/omnia/pulp/pulp_ha"
+}
+
+
+# This function is responsible for fetching the configuration from the Omnia core.
+# It uses podman exec to run a command in the Omnia core container.
+# The command retrieves the metadata from the oim_metadata.yml file.
+# The metadata is then parsed and the required configuration is extracted.
+# Sets the globals omnia_path, hashed_passwd, nfs_server_ip,
+# nfs_server_share_path, share_option and nfs_type; exits 1 when omnia_path or
+# hashed_passwd could not be read.
+fetch_config() {
+    local line key value
+
+    # Fetch the metadata from the oim_metadata.yml file.
+    # No -t/-i: the output is captured, not shown on a terminal, and a TTY
+    # would append carriage returns to every line.
+    echo -e "${GREEN} Fetching the metadata from the oim_metadata.yml file.${NC}"
+    core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml')
+
+    # Split the metadata into separate lines.
+    IFS=$'\n' read -r -d '' -a config_lines <<<"$core_config"
+
+    # Loop through the lines and extract the required configuration.
+    for line in "${config_lines[@]}"; do
+        # Only "key: value" lines carry configuration.
+        [[ "$line" == *:* ]] || continue
+
+        # Extract the key and value from the line. The value is everything
+        # after the first colon, so values that contain ':' are not truncated.
+        key="${line%%:*}"
+        value="${line#*:}"
+
+        # Check the key and assign the value to the corresponding variable.
+        case $key in
+            oim_shared_path)
+                # Assign the shared path.
+                omnia_path=$(echo "$value" | tr -d '[:space:]')
+                ;;
+            omnia_core_hashed_passwd)
+                # Assign the hashed password.
+                hashed_passwd=$(echo "$value" | tr -d '[:space:]')
+                ;;
+            nfs_server_ip)
+                # Assign the nfs server ip.
+                nfs_server_ip=$(echo "$value" | tr -d '[:space:]')
+                ;;
+            nfs_server_share_path)
+                # Assign the nfs server share path.
+                nfs_server_share_path=$(echo "$value" | tr -d '[:space:]')
+                ;;
+            omnia_share_option)
+                # Assign the share option.
+                share_option=$(echo "$value" | tr -d '[:space:]')
+                ;;
+            nfs_type)
+                # Assign the NFS type.
+                nfs_type=$(echo "$value" | tr -d '[:space:]')
+                ;;
+        esac
+    done
+    # Check if the required configuration is extracted successfully.
+    if [ -z "$omnia_path" ] || [ -z "$hashed_passwd" ]; then
+        echo -e "${RED} Failed to fetch data from metadata file.${NC}"
+        exit 1
+    else
+        echo -e "${GREEN} Successfully fetched data from metadata file.${NC}"
+    fi
+}
+
+# Validates the OIM (Omnia Infrastructure Manager) by checking if the hostname is
+# configured with a domain name, checking if Podman is installed, enabling and
+# starting the Podman socket.
+# Also sets the globals domain_name and oim_timezone (oim_timezone stays empty
+# when none of the detection methods below yields a value).
+validate_oim() {
+    # Check if the hostname is set
+    hostname_value=$(hostname)
+    if [[ -z "$hostname_value" ]]; then
+        echo -e "${RED}Hostname is not set!${NC}"
+        exit 1
+    fi
+
+    # Check if the hostname is static
+    static_hostname=$(hostnamectl --static)
+    current_hostname=$(hostname)
+    if [[ "$static_hostname" != "$current_hostname" ]]; then
+        echo -e "${RED}Static Hostname is unset. Current: '$current_hostname', Static: '$static_hostname'${NC}"
+        echo -e "${RED}Please set the static hostname and try again.${NC}"
+        echo -e "${BLUE}Command to set hostname: hostnamectl set-hostname ${NC}"
+        echo -e "${RED}Exiting...${NC}"
+        exit 1
+    fi
+
+    # Check if the hostname is configured with a domain name.
+    domain_name=$(hostname -d)
+    if [[ -n "$domain_name" ]]; then
+        echo -e "${BLUE}Hostname is configured with a domain name: $domain_name${NC}"
+    else
+        echo -e "${RED}Invalid hostname, hostname is not configured with a domain name!${NC}"
+        exit 1
+    fi
+
+    # Detect OIM timezone from systemd in a stable, case‑independent way
+    oim_timezone=$(timedatectl show -p Timezone --value 2>/dev/null)
+
+    # Fallbacks if needed (non‑systemd or old timedatectl)
+    if [[ -z "$oim_timezone" ]]; then
+        if [[ -f /etc/timezone ]]; then
+            # Debian/Ubuntu style
+            oim_timezone=$(< /etc/timezone)
+        elif [[ -L /etc/localtime ]]; then
+            # Derive from /etc/localtime symlink
+            oim_timezone=$(readlink -f /etc/localtime | sed -n 's|^.*zoneinfo/||p')
+        fi
+    fi
+
+    # Test the command directly instead of reading $? on a later line, where an
+    # inserted statement could silently change what is being checked.
+    if podman --version; then
+        echo -e "${BLUE} Podman is installed. Version: $(podman --version)${NC}"
+    else
+        echo -e "${RED} Podman is not installed.${NC}"
+        exit 1
+    fi
+
+    # Enable the podman socket to start at boot
+    echo -e "${BLUE} Enabling podman.socket...${NC}"
+    systemctl enable podman.socket
+
+    # Start the podman socket now
+    echo -e "${BLUE} Starting podman.socket...${NC}"
+    systemctl start podman.socket
+
+    # Print a success message after enabling and starting the podman socket
+    echo -e "${GREEN} Podman socket has been enabled and started.${NC}"
+}
+
+# Checks if the required directories for Omnia are present.
+# This function iterates over a list of required directories/files and checks if each one exists.
+check_required_directories() {
+    # Paths (directories and the hosts file) that an existing installation
+    # must still have under the shared path for its configuration to be reused.
+    required_paths=("$omnia_path/omnia" "$omnia_path/omnia/ssh_config/.ssh" "$omnia_path/omnia/log/core/container" "$omnia_path/omnia/hosts" "$omnia_path/omnia/pulp/pulp_ha")
+    missing_paths=()
+
+    # -e is true for files and directories alike.
+    for path in "${required_paths[@]}"; do
+        [ -e "$path" ] || missing_paths+=("$path")
+    done
+
+    # Nothing missing: the existing configuration can be retained.
+    if [ "${#missing_paths[@]}" -eq 0 ]; then
+        return 0
+    fi
+
+    # Report what is missing, explain how to recover, and abort.
+    echo -e "${RED}Error: The following required files or directories are missing:${NC}"
+    echo -e "${RED}${missing_paths[*]}${NC}"
+    echo -e "User can not Retain Existing configuration"
+    echo
+    echo -e "${YELLOW}Instructions:${NC}"
+    echo -e "${YELLOW}* Backup any existing files if required${NC}"
+    echo -e "${YELLOW}* Run ./omnia.sh --install and choose:${NC}"
+    echo -e "${YELLOW} Options:${NC}"
+    echo -e "${YELLOW} -> Reinstall the container${NC}"
+    echo -e "${YELLOW} -> Overwrite and create new configuration${NC}"
+    exit 1
+}
+
+# Sets up the Omnia core container.
+# This function pulls the Omnia core Podman image and runs the container.
+# Creates a Quadlet service for the container and also creates a metadata file.
+# It defines the container options and runs the container.
+setup_container() { + container_name="omnia_core" + echo "==> Setting up $container_name container" + + # SELinux option handling + selinux_option=":z" + if [ "$share_option" = "NFS" ] && [ "$nfs_type" = "external" ]; then + selinux_option="" + fi + + # Check if RHEL subscription is enabled + subscription_enabled=false + if [ -d "/etc/pki/entitlement" ] && [ "$(ls -A /etc/pki/entitlement/*.pem 2>/dev/null)" ]; then + subscription_enabled=true + fi + + # --- Generate Quadlet container file --- + cat > /etc/containers/systemd/${container_name}.container <> /etc/containers/systemd/${container_name}.container <> /etc/containers/systemd/${container_name}.container <> "$oim_metadata_file" + if [ "$share_option" = "NFS" ]; then + { + echo "nfs_server_ip: $nfs_server_ip" + echo "nfs_server_share_path: $nfs_server_share_path" + echo "nfs_type: $nfs_type" + } >> "$oim_metadata_file" + fi + fi + + # --- Remove old service if exists --- + if systemctl list-unit-files | grep -q "${container_name}.service"; then + systemctl stop ${container_name}.service + systemctl disable ${container_name}.service + rm -f /etc/systemd/system/${container_name}.service + fi + + # --- Reload systemd so Quadlet generates the service --- + systemctl daemon-reexec + systemctl daemon-reload + systemctl start ${container_name}.service + + # --- Start the container via Quadlet --- + echo "==> ${container_name} container deployed and starting via Quadlet" + + # --- Wait for container to be running --- + echo "Waiting for $container_name container to start..." + for i in {1..30}; do + if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then + echo "$container_name container is running." + break + else + sleep 1 + fi + done + + if ! 
podman ps --format '{{.Names}}' | grep -qw "$container_name"; then + echo -e "${RED}Error: $container_name container failed to start.${NC}" + rm -rf "$OMNIA_METADATA_FILE" + exit 1 + fi + + systemctl start firewalld + systemctl enable firewalld + firewall-cmd --permanent --zone=public --add-port=2222/tcp + firewall-cmd --reload +} + +# This function sets up the configuration for the Omnia core. +# post_setup_config is a function that sets up the configuration for the Omnia core. +# It creates the necessary directories and files, copies input files from the Omnia container, +# and creates the oim_metadata.yml file. +post_setup_config() { + + # Create the ansible tmp directory if it does not exist. + mkdir -p "$omnia_path/omnia/tmp/.ansible/tmp" + chmod 757 "$omnia_path/omnia/tmp/.ansible/tmp" + # Create the input directory if it does not exist. + echo -e "${GREEN} Creating the input directory if it does not exist.${NC}" + mkdir -p "$OMNIA_INPUT_DIR/" + + # Create the default.yml file if it does not exist. + # This file contains the name of the project. + if [ ! -f "$OMNIA_INPUT_DIR/default.yml" ]; then + echo -e "${BLUE} Creating default.yml file.${NC}" + { + echo "# This file defines the project name." + echo "# The name of the project should be set in a directory under input." 
+ echo "project_name: project_default" + } >> "$OMNIA_INPUT_DIR/default.yml" + fi + + # Copy input files from /omnia to /opt/omnia/project_default/ inside omnia_core container + podman exec -u root omnia_core bash -c "cd /omnia && git pull" + echo -e "${BLUE} Moving input files from /omnia dir to project_default folder.${NC}" + podman exec -u root omnia_core bash -c " + mkdir -p /opt/omnia/input/project_default + cp -r /omnia/input/* /opt/omnia/input/project_default + rm -rf /omnia/input + rm -rf /omnia/omnia.sh" + + init_ssh_config +} + +validate_nfs_server() { + + # Validate NFS server permission + if [ "$share_option" = "NFS" ]; then + # Create a temporary file inside $omnia_path + temp_file="$omnia_path/temp_file" + touch "$temp_file" + # Check if the file can be chown to root + if chown root:root "$temp_file"; then + rm "$temp_file" + else + echo "Error: Unable to chown file to root in $omnia_path. NFS server permission validation failed. Please ensure no_root_squash option is enabled in the NFS export configuration." + exit 1 + fi + if [ "`ls -ld $omnia_path/omnia/ssh_config/.ssh/id_rsa | awk '{print $3 ":" $4}'`" != "root:root" ]; then + echo "Error: The $omnia_path/omnia/ssh_config/.ssh/id_rsa file should be owned by root:root. NFS server permission validation failed. Please verify the NFS export configuration." + exit 1 + fi + fi + +} + +init_ssh_config() { + touch $HOME/.ssh/known_hosts + # Add entry to /root/.ssh/known_hosts file to prevent errors caused by Known host + ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1 # Remove existing entry if it exists + ssh-keyscan -p 2222 localhost 2>/dev/null | grep -v "^#" >> $HOME/.ssh/known_hosts # Scan and add the new key +} + +start_container_session() { + + echo -e "${GREEN} + ------------------------------------------------------------------------------------------------------------------------------------------ + Omnia Core container running successfully. 
+ + Entering the container from Omnia Infrastructure Manager(OIM): + Through podman: + # podman exec -it -u root omnia_core bash + + Direct SSH: + # ssh omnia_core + + You are now in the Omnia environment. + + The following are the main directories available in the Omnia core container: + + - The shared directory, which is mapped to $omnia_path in OIM: /opt/omnia + - The input directory: /opt/omnia/input + - The Omnia source code directory: /omnia + - The Omnia playbooks logs directory: /opt/omnia/log/core/playbooks + + It's important to note: + - Files placed in the shared directory should not be manually deleted. + - Use the playbook /omnia/utils/oim_cleanup.yml to safely remove the shared directory and Omnia containers (except the core container). + - If you need to delete the core container, please run the omnia.sh script with --uninstall option. + - If you need to redeploy the core container with new input configs, please rerun the omnia.sh script with --install option. + - Provide any file paths (ISO, mapping files, etc.) that are mentioned in input files in the /opt/omnia directory. + - The domain name that will be used for Omnia is $domain_name, if you wish to change the domain name please cleanup Omnia, + change the Omnia Infrastructure Manager's domain name and rerun omnia.sh script with --install option. 
+ + -------------------------------------------------------------------------------------------------------------------------------------------------- + ${NC}" + + # Entering Omnia-core container + ssh omnia_core +} + +show_help() { + echo "Usage: $0 [--install | --uninstall | --upgrade | --version | --help]" + echo " -i, --install Install and start the Omnia core container" + echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" + echo " --upgrade Upgrade the Omnia core container from image tag 1.0 to 1.1" + echo " -v, --version Display Omnia version information" + echo " -h, --help More information about usage" +} + +install_omnia_core() { + local omnia_core_tag="1.1" + local omnia_core_registry="" + + # Check if local omnia_core:1.1 exists + if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then + echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" + # Check if latest exists for backward compatibility + elif podman inspect omnia_core:latest >/dev/null 2>&1; then + echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}" + # Tag it as 1.1 for consistency + podman tag omnia_core:latest omnia_core:${omnia_core_tag} + else + echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}" + echo -e "${YELLOW}Omnia no longer pulls images from Docker Hub. Build/load the image locally and retry.${NC}" + echo "" + echo -e "${YELLOW}One way to build the image locally:${NC}" + echo -e "1. Clone the Omnia Artifactory repository:" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container" + echo -e "2. Navigate to the repository directory:" + echo -e " cd omnia-artifactory" + echo -e "3. 
Build the core image locally (loads into local Podman by default):" + echo -e " ./build_images.sh core omnia_branch=" + echo "" + echo -e "${YELLOW}Then re-run:${NC}" + echo -e " ./omnia.sh --install" + exit 1 + fi + + # Check if any other containers with 'omnia' in their name are running + other_containers=$(podman ps -a --format '{{.Names}}' | grep -E 'omnia' | grep -v 'omnia_core') + + # If there are any, exit + if [ -n "$other_containers" ]; then + echo -e "${RED} Failed to intiatiate omnia_core container cleanup. There are other omnia container running.${NC}" + echo -e "${GREEN} Execute oim_cleanup.yml first to cleanup all containers.${NC}" + ssh omnia_core + exit 1 + fi + + # Check if the omnia_core container is already running + running_containers=$(podman ps -a --format '{{.Names}} {{.State}}' | grep -E 'omnia_core') + + # If yes, set the variable to true + if [ -n "$running_containers" ]; then + core_container_status=true + fi + + # If core container is running + if [ "$core_container_status" = true ]; then + if [ -n "$(echo "$running_containers" | grep -E 'running')" ]; then + echo -e "${GREEN} Omnia core container is already running.${NC}" + echo -e "${GREEN} Do you want to:${NC}" + PS3="Select the option number: " + + select opt in "Enter omnia_core container" "Reinstall the container" "Exit"; do + case $opt in + "Enter omnia_core container") + choice=1 + break + ;; + "Reinstall the container") + choice=2 + break + ;; + "Exit") + echo "Exiting the script." + exit 0 + ;; + *) + echo "Invalid choice. Please try again." 
+ continue + ;; + esac + done + + # If the user wants to enter omnia_core container + if [ "$choice" = "1" ]; then + start_container_session + fi + # If the user wants to reinstall, call the remove_container function, and then call the setup_omnia_core function + if [ "$choice" = "2" ]; then + # Block if critical service containers exist + critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + if [ -n "$critical_running" ]; then + echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}" + echo "$critical_running" + echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}" + exit 1 + fi + echo -e "${GREEN} What configuration do you want to use for reinstallation:${NC}" + + PS3="Select the option number: " + + select opt in "Retain Existing configuration" "Overwrite and create new configuration" "Exit"; do + case $opt in + "Retain Existing configuration") + choice=1 + break + ;; + "Overwrite and create new configuration") + choice=2 + break + ;; + "Exit") + echo "Exiting the script." + exit 0 + ;; + *) + echo "Invalid choice. Please try again." 
+ continue + ;; + esac + done + + # If the user wants to retain existing configuration, call the remove_container function + if [ "$choice" = "1" ]; then + fetch_config + check_required_directories + remove_container + setup_container + init_ssh_config + start_container_session + # If the user wants to overwrite and create new configuration, call the cleanup_omnia_core function + elif [ "$choice" = "2" ]; then + cleanup_omnia_core + setup_omnia_core + fi + fi + else + # If omnia_core container exists and is not running call the remove_container function + + echo -e "${RED} The Omnia Core container is present but not in running state.${NC}" + echo -e "${GREEN} Only the core container can be cleanup can be performed.${NC}" + echo -e "${GREEN} Container Configurations in the shared directory will not be cleaned up.${NC}" + echo -e "${GREEN} Do you want to perform cleanup:${NC}" + echo -e "${GREEN} 1. Yes.${NC}" + echo -e "${GREEN} 2. No. ${NC}" + read -p " Enter your choice (1 or 2): " choice + if [ "$choice" = "1" ]; then + remove_container + elif [ "$choice" = "2" ]; then + exit + fi + fi + + # If core container is not present + else + + # Start the container setup + echo -e "${GREEN}Starting Omnia core container setup.${NC}" + setup_omnia_core + fi +} + +# Check if Omnia core container is running +check_container_status() { + # Check if the Omnia core container is running + if ! 
podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo -e "${RED}ERROR: Omnia core container is not running.${NC}" + exit 1 + fi +} + +# Function to display version information +display_version() { + # Check if metadata file exists and Omnia core container is running + check_container_status + + # Fetch the metadata from the oim_metadata.yml file in the container + echo -e "${GREEN} Fetching metadata from omnia_core container...${NC}" + core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml') + + # Extract Omnia version from metadata file + omnia_version=$(echo "$core_config" | grep "omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r') + + # Display version information + echo "Omnia version: $omnia_version" + + # Return exit code 0 on success + exit 0 +} + +phase1_validate() { + local current_image + local core_config + local previous_omnia_version + local shared_path + + echo "[INFO] [ORCHESTRATOR] Phase 1: Pre-Upgrade Validation" + + if [ "$(id -u)" -ne 0 ]; then + if ! sudo -n true >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: run as root or configure passwordless sudo" + return 1 + fi + fi + + if ! 
podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running" + return 1 + fi + + core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml' 2>/dev/null) + if [ -z "$core_config" ]; then + echo "[ERROR] [ORCHESTRATOR] Unable to read oim_metadata.yml from omnia_core container" + return 1 + fi + + previous_omnia_version=$(echo "$core_config" | grep "^omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r') + if [ -z "$previous_omnia_version" ]; then + echo "[ERROR] [ORCHESTRATOR] omnia_version not found in oim_metadata.yml" + return 1 + fi + + if [ "$previous_omnia_version" != "2.0.0.0" ]; then + echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version" + return 1 + fi + + shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -z "$shared_path" ]; then + echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml" + return 1 + fi + + omnia_path="$shared_path" + + if [ ! -d "$omnia_path" ]; then + echo "[ERROR] [ORCHESTRATOR] Shared path from metadata does not exist on host: $omnia_path" + return 1 + fi + + if [ ! -w "$omnia_path" ]; then + echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on shared path: $omnia_path" + return 1 + fi + + current_image=$(podman inspect omnia_core --format '{{.ImageName}}' 2>/dev/null) + if [ -z "$current_image" ]; then + echo "[ERROR] [ORCHESTRATOR] Unable to inspect omnia_core container image" + return 1 + fi + + if ! echo "$current_image" | grep -qE '(:|@)1\.0(\b|$)'; then + echo "[ERROR] [ORCHESTRATOR] Container version mismatch: expected 1.0, got: $current_image" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0)" + + if [ ! 
-d "$OMNIA_BASE_DIR" ]; then + echo "[ERROR] [ORCHESTRATOR] Mount/path invalid: expected directory not found: $OMNIA_BASE_DIR" + echo "[ERROR] [ORCHESTRATOR] Fix: ensure /opt/omnia exists and is mounted (if using external mount)" + return 1 + fi + + if [ ! -w "$OMNIA_BASE_DIR" ]; then + echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on $OMNIA_BASE_DIR" + echo "[ERROR] [ORCHESTRATOR] Fix: run as root or fix permissions on /opt/omnia" + return 1 + fi + + if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" + echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed" + return 0 +} + +phase2_approval() { + local backup_base default_backup_dir + + echo "[INFO] [ORCHESTRATOR] Phase 2: Approval Gate" + echo "============================================" + echo "OMNIA UPGRADE SUMMARY" + echo "============================================" + echo "Current Container Tag: 1.0" + echo "Target Container Tag: 1.1" + echo "Current Omnia Release: 2.0.0.0" + echo "Target Omnia Release: 2.1.0.0" + echo "New Features:" + echo " - Add and remove node for slurm cluster" + echo " - Additional Package Installation" + echo "============================================" + + default_backup_dir="$OMNIA_BACKUPS_DIR/upgrade" + backup_base="$default_backup_dir" + + echo "[INFO] [ORCHESTRATOR] Backup destination: $backup_base" + + if ! update_metadata_upgrade_backup_dir "$backup_base"; then + echo "[ERROR] [ORCHESTRATOR] Failed to update upgrade backup directory in metadata" + return 1 + fi + + read -p "Proceed with upgrade? 
(y/N): " confirm + if [ "$confirm" != "y" ] && [ "$confirm" != "Y" ]; then + echo "[INFO] [ORCHESTRATOR] Upgrade cancelled by user" + return 1 + fi + + OMNIA_UPGRADE_BACKUP_PATH="$backup_base" + export OMNIA_UPGRADE_BACKUP_PATH + + echo "[INFO] [ORCHESTRATOR] Phase 2: Approval granted" + return 0 +} + +generate_backup_manifest() { + local backup_path="$1" + local manifest_file="$backup_path/manifest.txt" + + { + echo "backup_version: 1.0" + echo "timestamp: $(date -Iseconds)" + echo "source_container_tag: 1.0" + echo "target_container_tag: 1.1" + echo "source_omnia_release: 2.0.x" + echo "target_omnia_release: 2.1.0.0" + echo "hostname: $(hostname)" + echo "" + echo "files:" + find "$backup_path" -type f ! -name "manifest.txt" -exec echo " - {}" \; + } > "$manifest_file" +} + +verify_backup_integrity() { + local backup_path="$1" + + [ -d "$backup_path" ] || return 1 + [ -d "$backup_path/input" ] || return 1 + [ -d "$backup_path/metadata" ] || return 1 + [ -d "$backup_path/configs" ] || return 1 + [ -f "$backup_path/metadata/oim_metadata.yml" ] || return 1 + [ -f "$backup_path/manifest.txt" ] || return 1 + + return 0 +} + +create_backup() { + local backup_path="$1" + + echo "[INFO] [ORCHESTRATOR] Phase 3: Backup Creation" + + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Cannot create backup because omnia_core is not running" + return 1 + fi + + if ! podman exec -u root omnia_core bash -c " + set -e + mkdir -p '$backup_path/input' '$backup_path/metadata' '$backup_path/configs' + + if [ -d '$OMNIA_INPUT_DIR' ]; then + cp -a '$OMNIA_INPUT_DIR' '$backup_path/' + fi + + if [ ! 
-f '$OMNIA_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2 + exit 1 + fi + cp -a '$OMNIA_METADATA_FILE' '$backup_path/metadata/oim_metadata.yml' + + ts=\"\$(date -Iseconds)\" + hn=\"\$(hostname)\" + { + echo 'backup_version: 1.0' + echo \"timestamp: \$ts\" + echo 'source_container_tag: 1.0' + echo 'target_container_tag: 1.1' + echo 'source_omnia_release: 2.0.x' + echo 'target_omnia_release: 2.1.0.0' + echo \"hostname: \$hn\" + } > '$backup_path/manifest.txt' + "; then + echo "[ERROR] [ORCHESTRATOR] Failed to create backup inside omnia_core container" + return 1 + fi + + if [ -f "/etc/containers/systemd/omnia_core.container" ]; then + if ! podman cp "/etc/containers/systemd/omnia_core.container" "omnia_core:$backup_path/configs/omnia_core.container" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Failed to backup quadlet container file into container backup path" + return 1 + fi + fi + + echo "[INFO] [ORCHESTRATOR] Backup created at: $backup_path" + echo "[INFO] [ORCHESTRATOR] Phase 3: Backup completed" + return 0 +} + +wait_for_container_health() { + local timeout="${1:-60}" + local i + + for i in $(seq 1 "$timeout"); do + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + return 0 + fi + sleep 1 + done + return 1 +} + +update_metadata_version() { + local metadata_file="$OMNIA_METADATA_FILE" + + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running" + return 1 + fi + + podman exec -u root omnia_core bash -c " + set -e + if [ ! -f '$OMNIA_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$OMNIA_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: 2.1.0.0/' '$OMNIA_METADATA_FILE' + else + echo 'omnia_version: 2.1.0.0' >> '$OMNIA_METADATA_FILE' + fi + " +} + +sync_input_to_shared_path() { + if ! 
podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Cannot sync input because omnia_core is not running" + return 1 + fi + + if ! podman exec -u root omnia_core bash -c " + set -e + if [ -d /omnia/input ]; then + mkdir -p /opt/omnia/input/project_default + cp -r /omnia/input/* /opt/omnia/input/project_default + rm -rf /omnia/input + fi + "; then + echo "[ERROR] [ORCHESTRATOR] Failed to copy /omnia/input to /opt/omnia/input/project_default" + return 1 + fi + return 0 +} + +phase4_container_swap() { + echo "[INFO] [ORCHESTRATOR] Phase 4: Container Swap" + + if systemctl list-unit-files | grep -q "omnia_core.service"; then + systemctl stop omnia_core.service >/dev/null 2>&1 || true + fi + + if [ -z "${omnia_path}" ]; then + echo "[ERROR] [ORCHESTRATOR] Shared path (omnia_path) is empty. Phase 1 validation may not have run." + return 1 + fi + + if [ ! -f "/etc/containers/systemd/omnia_core.container" ]; then + echo "[ERROR] [ORCHESTRATOR] Quadlet file not found: /etc/containers/systemd/omnia_core.container" + echo "[ERROR] [ORCHESTRATOR] Cannot proceed with upgrade container swap" + return 1 + fi + + if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" + echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." + return 1 + fi + + if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' /etc/containers/systemd/omnia_core.container; then + echo "[ERROR] [ORCHESTRATOR] Failed to update Image in quadlet file" + return 1 + fi + + escaped_omnia_path=$(printf '%s\n' "$omnia_path" | sed 's/[\/&]/\\\\&/g') + if grep -q '^Volume=/omnia\(/\|:\)' /etc/containers/systemd/omnia_core.container; then + if ! 
sed -i "s|^Volume=/omnia\(/\|:\)|Volume=${escaped_omnia_path}\\1|g" /etc/containers/systemd/omnia_core.container; then + echo "[ERROR] [ORCHESTRATOR] Failed to update Volume paths in quadlet file" + return 1 + fi + fi + + systemctl daemon-reload || return 1 + if ! systemctl restart omnia_core.service; then + echo "[ERROR] [ORCHESTRATOR] Failed to restart omnia_core.service" + systemctl status omnia_core.service --no-pager -l || true + journalctl -xeu omnia_core.service --no-pager | tail -n 120 || true + return 1 + fi + + if ! wait_for_container_health 60; then + echo "[ERROR] [ORCHESTRATOR] Container failed health check after swap" + return 1 + fi + + if ! update_metadata_version; then + return 1 + fi + + if ! sync_input_to_shared_path; then + return 1 + fi + + init_ssh_config + + echo "[INFO] [ORCHESTRATOR] Phase 4: Container swap completed" + return 0 +} + +upgrade_omnia_core() { + local lock_file="/var/lock/omnia_core_upgrade.lock" + local backup_path + + if [ -e "$lock_file" ]; then + echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. Another upgrade may be running.${NC}" + exit 1 + fi + + mkdir -p "$(dirname "$lock_file")" 2>/dev/null || true + echo "$$" > "$lock_file" || { + echo -e "${RED}ERROR: Failed to create lock file: $lock_file${NC}" + exit 1 + } + trap 'rm -f "$lock_file"' EXIT + + if ! phase1_validate; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" + exit 1 + fi + + if ! phase2_approval; then + exit 0 + fi + + backup_path="$OMNIA_UPGRADE_BACKUP_PATH" + if [ -z "$backup_path" ]; then + echo "[ERROR] [ORCHESTRATOR] Backup path is empty" + exit 1 + fi + + if ! create_backup "$backup_path"; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 3" + exit 1 + fi + + if ! 
phase4_container_swap; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4" + exit 1 + fi + + echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" + echo "[INFO] [ORCHESTRATOR] Backup location: $backup_path" + exit 0 +} + +# Main function to check if omnia_core container is already running. +# If yes, ask the user if they want to enter the container or reinstall. +# If no, set it up. +main() { + case "$1" in + --install|-i) + install_omnia_core + ;; + --uninstall|-u) + cleanup_omnia_core + ;; + --upgrade) + upgrade_omnia_core + ;; + --version|-v) + display_version + ;; + --help|-h|"") + show_help + ;; + *) + echo "Unknown option: $1" + show_help + exit 1 + ;; + esac +} + +# Call the main function +main "$1" From 2b69697dfedad273f93fafc887de2a4e0b80824c Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Mon, 9 Feb 2026 06:55:43 +0000 Subject: [PATCH 077/172] reduced inputs in each role --- utils/roles/slurm_cleanup/tasks/main.yml | 18 --- .../roles/slurm_config_backup/tasks/main.yml | 51 --------- .../slurm_config_rollback/tasks/main.yml | 84 -------------- utils/slurm_config_util.yml | 106 +++++++++++++++++- 4 files changed, 102 insertions(+), 157 deletions(-) diff --git a/utils/roles/slurm_cleanup/tasks/main.yml b/utils/roles/slurm_cleanup/tasks/main.yml index 5c59cae2d0..7acd38e571 100644 --- a/utils/roles/slurm_cleanup/tasks/main.yml +++ b/utils/roles/slurm_cleanup/tasks/main.yml @@ -1,23 +1,5 @@ --- -- name: Include variable file omnia_config.yml - ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" - tags: slurm_cleanup - -- name: Include storage vars - ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml" - tags: slurm_cleanup - -- name: Set facts for slurm - ansible.builtin.set_fact: - nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" - tags: slurm_cleanup - -- name: Read the slurm mount point - ansible.builtin.set_fact: - share_path: "{{ 
(nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}" - tags: slurm_cleanup - - name: Set slurm_config_path ansible.builtin.set_fact: slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}" diff --git a/utils/roles/slurm_config_backup/tasks/main.yml b/utils/roles/slurm_config_backup/tasks/main.yml index 4d01014180..401a086493 100644 --- a/utils/roles/slurm_config_backup/tasks/main.yml +++ b/utils/roles/slurm_config_backup/tasks/main.yml @@ -1,56 +1,5 @@ --- -- name: Include variable file omnia_config.yml - ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" - -- name: Include storage vars - ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml" - -- name: Set facts for slurm - ansible.builtin.set_fact: - nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" - -- name: Read the slurm mount point - ansible.builtin.set_fact: - share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}" - -- name: Display resolved slurm share path - ansible.builtin.debug: - msg: "Resolved share_path={{ share_path }} (nfs_storage_name={{ nfs_storage_name }})" - -- name: Slurp remote YAML file - ansible.builtin.slurp: - src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" - register: slurped_yaml - -- name: Parse YAML into vars - ansible.builtin.set_fact: - node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}" - -- name: Read the node name group - ansible.builtin.set_fact: - name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}" - -- name: Group the functional_groups - ansible.builtin.set_fact: - tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}" - -- name: Re-organize the groups - ansible.builtin.set_fact: - grouped_nodes: "{{ grouped_nodes | default({}) | 
combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}" - loop: "{{ tmp_grouped_nodes }}" - -- name: Assign slurm lists - ansible.builtin.set_fact: - ctld_list: "{{ grouped_nodes | dict2items - | selectattr('key', 'match', '^' ~ 'slurm_control_node_') - | map(attribute='value') | list | flatten }}" - -- name: Fail if Slurm controller list is empty - ansible.builtin.fail: - msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun." - when: ctld_list | length == 0 - - name: Set slurm_config_path ansible.builtin.set_fact: slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}" diff --git a/utils/roles/slurm_config_rollback/tasks/main.yml b/utils/roles/slurm_config_rollback/tasks/main.yml index 0a66c096b0..6e185f2028 100644 --- a/utils/roles/slurm_config_rollback/tasks/main.yml +++ b/utils/roles/slurm_config_rollback/tasks/main.yml @@ -1,89 +1,5 @@ --- -- name: Include variable file omnia_config.yml - ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" - tags: config_rollback - -- name: Include storage vars - ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml" - tags: config_rollback - -- name: Set facts for slurm - ansible.builtin.set_fact: - nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" - tags: config_rollback - -- name: Read the slurm mount point - ansible.builtin.set_fact: - share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}" - tags: config_rollback - -- name: Slurp remote YAML file - ansible.builtin.slurp: - src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" - register: slurped_yaml - tags: config_rollback - -- name: Parse YAML into vars - ansible.builtin.set_fact: - node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}" - tags: config_rollback - -- name: Get name and IP 
mapping 1 - ansible.builtin.set_fact: - tmp_ip_name_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='interfaces') }}" - tags: config_rollback - -- name: Get name and IP mapping 2 - ansible.builtin.set_fact: - ip_name_map: "{{ ip_name_map | default({}) | combine({item.key: item.value[0]['ip_addrs'][0]['ip_addr']}) }}" - loop: "{{ tmp_ip_name_map | dict2items }}" - tags: config_rollback - -- name: Read the node name group - ansible.builtin.set_fact: - name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}" - tags: config_rollback - -- name: Group the functional_groups - ansible.builtin.set_fact: - tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}" - tags: config_rollback - -- name: Re-organize the groups - ansible.builtin.set_fact: - grouped_nodes: "{{ grouped_nodes | default({}) | combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}" - loop: "{{ tmp_grouped_nodes }}" - tags: config_rollback - -- name: Assign slurm lists - ansible.builtin.set_fact: - ctld_list: "{{ grouped_nodes | dict2items - | selectattr('key', 'match', '^' ~ 'slurm_control_node_') - | map(attribute='value') | list | flatten }}" - tags: config_rollback - -- name: Fail if Slurm controller list is empty - ansible.builtin.fail: - msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun." 
- when: ctld_list | length == 0 - tags: config_rollback - -- name: Set slurm controller IP - ansible.builtin.set_fact: - controller_ip: "{{ ip_name_map[ctld_list | first] }}" - when: ctld_list | length > 0 - tags: config_rollback - -- name: Add slurm controller as dynamic host - ansible.builtin.add_host: - name: slurm_controller - ansible_host: "{{ controller_ip }}" - ansible_user: root - ansible_port: 22 - when: controller_ip is defined - tags: config_rollback - - name: Set slurm paths ansible.builtin.set_fact: slurm_config_path: "{{ share_path }}/{{ slurm_share_dir_name }}" diff --git a/utils/slurm_config_util.yml b/utils/slurm_config_util.yml index 7cb5249ccd..fd42e4c202 100644 --- a/utils/slurm_config_util.yml +++ b/utils/slurm_config_util.yml @@ -17,10 +17,108 @@ hosts: oim connection: ssh gather_facts: true - roles: - - role: slurm_config_backup + tasks: + - name: Include variable file omnia_config.yml + ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" + tags: always + + - name: Include storage vars + ansible.builtin.include_vars: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml" + tags: always + + - name: Set facts for slurm + ansible.builtin.set_fact: + nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" + tags: always + + - name: Read the slurm mount point + ansible.builtin.set_fact: + share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).client_share_path }}" + tags: always + + - name: Slurp remote YAML file + ansible.builtin.slurp: + src: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" + register: slurped_yaml + tags: always + + - name: Parse YAML into vars + ansible.builtin.set_fact: + node_yaml: "{{ slurped_yaml.content | b64decode | from_yaml }}" + tags: always + + - name: Get name and IP mapping 1 + ansible.builtin.set_fact: + tmp_ip_name_map: "{{ node_yaml.nodes | 
items2dict(key_name='name', value_name='interfaces') }}" + tags: always + + - name: Get name and IP mapping 2 + ansible.builtin.set_fact: + ip_name_map: "{{ ip_name_map | default({}) | combine({item.key: item.value[0]['ip_addrs'][0]['ip_addr']}) }}" + loop: "{{ tmp_ip_name_map | dict2items }}" + tags: always + + - name: Read the node name group + ansible.builtin.set_fact: + name_group_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='group') }}" + tags: always + + - name: Group the functional_groups + ansible.builtin.set_fact: + tmp_grouped_nodes: "{{ name_group_map | dict2items | groupby('value') }}" + tags: always + + - name: Re-organize the groups + ansible.builtin.set_fact: + grouped_nodes: "{{ grouped_nodes | default({}) | combine({item[0]: ((item[1] | items2dict).keys() | list)}) }}" + loop: "{{ tmp_grouped_nodes }}" + tags: always + + - name: Assign slurm lists + ansible.builtin.set_fact: + ctld_list: "{{ grouped_nodes | dict2items + | selectattr('key', 'match', '^' ~ 'slurm_control_node_') + | map(attribute='value') | list | flatten }}" + tags: always + + - name: Fail if Slurm controller list is empty + ansible.builtin.fail: + msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun." 
+ when: ctld_list | length == 0 + tags: always + + - name: Set slurm controller IP + ansible.builtin.set_fact: + controller_ip: "{{ ip_name_map[ctld_list | first] }}" + when: ctld_list | length > 0 + tags: always + + - name: Add slurm controller as dynamic host + ansible.builtin.add_host: + name: slurm_controller + ansible_host: "{{ controller_ip }}" + ansible_user: root + ansible_port: 22 + when: controller_ip is defined + tags: always + + - name: Run slurm config backup + ansible.builtin.include_role: + name: slurm_config_backup + apply: + tags: config_backup tags: config_backup - - role: slurm_cleanup + + - name: Run slurm cleanup + ansible.builtin.include_role: + name: slurm_cleanup + apply: + tags: slurm_cleanup tags: slurm_cleanup - - role: slurm_config_rollback + + - name: Run slurm config rollback + ansible.builtin.include_role: + name: slurm_config_rollback + apply: + tags: config_rollback tags: config_rollback From b31391ddcca48280d11d96954d724a0bc7ba2ed8 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Mon, 9 Feb 2026 12:28:16 +0530 Subject: [PATCH 078/172] slurm parameters json fo reference in schema, not yet actively used in code --- .../schema/slurm_config_parameters.json | 501 ++++++++++++++++++ 1 file changed, 501 insertions(+) create mode 100644 common/library/module_utils/input_validation/schema/slurm_config_parameters.json diff --git a/common/library/module_utils/input_validation/schema/slurm_config_parameters.json b/common/library/module_utils/input_validation/schema/slurm_config_parameters.json new file mode 100644 index 0000000000..19480de228 --- /dev/null +++ b/common/library/module_utils/input_validation/schema/slurm_config_parameters.json @@ -0,0 +1,501 @@ +{ + "slurm.conf": { + "AccountingStorageBackupHost": "S_P_STRING", + "AccountingStorageEnforce": "S_P_STRING", + "AccountingStorageExternalHost": "S_P_STRING", + "AccountingStorageHost": "S_P_STRING", + "AccountingStorageParameters": "S_P_STRING", + "AccountingStoragePass": 
"S_P_STRING", + "AccountingStoragePort": "S_P_UINT16", + "AccountingStorageTRES": "S_P_STRING", + "AccountingStorageType": "S_P_STRING", + "AccountingStorageUser": "S_P_STRING", + "AccountingStoreFlags": "S_P_STRING", + "AccountingStoreJobComment": "S_P_BOOLEAN", + "AcctGatherEnergyType": "S_P_STRING", + "AcctGatherFilesystemType": "S_P_STRING", + "AcctGatherInfinibandType": "S_P_STRING", + "AcctGatherInterconnectType": "S_P_STRING", + "AcctGatherNodeFreq": "S_P_UINT16", + "AcctGatherProfileType": "S_P_STRING", + "AllowSpecResourcesUsage": "S_P_BOOLEAN", + "AuthAltParameters": "S_P_STRING", + "AuthAltTypes": "S_P_STRING", + "AuthInfo": "S_P_STRING", + "AuthType": "S_P_STRING", + "BackupAddr": "S_P_STRING", + "BackupController": "S_P_STRING", + "BatchStartTimeout": "S_P_UINT16", + "BcastExclude": "S_P_STRING", + "BcastParameters": "S_P_STRING", + "BurstBufferParameters": "S_P_STRING", + "BurstBufferType": "S_P_STRING", + "CertgenType": "S_P_STRING", + "CertgenParameters": "S_P_STRING", + "CertmgrType": "S_P_STRING", + "CertmgrParameters": "S_P_STRING", + "CliFilterParameters": "S_P_STRING", + "CliFilterPlugins": "S_P_STRING", + "ClusterName": "S_P_STRING", + "CommunicationParameters": "S_P_STRING", + "CompleteWait": "S_P_UINT16", + "ControlAddr": "S_P_STRING", + "ControlMachine": "S_P_STRING", + "CoreSpecPlugin": "S_P_STRING", + "CpuFreqDef": "S_P_STRING", + "CpuFreqGovernors": "S_P_STRING", + "CredType": "S_P_STRING", + "CryptoType": "S_P_STRING", + "DataParserParameters": "S_P_STRING", + "DebugFlags": "S_P_STRING", + "DefCPUPerGPU": "S_P_UINT64", + "DefMemPerCPU": "S_P_UINT64", + "DefMemPerGPU": "S_P_UINT64", + "DefMemPerNode": "S_P_UINT64", + "DependencyParameters": "S_P_STRING", + "DisableRootJobs": "S_P_BOOLEAN", + "EioTimeout": "S_P_UINT16", + "EnforcePartLimits": "S_P_STRING", + "Epilog": "S_P_ARRAY", + "EpilogMsgTime": "S_P_UINT32", + "EpilogSlurmctld": "S_P_ARRAY", + "EpilogTimeout": "S_P_UINT16", + "ExtSensorsFreq": "S_P_UINT16", + "ExtSensorsType": 
"S_P_STRING", + "FairShareDampeningFactor": "S_P_UINT16", + "FastSchedule": "S_P_UINT16", + "FederationParameters": "S_P_STRING", + "FirstJobId": "S_P_UINT32", + "GetEnvTimeout": "S_P_UINT16", + "GpuFreqDef": "S_P_STRING", + "GresTypes": "S_P_STRING", + "GroupUpdateForce": "S_P_UINT16", + "GroupUpdateTime": "S_P_UINT16", + "HashPlugin": "S_P_STRING", + "HealthCheckInterval": "S_P_UINT16", + "HealthCheckNodeState": "S_P_STRING", + "HealthCheckProgram": "S_P_STRING", + "HttpParserType": "S_P_STRING", + "InactiveLimit": "S_P_UINT16", + "InteractiveStepOptions": "S_P_STRING", + "JobAcctGatherFrequency": "S_P_STRING", + "JobAcctGatherParams": "S_P_STRING", + "JobAcctGatherType": "S_P_STRING", + "JobCompHost": "S_P_STRING", + "JobCompLoc": "S_P_STRING", + "JobCompParams": "S_P_STRING", + "JobCompPass": "S_P_STRING", + "JobCompPassScript": "S_P_STRING", + "JobCompPort": "S_P_UINT32", + "JobCompType": "S_P_STRING", + "JobCompUser": "S_P_STRING", + "JobContainerType": "S_P_STRING", + "JobCredentialPrivateKey": "S_P_STRING", + "JobCredentialPublicCertificate": "S_P_STRING", + "JobFileAppend": "S_P_UINT16", + "JobRequeue": "S_P_UINT16", + "JobSubmitPlugins": "S_P_STRING", + "KeepAliveTime": "S_P_UINT32", + "KillOnBadExit": "S_P_UINT16", + "KillWait": "S_P_UINT16", + "LaunchParameters": "S_P_STRING", + "LaunchType": "S_P_STRING", + "Licenses": "S_P_STRING", + "LogTimeFormat": "S_P_STRING", + "MailDomain": "S_P_STRING", + "MailProg": "S_P_STRING", + "MaxArraySize": "S_P_UINT32", + "MaxBatchRequeue": "S_P_UINT32", + "MaxDBDMsgs": "S_P_UINT32", + "MaxJobCount": "S_P_UINT32", + "MaxJobId": "S_P_UINT32", + "MaxMemPerCPU": "S_P_UINT64", + "MaxMemPerNode": "S_P_UINT64", + "MaxNodeCount": "S_P_UINT32", + "MaxStepCount": "S_P_UINT32", + "MaxTasksPerNode": "S_P_UINT16", + "MCSParameters": "S_P_STRING", + "MCSPlugin": "S_P_STRING", + "MessageTimeout": "S_P_UINT16", + "MetricsType": "S_P_STRING", + "MinJobAge": "S_P_UINT32", + "MpiDefault": "S_P_STRING", + "MpiParams": "S_P_STRING", + 
"NamespaceType": "S_P_STRING", + "NodeFeaturesPlugins": "S_P_STRING", + "OverTimeLimit": "S_P_UINT16", + "PluginDir": "S_P_STRING", + "PlugStackConfig": "S_P_STRING", + "PowerParameters": "S_P_STRING", + "PowerPlugin": "S_P_STRING", + "PreemptExemptTime": "S_P_STRING", + "PreemptMode": "S_P_STRING", + "PreemptParameters": "S_P_STRING", + "PreemptType": "S_P_STRING", + "PrEpParameters": "S_P_STRING", + "PrEpPlugins": "S_P_STRING", + "PriorityCalcPeriod": "S_P_STRING", + "PriorityDecayHalfLife": "S_P_STRING", + "PriorityFavorSmall": "S_P_BOOLEAN", + "PriorityFlags": "S_P_STRING", + "PriorityMaxAge": "S_P_STRING", + "PriorityParameters": "S_P_STRING", + "PrioritySiteFactorParameters": "S_P_STRING", + "PrioritySiteFactorPlugin": "S_P_STRING", + "PriorityType": "S_P_STRING", + "PriorityUsageResetPeriod": "S_P_STRING", + "PriorityWeightAge": "S_P_UINT32", + "PriorityWeightAssoc": "S_P_UINT32", + "PriorityWeightFairshare": "S_P_UINT32", + "PriorityWeightJobSize": "S_P_UINT32", + "PriorityWeightPartition": "S_P_UINT32", + "PriorityWeightQOS": "S_P_UINT32", + "PriorityWeightTRES": "S_P_STRING", + "PrivateData": "S_P_STRING", + "ProctrackType": "S_P_STRING", + "Prolog": "S_P_ARRAY", + "PrologEpilogTimeout": "S_P_UINT16", + "PrologFlags": "S_P_STRING", + "PrologSlurmctld": "S_P_ARRAY", + "PrologTimeout": "S_P_UINT16", + "PropagatePrioProcess": "S_P_UINT16", + "PropagateResourceLimits": "S_P_STRING", + "PropagateResourceLimitsExcept": "S_P_STRING", + "RebootProgram": "S_P_STRING", + "ReconfigFlags": "S_P_STRING", + "RequeueExit": "S_P_STRING", + "RequeueExitHold": "S_P_STRING", + "ResumeFailProgram": "S_P_STRING", + "ResumeProgram": "S_P_STRING", + "ResumeRate": "S_P_UINT16", + "ResumeTimeout": "S_P_UINT16", + "ResvEpilog": "S_P_STRING", + "ResvOverRun": "S_P_UINT16", + "ResvProlog": "S_P_STRING", + "ReturnToService": "S_P_UINT16", + "RoutePlugin": "S_P_STRING", + "SallocDefaultCommand": "S_P_STRING", + "SbcastParameters": "S_P_STRING", + "SchedulerParameters": "S_P_STRING", + 
"SchedulerTimeSlice": "S_P_UINT16", + "SchedulerType": "S_P_STRING", + "ScronParameters": "S_P_STRING", + "SelectType": "S_P_STRING", + "SelectTypeParameters": "S_P_STRING", + "SlurmctldAddr": "S_P_STRING", + "SlurmctldDebug": "S_P_STRING", + "SlurmctldLogFile": "S_P_STRING", + "SlurmctldParameters": "S_P_STRING", + "SlurmctldPidFile": "S_P_STRING", + "SlurmctldPort": "S_P_STRING", + "SlurmctldPrimaryOffProg": "S_P_STRING", + "SlurmctldPrimaryOnProg": "S_P_STRING", + "SlurmctldSyslogDebug": "S_P_STRING", + "SlurmctldTimeout": "S_P_UINT16", + "SlurmdDebug": "S_P_STRING", + "SlurmdLogFile": "S_P_STRING", + "SlurmdParameters": "S_P_STRING", + "SlurmdPidFile": "S_P_STRING", + "SlurmdPort": "S_P_UINT32", + "SlurmdSpoolDir": "S_P_STRING", + "SlurmdSyslogDebug": "S_P_STRING", + "SlurmdTimeout": "S_P_UINT16", + "SlurmdUser": "S_P_STRING", + "SlurmSchedLogFile": "S_P_STRING", + "SlurmSchedLogLevel": "S_P_UINT16", + "SlurmUser": "S_P_STRING", + "SrunEpilog": "S_P_STRING", + "SrunPortRange": "S_P_STRING", + "SrunProlog": "S_P_STRING", + "StateSaveLocation": "S_P_STRING", + "SuspendExcNodes": "S_P_STRING", + "SuspendExcParts": "S_P_STRING", + "SuspendExcStates": "S_P_STRING", + "SuspendProgram": "S_P_STRING", + "SuspendRate": "S_P_UINT16", + "SuspendTime": "S_P_STRING", + "SuspendTimeout": "S_P_UINT16", + "SwitchParameters": "S_P_STRING", + "SwitchType": "S_P_STRING", + "TaskEpilog": "S_P_STRING", + "TaskPlugin": "S_P_STRING", + "TaskPluginParam": "S_P_STRING", + "TaskProlog": "S_P_STRING", + "TCPTimeout": "S_P_UINT16", + "TLSParameters": "S_P_STRING", + "TLSType": "S_P_STRING", + "TmpFS": "S_P_STRING", + "TopologyParam": "S_P_STRING", + "TopologyPlugin": "S_P_STRING", + "TrackWCKey": "S_P_BOOLEAN", + "TreeWidth": "S_P_UINT16", + "UnkillableStepProgram": "S_P_STRING", + "UnkillableStepTimeout": "S_P_UINT16", + "UrlParserType": "S_P_STRING", + "UsePAM": "S_P_BOOLEAN", + "VSizeFactor": "S_P_UINT16", + "WaitTime": "S_P_UINT16", + "X11Parameters": "S_P_STRING", + "DownNodes": 
"S_P_ARRAY", + "NodeName": "S_P_ARRAY", + "NodeSet": "S_P_ARRAY", + "PartitionName": "S_P_ARRAY", + "SlurmctldHost": "S_P_ARRAY" + }, + "slurmdbd.conf": { + "AllowNoDefAcct": "S_P_BOOLEAN", + "AllResourcesAbsolute": "S_P_BOOLEAN", + "ArchiveDir": "S_P_STRING", + "ArchiveEvents": "S_P_BOOLEAN", + "ArchiveJobs": "S_P_BOOLEAN", + "ArchiveResvs": "S_P_BOOLEAN", + "ArchiveScript": "S_P_STRING", + "ArchiveSteps": "S_P_BOOLEAN", + "ArchiveSuspend": "S_P_BOOLEAN", + "ArchiveTXN": "S_P_BOOLEAN", + "ArchiveUsage": "S_P_BOOLEAN", + "AuthAltTypes": "S_P_STRING", + "AuthAltParameters": "S_P_STRING", + "AuthInfo": "S_P_STRING", + "AuthType": "S_P_STRING", + "CommitDelay": "S_P_UINT16", + "CommunicationParameters": "S_P_STRING", + "DbdAddr": "S_P_STRING", + "DbdBackupHost": "S_P_STRING", + "DbdHost": "S_P_STRING", + "DbdPort": "S_P_UINT16", + "DebugFlags": "S_P_STRING", + "DebugLevel": "S_P_STRING", + "DebugLevelSyslog": "S_P_STRING", + "DefaultQOS": "S_P_STRING", + "DisableCoordDBD": "S_P_BOOLEAN", + "DisableArchiveCommands": "S_P_BOOLEAN", + "HashPlugin": "S_P_STRING", + "JobPurge": "S_P_UINT32", + "LogFile": "S_P_STRING", + "LogTimeFormat": "S_P_STRING", + "MaxPurgeLimit": "S_P_UINT32", + "MaxQueryTimeRange": "S_P_STRING", + "MessageTimeout": "S_P_UINT16", + "Parameters": "S_P_STRING", + "PidFile": "S_P_STRING", + "PluginDir": "S_P_STRING", + "PrivateData": "S_P_STRING", + "PurgeEventAfter": "S_P_STRING", + "PurgeJobAfter": "S_P_STRING", + "PurgeResvAfter": "S_P_STRING", + "PurgeStepAfter": "S_P_STRING", + "PurgeSuspendAfter": "S_P_STRING", + "PurgeTXNAfter": "S_P_STRING", + "PurgeUsageAfter": "S_P_STRING", + "PurgeEventMonths": "S_P_UINT32", + "PurgeJobMonths": "S_P_UINT32", + "PurgeStepMonths": "S_P_UINT32", + "PurgeSuspendMonths": "S_P_UINT32", + "PurgeTXNMonths": "S_P_UINT32", + "PurgeUsageMonths": "S_P_UINT32", + "SlurmUser": "S_P_STRING", + "StepPurge": "S_P_UINT32", + "StorageBackupHost": "S_P_STRING", + "StorageHost": "S_P_STRING", + "StorageLoc": "S_P_STRING", + 
"StorageParameters": "S_P_STRING", + "StoragePass": "S_P_STRING", + "StoragePassScript": "S_P_STRING", + "StoragePort": "S_P_UINT16", + "StorageType": "S_P_STRING", + "StorageUser": "S_P_STRING", + "TCPTimeout": "S_P_UINT16", + "TLSParameters": "S_P_STRING", + "TLSType": "S_P_STRING", + "TrackWCKey": "S_P_BOOLEAN", + "TrackSlurmctldDown": "S_P_BOOLEAN" + }, + "cgroup.conf": { + "CgroupAutomount": "S_P_BOOLEAN", + "CgroupMountpoint": "S_P_STRING", + "CgroupSlice": "S_P_STRING", + "ConstrainCores": "S_P_BOOLEAN", + "ConstrainRAMSpace": "S_P_BOOLEAN", + "AllowedRAMSpace": "S_P_FLOAT", + "MaxRAMPercent": "S_P_FLOAT", + "MinRAMSpace": "S_P_UINT64", + "ConstrainSwapSpace": "S_P_BOOLEAN", + "AllowedSwapSpace": "S_P_FLOAT", + "MaxSwapPercent": "S_P_FLOAT", + "MemoryLimitEnforcement": "S_P_BOOLEAN", + "MemoryLimitThreshold": "S_P_FLOAT", + "ConstrainDevices": "S_P_BOOLEAN", + "AllowedDevicesFile": "S_P_STRING", + "MemorySwappiness": "S_P_UINT64", + "CgroupPlugin": "S_P_STRING", + "IgnoreSystemd": "S_P_BOOLEAN", + "IgnoreSystemdOnFailure": "S_P_BOOLEAN", + "EnableControllers": "S_P_BOOLEAN", + "EnableExtraControllers": "S_P_STRING", + "SignalChildrenProcesses": "S_P_BOOLEAN", + "SystemdTimeout": "S_P_UINT64" + }, + "gres.conf": { + "AutoDetect": "S_P_STRING", + "Count": "S_P_STRING", + "CPUs": "S_P_STRING", + "Cores": "S_P_STRING", + "File": "S_P_STRING", + "Files": "S_P_STRING", + "Flags": "S_P_STRING", + "Link": "S_P_STRING", + "Links": "S_P_STRING", + "MultipleFiles": "S_P_STRING", + "Name": "S_P_STRING", + "Type": "S_P_STRING" + }, + "oci.conf": { + "ContainerPath": "S_P_STRING", + "CreateEnvFile": "S_P_STRING", + "DisableHooks": "S_P_STRING", + "EnvExclude": "S_P_STRING", + "MountSpoolDir": "S_P_STRING", + "RunTimeCreate": "S_P_STRING", + "RunTimeDelete": "S_P_STRING", + "RunTimeKill": "S_P_STRING", + "RunTimeEnvExclude": "S_P_STRING", + "RunTimeQuery": "S_P_STRING", + "RunTimeRun": "S_P_STRING", + "RunTimeStart": "S_P_STRING", + "SrunPath": "S_P_STRING", + "SrunArgs": 
"S_P_ARRAY", + "DisableCleanup": "S_P_BOOLEAN", + "StdIODebug": "S_P_STRING", + "SyslogDebug": "S_P_STRING", + "FileDebug": "S_P_STRING", + "DebugFlags": "S_P_STRING", + "IgnoreFileConfigJson": "S_P_BOOLEAN" + }, + "acct_gather.conf": { + "EnergyIPMIDriverType": "S_P_UINT32", + "EnergyIPMIDisableAutoProbe": "S_P_UINT32", + "EnergyIPMIDriverAddress": "S_P_UINT32", + "EnergyIPMIRegisterSpacing": "S_P_UINT32", + "EnergyIPMIDriverDevice": "S_P_STRING", + "EnergyIPMIProtocolVersion": "S_P_UINT32", + "EnergyIPMIUsername": "S_P_STRING", + "EnergyIPMIPassword": "S_P_STRING", + "EnergyIPMIPrivilegeLevel": "S_P_UINT32", + "EnergyIPMIAuthenticationType": "S_P_UINT32", + "EnergyIPMICipherSuiteId": "S_P_UINT32", + "EnergyIPMISessionTimeout": "S_P_UINT32", + "EnergyIPMIRetransmissionTimeout": "S_P_UINT32", + "EnergyIPMIWorkaroundFlags": "S_P_UINT32", + "EnergyIPMIRereadSdrCache": "S_P_BOOLEAN", + "EnergyIPMIIgnoreNonInterpretableSensors": "S_P_BOOLEAN", + "EnergyIPMIBridgeSensors": "S_P_BOOLEAN", + "EnergyIPMIInterpretOemData": "S_P_BOOLEAN", + "EnergyIPMISharedSensors": "S_P_BOOLEAN", + "EnergyIPMIDiscreteReading": "S_P_BOOLEAN", + "EnergyIPMIIgnoreScanningDisabled": "S_P_BOOLEAN", + "EnergyIPMIAssumeBmcOwner": "S_P_BOOLEAN", + "EnergyIPMIEntitySensorNames": "S_P_BOOLEAN", + "EnergyIPMIFrequency": "S_P_UINT32", + "EnergyIPMICalcAdjustment": "S_P_BOOLEAN", + "EnergyIPMIPowerSensors": "S_P_STRING", + "EnergyIPMITimeout": "S_P_UINT32", + "EnergyIPMIVariable": "S_P_STRING", + "ProfileHDF5Dir": "S_P_STRING", + "ProfileHDF5Default": "S_P_STRING", + "ProfileInfluxDBDatabase": "S_P_STRING", + "ProfileInfluxDBDefault": "S_P_STRING", + "ProfileInfluxDBFrequency": "S_P_UINT32", + "ProfileInfluxDBHost": "S_P_STRING", + "ProfileInfluxDBPass": "S_P_STRING", + "ProfileInfluxDBRTPolicy": "S_P_STRING", + "ProfileInfluxDBTimeout": "S_P_UINT32", + "ProfileInfluxDBUser": "S_P_STRING", + "InterconnectOFEDPort": "S_P_UINT32", + "InfinibandOFEDPort": "S_P_UINT32", + "SysfsInterfaces": "S_P_STRING" + 
}, + "burst_buffer.conf": { + "AllowUsers": "S_P_STRING", + "CreateBuffer": "S_P_STRING", + "DefaultPool": "S_P_STRING", + "DenyUsers": "S_P_STRING", + "DestroyBuffer": "S_P_STRING", + "Directive": "S_P_STRING", + "Flags": "S_P_STRING", + "GetSysState": "S_P_STRING", + "GetSysStatus": "S_P_STRING", + "Granularity": "S_P_STRING", + "OtherTimeout": "S_P_UINT32", + "PollInterval": "S_P_UINT32", + "Pools": "S_P_STRING", + "StageInTimeout": "S_P_UINT32", + "StageOutTimeout": "S_P_UINT32", + "StartStageIn": "S_P_STRING", + "StartStageOut": "S_P_STRING", + "StopStageIn": "S_P_STRING", + "StopStageOut": "S_P_STRING", + "ValidateTimeout": "S_P_UINT32" + }, + "helpers.conf": { + "AllowUserBoot": "S_P_STRING", + "BootTime": "S_P_UINT32", + "ExecTime": "S_P_UINT32", + "Feature": "S_P_ARRAY", + "MutuallyExclusive": "S_P_LIST", + "NodeName": "S_P_ARRAY" + }, + "job_container.conf": { + "AutoBasePath": "S_P_BOOLEAN", + "BasePath": "S_P_ARRAY", + "EntireStepInNS": "S_P_BOOLEAN", + "InitScript": "S_P_STRING", + "Shared": "S_P_BOOLEAN", + "CloneNSScript": "S_P_STRING", + "CloneNSEpilog": "S_P_STRING", + "CloneNSScript_Wait": "S_P_UINT32", + "CloneNSEpilog_Wait": "S_P_UINT32" + }, + "mpi.conf": { + "PMIxCliTmpDirBase": "S_P_STRING", + "PMIxCollFence": "S_P_STRING", + "PMIxDebug": "S_P_UINT32", + "PMIxDirectConn": "S_P_BOOLEAN", + "PMIxDirectConnEarly": "S_P_BOOLEAN", + "PMIxDirectConnUCX": "S_P_BOOLEAN", + "PMIxDirectSameArch": "S_P_BOOLEAN", + "PMIxEnv": "S_P_STRING", + "PMIxFenceBarrier": "S_P_BOOLEAN", + "PMIxNetDevicesUCX": "S_P_STRING", + "PMIxShareServerTopology": "S_P_BOOLEAN", + "PMIxTimeout": "S_P_UINT32", + "PMIxTlsUCX": "S_P_CSV" + }, + "topology.conf": { + "SwitchName": "S_P_ARRAY", + "LinkSpeed": "S_P_UINT32", + "Nodes": "S_P_STRING", + "Switches": "S_P_STRING", + "BlockName": "S_P_ARRAY", + "BlockSizes": "S_P_STRING" + }, + "type_definitions": { + "S_P_IGNORE": "Any instance of specified key and associated value in a file will be allowed, but the value will not be 
stored", + "S_P_STRING": "String value", + "S_P_PLAIN_STRING": "Plain string value (not expanded in S_P_EXPLINE contexts)", + "S_P_LONG": "Long integer value", + "S_P_UINT16": "Unsigned 16-bit integer", + "S_P_UINT32": "Unsigned 32-bit integer", + "S_P_UINT64": "Unsigned 64-bit integer", + "S_P_POINTER": "Pointer type (custom handler)", + "S_P_ARRAY": "Array of values (allows multiple occurrences)", + "S_P_LIST": "List of values (allows multiple occurrences)", + "S_P_CSV": "Comma-separated values", + "S_P_BOOLEAN": "Boolean value (true/false, yes/no)", + "S_P_LINE": "Nested configuration line with sub-options", + "S_P_EXPLINE": "Expanded line with hostlist expansion support", + "S_P_FLOAT": "Floating point value", + "S_P_DOUBLE": "Double precision floating point", + "S_P_LONG_DOUBLE": "Long double precision floating point" + } +} From 6030165253d7817f19f4f407646dc5e169a54a11 Mon Sep 17 00:00:00 2001 From: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> Date: Mon, 9 Feb 2026 13:03:21 +0530 Subject: [PATCH 079/172] Update config.py Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> --- .../library/module_utils/input_validation/common_utils/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 4de8aafa88..a2cd35b9c4 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -76,6 +76,7 @@ "storage": [files["storage_config"]], "prepare_oim": [ files["network_spec"], + files["software_config"] ], # "high_availability": [files["high_availability_config"]], # "additional_software": [files["additional_software"]], From 220ef45651136da5504ba8904d82410ace242a00 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Mon, 9 Feb 2026 08:49:03 +0000 Subject: [PATCH 
080/172] Enhance input validation for powervault config in storage_config.yml Signed-off-by: Vrinda_Marwah --- .../schema/storage_config.json | 11 +++++--- ...ci-group-slurm_control_node_x86_64.yaml.j2 | 2 +- input/storage_config.yml | 25 ++++++++----------- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/storage_config.json b/common/library/module_utils/input_validation/schema/storage_config.json index 41746905f1..114d88f525 100644 --- a/common/library/module_utils/input_validation/schema/storage_config.json +++ b/common/library/module_utils/input_validation/schema/storage_config.json @@ -51,7 +51,8 @@ "minItems": 1 }, "powervault_config": { - "required": ["ip", "isci_initiators", "volume_id"], + "type": "object", + "required": ["ip", "iscsi_initiators", "volume_id"], "properties": { "ip": { "description": "List of target controller IP addresses", @@ -69,14 +70,16 @@ "type": "integer" }, - "isci_initiators": { + "iscsi_initiators": { "description": "iSCSI initiator IQN", - "type": "string" + "type": "string", + "pattern": "^iqn\\.[a-zA-Z0-9.-]+(?::[a-zA-Z0-9._:-]+)?$" }, "volume_id": { "description": "Volume identifier (hex string)", - "type": "string" + "type": "string", + "pattern": "^[a-fA-F0-9]+$" } } } diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 2f2721d7eb..d99d9dc90f 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -92,7 +92,7 @@ PORTALS=({% for ip in powervault_config.ip %}"{{ ip }}" {% endfor %}) PORT="{{ powervault_config.port | default(3260) }}" - INITIATOR_IQN="{{ powervault_config.isci_initiators | default('') }}" + INITIATOR_IQN="{{ 
powervault_config.iscsi_initiators | default('') }}" VOLUME_ID="{{ powervault_config.volume_id | default('') }}" FS_TYPE="{{ powervault_config.fs_type | default('xfs') }}" MOUNT_OPTS="{{ powervault_config.mount_options | default('defaults,_netdev,noatime') }}" diff --git a/input/storage_config.yml b/input/storage_config.yml index 48eac2d5cc..3ad961b405 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -19,28 +19,23 @@ # -----------------------------Powervault------------------------------------------- # powervault_config -# ip: ipv4 -# A list of PowerVault controller IP addresses used for iSCSI target discovery and login. -# In this configuration, a single controller portal is provided. - -# port: -# Defines the TCP port for the iSCSI target service. -# Port 3260 is the standard port for iSCSI communication. +# Mandatory when using PowerVault for persistent storage. +# Below parameters are mandatory when powervault_config is defined + # ip: A list of PowerVault controller ipv4 addresses used for iSCSI target discovery and login. + # iscsi_initiators: Specifies the InitiatorName used by the host when connecting to the iSCSI target. This IQN uniquely identifies the host to the storage array. + # volume_id: This is the unique WWN/identifier for the specific volume that should be used for persistent storage. This value is used for multipath scanning to select the correct mapped device. -# isci_initiators: -# Specifies the InitiatorName used by the host when connecting to the iSCSI target. -# This IQN uniquely identifies the host to the storage array. +# Below are the optional parameters when powervault_config is defined + # port: Defines the TCP port for the iSCSI target service. When port is not specified, default port used will be 3260 -# volume_id: -# This is the unique WWN/identifier for the -# specific volume that should be used for persistent storage. 
-# The script uses this value during multipath scanning to select the correct mapped device +# Below is an example on how to configure powervault_config +# In this configuration, a single controller portal is provided. #powervault_config: # ip: # - 172.1.2.3 # port: 3260 -# isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7 +# iscsi_initiators: iqn.initiator.com.example:7d7d7d7d7d7 # volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 From 8f02f5c460d5d0bf7939474c7a78aaf620f20dcc Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Mon, 9 Feb 2026 08:51:29 +0000 Subject: [PATCH 081/172] Update copyright in storage_config.yml Signed-off-by: Vrinda_Marwah --- input/storage_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/input/storage_config.yml b/input/storage_config.yml index 3ad961b405..9492f15558 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 9a44bcf3195d93f42b8a015e01e8f830a27dcfe7 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Mon, 9 Feb 2026 14:49:22 +0530 Subject: [PATCH 082/172] config file update Signed-off-by: pullan1 --- common/library/module_utils/local_repo/config.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 5c515c527c..e26e8a6e71 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -142,10 +142,7 @@ "get_repo_version": "pulp container repository show --href %s", "list_tags_by_version": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s", "rename_repository": "pulp container repository update --name %s --new-name %s", - "orphan_cleanup": "pulp orphan cleanup" - - - "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'", + "orphan_cleanup": "pulp orphan cleanup", "container_distribution_show": "pulp container distribution show --name %s | jq .repository", "show_repository_version": "pulp container repository show --href %s | jq .latest_version_href", "list_image_tags": "pulp show --href /pulp/api/v3/content/container/tags/?repository_version=%s" From 88c526923f7bc3b16681e4d7f766bcc8fc09efd5 Mon Sep 17 00:00:00 2001 From: balajikumaran-c-s Date: Mon, 9 Feb 2026 15:14:24 +0530 Subject: [PATCH 083/172] Skip podman image pull when image already exists for pulp and openchami --- .../openchami/tasks/deployment_prereq.yml | 19 +++++++++++++++---- .../pulp/tasks/deployment_prereq.yml | 15 +++++++++++++-- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml b/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml index 109bc725f3..1558152a50 100644 --- 
a/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml +++ b/prepare_oim/roles/deploy_containers/openchami/tasks/deployment_prereq.yml @@ -13,18 +13,29 @@ # limitations under the License. --- -- name: Pull OpenCHAMI images using Podman +- name: Check if OpenCHAMI images already exist ansible.builtin.command: - cmd: "podman pull {{ item }}" + cmd: "podman image exists {{ item }}" loop: "{{ openchami_images }}" + register: openchami_image_exists + changed_when: false + failed_when: false + +- name: Pull OpenCHAMI images using Podman when missing + ansible.builtin.command: + cmd: "podman pull {{ item.item }}" + loop: "{{ openchami_image_exists.results }}" + loop_control: + label: "{{ item.item }}" register: pull_result retries: "{{ pull_image_retries }}" delay: "{{ pull_image_delay }}" until: pull_result.rc == 0 changed_when: false + when: item.rc != 0 - name: Fail if any OpenCHAMI image pull failed ansible.builtin.fail: msg: "Failed to pull OpenCHAMI image: {{ item.item }}. 
Error: {{ item.stderr }}" - loop: "{{ pull_result.results }}" - when: item.rc != 0 + loop: "{{ pull_result.results | default([]) }}" + when: item.rc is defined and item.rc != 0 diff --git a/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml b/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml index 09ec52e6a4..dc143b03c5 100644 --- a/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml +++ b/prepare_oim/roles/deploy_containers/pulp/tasks/deployment_prereq.yml @@ -38,7 +38,14 @@ when: hostname_enabled no_log: true -- name: Pull Pulp image using Podman +- name: Check if Pulp image already exists + ansible.builtin.command: + cmd: "podman image exists {{ pulp_image }}" + register: pulp_image_exists + changed_when: false + failed_when: false + +- name: Pull Pulp image using Podman when missing ansible.builtin.command: cmd: "podman pull {{ pulp_image }}" register: pulp_pull_result @@ -46,11 +53,15 @@ delay: "{{ pull_image_delay }}" until: pulp_pull_result is not failed changed_when: false + when: pulp_image_exists.rc != 0 - name: Fail if Pulp image pull failed ansible.builtin.fail: msg: "Failed to pull Pulp image: {{ pulp_image }}. 
Error: {{ pulp_pull_result.stderr }}" - when: pulp_pull_result.rc != 0 + when: + - pulp_image_exists.rc != 0 + - pulp_pull_result.rc is defined + - pulp_pull_result.rc != 0 - name: Invoke Pulp Container Deployment Tasks for HTTP ansible.builtin.include_tasks: deploy_pulp_container_http.yml From d4cdf690b31f2a97dd05616aa10fe54140e7f770 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 9 Feb 2026 15:18:16 +0530 Subject: [PATCH 084/172] defect fix for local repo validation when subscription is enabled --- .../input_validation/common_utils/config.py | 8 +- .../validation_flows/local_repo_validation.py | 100 ++++++++++++------ .../roles/validate_input/tasks/main.yml | 26 +++-- .../roles/validate_input/vars/main.yml | 3 + 4 files changed, 92 insertions(+), 45 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 4de8aafa88..b5d3676165 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -26,8 +26,12 @@ # log path for input validator INPUT_VALIDATOR_LOG_PATH = '/opt/omnia/log/core/playbooks/' -ENTITLEMENT_PEM = '/opt/omnia/rhel_repo_certs/*.pem' -REDHAT_REPO_FILE = '/opt/omnia/rhel_repo_certs/redhat.repo' +# Subscription checking paths - checked in order of priority +SYSTEM_ENTITLEMENT_PATH = '/etc/pki/entitlement/*.pem' +SYSTEM_REDHAT_REPO = '/etc/yum.repos.d/redhat.repo' + +OMNIA_ENTITLEMENT_PATH = '/opt/omnia/rhel_repo_certs/*.pem' +OMNIA_REDHAT_REPO = '/opt/omnia/rhel_repo_certs/redhat.repo' # dict to hold the file names. If any file's name changes just change it here. 
files = { diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index bcec9f4197..343a4f3de1 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -29,43 +29,77 @@ def check_subscription_status(logger=None): """ - Check if the system has an active Red Hat subscription. - Subscription status is considered True if either entitlement - certificates exist or the required Red Hat repository URLs are present. - - Checks mounted host paths (/etc/pki/entitlement, /etc/yum.repos.d/redhat.repo). + Check if the system has an active Red Hat subscription enabled. + If system entitlement certificates are found in /etc/pki/entitlement, + only system paths are checked. Otherwise, Omnia paths are checked. + Subscription is enabled only if entitlement certificates and required + Red Hat repository URLs are found in the same source (system or Omnia). Returns: - bool: True if the system is subscribed (either entitlement certs - exist or required repos are present), False otherwise. - """ - # 1. Check entitlement certs - entitlement_certs = glob.glob(config.ENTITLEMENT_PEM) - has_entitlement = len(entitlement_certs) > 0 - if logger: - logger.info(f"Entitlement certs in {config.ENTITLEMENT_PEM}: {len(entitlement_certs)} found") - - # 2. Check redhat repos in redhat.repo + bool: True if subscription is enabled (both entitlement certs + and repos are found in the same source), False otherwise. + """ + # 1. 
Check system entitlement certs first + system_entitlement_certs = glob.glob(config.SYSTEM_ENTITLEMENT_PATH) + has_system_entitlement = len(system_entitlement_certs) > 0 + + if has_system_entitlement: + # System entitlement found - use system paths only + entitlement_certs = system_entitlement_certs + has_entitlement = True + repo_file_to_check = config.SYSTEM_REDHAT_REPO + + if logger: + logger.info(f"Found {len(system_entitlement_certs)} system entitlement certs - using system paths only") + else: + # No system entitlement - check Omnia paths + omnia_entitlement_certs = glob.glob(config.OMNIA_ENTITLEMENT_PATH) + entitlement_certs = omnia_entitlement_certs + has_entitlement = len(omnia_entitlement_certs) > 0 + repo_file_to_check = config.OMNIA_REDHAT_REPO + + if logger: + logger.info(f"No system entitlement found - checking Omnia paths: {len(omnia_entitlement_certs)} certs found") + + # 2. Check repos based on which entitlement path was used + has_repos = False repo_urls = [] - redhat_repo = config.REDHAT_REPO_FILE - if os.path.exists(redhat_repo): - with open(redhat_repo, "r") as f: - for line in f: - if line.startswith("baseurl ="): - url = line.split("=", 1)[1].strip() - if re.search(r"(codeready-builder|baseos|appstream)", url, re.IGNORECASE): - repo_urls.append(url) - - has_repos = len(repo_urls) > 0 - if logger: - logger.info(f"Repo URLs in {redhat_repo}: {len(repo_urls)} found") - - # 3. 
Subscription status logic - subscription_status = has_entitlement or has_repos + redhat_repo_used = None + + if os.path.exists(repo_file_to_check): + try: + with open(repo_file_to_check, "r") as f: + for line in f: + if line.startswith("baseurl ="): + url = line.split("=", 1)[1].strip() + if re.search(r"(codeready-builder|baseos|appstream)", url, re.IGNORECASE): + repo_urls.append(url) + + if repo_urls: + has_repos = True + redhat_repo_used = repo_file_to_check + if logger: + logger.info(f"Found {len(repo_urls)} repo URLs in {repo_file_to_check}") + elif logger: + logger.info(f"No required repo URLs found in {repo_file_to_check}") + except (IOError, OSError) as e: + if logger: + logger.warning(f"Error reading {repo_file_to_check}: {e}") + elif logger: + logger.info(f"Repo file {repo_file_to_check} does not exist") + + # 3. Subscription enabled if entitlement and repos are found in the same source + subscription_enabled = has_entitlement and has_repos + if logger: - logger.info(f"Subscription status: {subscription_status} (entitlement={has_entitlement}, repos={has_repos})") - - return subscription_status + logger.info( + f"Subscription enabled: {subscription_enabled} " + f"(entitlement={has_entitlement}, repos={has_repos}, " + f"entitlement_source={entitlement_certs[0] if entitlement_certs else 'None'}, " + f"repo_source={redhat_repo_used})" + ) + + return subscription_enabled # Below is a validation function for each file in the input folder def validate_local_repo_config(input_file_path, data, diff --git a/input_validation/roles/validate_input/tasks/main.yml b/input_validation/roles/validate_input/tasks/main.yml index ff11c79950..6a1c773ee5 100644 --- a/input_validation/roles/validate_input/tasks/main.yml +++ b/input_validation/roles/validate_input/tasks/main.yml @@ -23,14 +23,20 @@ # then the "all" tag should be removed so that only the config files related to that playbook are validated. 
input_validate_tags: "{{ omnia_run_tags | default([]) | difference(['all']) if (omnia_run_tags | length) >= 2 else omnia_run_tags | default([]) }}" - validate_input: - omnia_base_dir: "{{ (input_dir + '/../') | ansible.builtin.realpath }}" - project_name: "{{ project_name }}" - tag_names: "{{ input_validate_tags }}" - module_utils_path: "{{ (role_path + '/../../../common/library/module_utils/') | ansible.builtin.realpath }}" - register: validation_status - when: (input_validate_tags | length) > 0 + block: + - name: Run validation + validate_input: + omnia_base_dir: "{{ (input_dir + '/../') | ansible.builtin.realpath }}" + project_name: "{{ project_name }}" + tag_names: "{{ input_validate_tags }}" + module_utils_path: "{{ (role_path + '/../../../common/library/module_utils/') | ansible.builtin.realpath }}" + register: validation_status + when: (input_validate_tags | length) > 0 -- name: Debug validation status - ansible.builtin.debug: - msg: "{{ messages.validation_success }}" + - name: Debug validation status + ansible.builtin.debug: + msg: "{{ messages.validation_success }}" + rescue: + - name: Failed due to validation failure + ansible.builtin.fail: + msg: "{{ messages.validation_error }}" diff --git a/input_validation/roles/validate_input/vars/main.yml b/input_validation/roles/validate_input/vars/main.yml index 3c6f2b1aff..4655e7b25a 100644 --- a/input_validation/roles/validate_input/vars/main.yml +++ b/input_validation/roles/validate_input/vars/main.yml @@ -18,3 +18,6 @@ project_name: "{{ hostvars['localhost']['project_name'] }}" messages: validation_success: "Successfully validated Omnia input config file(s)" + validation_error: > + Input validation failed. 
+ For detailed validation errors, see: {{ ansible_failed_result.log_file }} From c4f60eebc7e7dc5029bda6b51a091ea94f12c328 Mon Sep 17 00:00:00 2001 From: Kratika_Patidar Date: Mon, 9 Feb 2026 10:06:28 +0000 Subject: [PATCH 085/172] defect fix input validation mismatch between login_compiler and slurm_node --- .../validation_flows/provision_validation.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 7eef7bef20..cc6b4d8e76 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -91,6 +91,65 @@ def validate_functional_groups_separation(pxe_mapping_file_path): if errors: raise ValueError("PXE mapping file group separation validation errors: " + "; ".join([str(e) for e in errors])) +def validate_slurm_login_compiler_prefix(pxe_mapping_file_path): + """Validate that slurm_node and login_compiler entries align on architecture suffix when both are present. + + - Functional group suffix must be either _x86_64 or _aarch64 (case-sensitive). + - When both slurm_node* and login_compiler_node* are present, their suffixes must match. + + Raises ValueError with details if suffixes differ. Prefix differences are allowed. 
+ """ + + if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): + raise ValueError(f"PXE mapping file not found: {pxe_mapping_file_path}") + + with open(pxe_mapping_file_path, "r", encoding="utf-8") as fh: + raw_lines = fh.readlines() + + non_comment_lines = [ln for ln in raw_lines if ln.strip()] + reader = csv.DictReader(non_comment_lines) + + fieldname_map = {fn.strip().upper(): fn for fn in reader.fieldnames} + fg_col = fieldname_map.get("FUNCTIONAL_GROUP_NAME") + hostname_col = fieldname_map.get("HOSTNAME") + + if not fg_col or not hostname_col: + raise ValueError("FUNCTIONAL_GROUP_NAME or HOSTNAME column not found in PXE mapping file") + + arch_map = {"slurm_node": [], "login_compiler_node": []} + + for row_idx, row in enumerate(reader, start=2): + fg_name = row.get(fg_col, "").strip() if row.get(fg_col) else "" + hostname = row.get(hostname_col, "").strip() if row.get(hostname_col) else "" + if not fg_name or not hostname: + continue + + fg_arch = None + fg_base = fg_name + for suffix in ("_x86_64", "_aarch64"): + if fg_name.endswith(suffix): + fg_arch = suffix.lstrip("_") + fg_base = fg_name[: -len(suffix)] + break + + if fg_base in arch_map and fg_arch: + arch_map[fg_base].append((fg_arch, row_idx)) + + if not arch_map["slurm_node"] or not arch_map["login_compiler_node"]: + return + + slurm_arch, _ = arch_map["slurm_node"][0] + login_arch, _ = arch_map["login_compiler_node"][0] + if slurm_arch != login_arch: + slurm_rows = [str(r[1]) for r in arch_map["slurm_node"]] + login_rows = [str(r[1]) for r in arch_map["login_compiler_node"]] + raise ValueError( + "Architecture suffix mismatch between slurm_node and login_compiler_node. " + f"slurm_node suffix '{slurm_arch}' vs " + f"login_compiler_node suffix '{login_arch}' " + "Ensure both use the same suffix (_x86_64 or _aarch64)." + ) + def validate_duplicate_hostnames_in_mapping_file(pxe_mapping_file_path): """ Validates that HOSTNAME values in the mapping file are unique. 
@@ -684,6 +743,7 @@ def validate_provision_config( validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path) validate_functional_groups_separation(pxe_mapping_file_path) validate_parent_service_tag_hierarchy(pxe_mapping_file_path) + validate_slurm_login_compiler_prefix(pxe_mapping_file_path) # Validate ADMIN_IPs against network_spec.yml ranges network_spec_path = create_file_path(input_file_path, file_names["network_spec"]) From f89761e9970a944af6efd3ded5a93fb184284ae4 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Mon, 9 Feb 2026 17:26:36 +0530 Subject: [PATCH 086/172] Update omnia.sh --- omnia.sh | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 4 deletions(-) diff --git a/omnia.sh b/omnia.sh index f05c9ebe84..746fb8fd34 100755 --- a/omnia.sh +++ b/omnia.sh @@ -825,6 +825,13 @@ EOF echo "nfs_type: $nfs_type" } >> "$oim_metadata_file" fi + else + sed -i '/^upgrade_backup_dir:/d' "$oim_metadata_file" >/dev/null 2>&1 || true + if grep -q '^omnia_version:' "$oim_metadata_file"; then + sed -i "s/^omnia_version:.*/omnia_version: $omnia_release/" "$oim_metadata_file" >/dev/null 2>&1 || true + else + echo "omnia_version: $omnia_release" >> "$oim_metadata_file" + fi fi # --- Remove old service if exists --- @@ -924,6 +931,7 @@ validate_nfs_server() { } init_ssh_config() { + mkdir -p "$HOME/.ssh" touch $HOME/.ssh/known_hosts # Add entry to /root/.ssh/known_hosts file to prevent errors caused by Known host ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1 # Remove existing entry if it exists @@ -964,6 +972,8 @@ start_container_session() { -------------------------------------------------------------------------------------------------------------------------------------------------- ${NC}" + init_ssh_config + # Entering Omnia-core container ssh omnia_core } @@ -1192,6 +1202,11 @@ phase1_validate() { return 1 fi + if [ "$previous_omnia_version" = "2.1.0.0" ]; then + echo "[ERROR] 
[ORCHESTRATOR] Upgrade already performed. Current Omnia version is 2.1.0.0. No further upgrade required." + return 1 + fi + if [ "$previous_omnia_version" != "2.0.0.0" ]; then echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version" return 1 @@ -1241,7 +1256,7 @@ phase1_validate() { } phase2_approval() { - local backup_base default_backup_dir + local backup_base default_backup_dir current_omnia_version echo "[INFO] [ORCHESTRATOR] Phase 2: Approval Gate" echo "============================================" @@ -1256,10 +1271,16 @@ phase2_approval() { echo " - Additional Package Installation" echo "============================================" - default_backup_dir="$CONTAINER_BACKUPS_DIR/upgrade" + current_omnia_version=$(podman exec -u root omnia_core /bin/bash -c "grep '^omnia_version:' '$CONTAINER_METADATA_FILE' | cut -d':' -f2 | tr -d ' \t\n\r'" 2>/dev/null) + if [ -z "$current_omnia_version" ]; then + echo "[ERROR] [ORCHESTRATOR] Failed to read omnia_version from metadata inside container" + return 1 + fi + + default_backup_dir="$CONTAINER_BACKUPS_DIR/upgrade/version_${current_omnia_version}" backup_base="$default_backup_dir" - echo "[INFO] [ORCHESTRATOR] Backup destination: $backup_base" + echo "[INFO] [ORCHESTRATOR] Backup destination (inside omnia_core container): $backup_base" if ! update_metadata_upgrade_backup_dir "$backup_base"; then echo "[ERROR] [ORCHESTRATOR] Failed to update upgrade backup directory in metadata" @@ -1331,6 +1352,85 @@ phase3_backup_creation() { return 0 } +phase4_container_swap() { + local quadlet_file="/etc/containers/systemd/omnia_core.container" + local i + + echo "[INFO] [ORCHESTRATOR] Phase 4: Container Swap" + + if [ ! 
-f "$quadlet_file" ]; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Stopping omnia_core 1.0 container" + systemctl stop omnia_core.service >/dev/null 2>&1 || true + + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[WARN] [ORCHESTRATOR] omnia_core still running; forcing stop" + podman stop -t 30 omnia_core >/dev/null 2>&1 || true + fi + + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Starting omnia_core 1.1 Quadlet unit" + if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" + return 1 + fi + + if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' "$quadlet_file"; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 1.1 in quadlet file" + return 1 + fi + + systemctl daemon-reload || { + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed" + return 1 + } + + systemctl start omnia_core.service || { + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" + return 1 + } + + echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 1.1 health check (60s)" + for i in $(seq 1 60); do + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + break + fi + sleep 1 + done + + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to 2.1.0.0" + if ! podman exec -u root omnia_core bash -c " + set -e + if [ ! 
-f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: 2.1.0.0/' '$CONTAINER_METADATA_FILE' + else + echo 'omnia_version: 2.1.0.0' >> '$CONTAINER_METADATA_FILE' + fi + "; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Phase 4: Container swap completed" + return 0 +} + upgrade_omnia_core() { local lock_file="/var/lock/omnia_core_upgrade.lock" local backup_base @@ -1367,7 +1467,14 @@ upgrade_omnia_core() { exit 1 fi - echo "[INFO] [ORCHESTRATOR] Upgrade tasks for container swap are deferred to a follow-up PR" + if ! phase4_container_swap; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4" + exit 1 + fi + + echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" + echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base" + start_container_session exit 0 } From e2228b62028af13647bf426924fe10a1a953065c Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Mon, 9 Feb 2026 17:27:34 +0530 Subject: [PATCH 087/172] Revert "Create test.sh" This reverts commit aae7e28ebde2e207e93c97852a7abfd19aebe215. --- test.sh | 1555 ------------------------------------------------------- 1 file changed, 1555 deletions(-) delete mode 100644 test.sh diff --git a/test.sh b/test.sh deleted file mode 100644 index cd1f8e63e7..0000000000 --- a/test.sh +++ /dev/null @@ -1,1555 +0,0 @@ -#!/bin/bash - -# Copyright © 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# This script is used to generate the Omnia core docker image. -# The image is based on Fedora and uses systemd to start all of the necessary -# services. -# -# This script prompts the user for the Omnia shared path and the root -# password. It then checks if the Omnia shared path exists. -# -# The script checks if the ssh key file exists. If it does not exist, a new ssh - -# Color Definitions -RED='\033[0;31m' -GREEN='\033[0;32m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color -YELLOW='\033[0;33m' -omnia_release=2.1.0.0 - -core_container_status=false -omnia_path="" -hashed_passwd="" -domain_name="" - -is_local_ip() { - local ip_to_check="$1" - - # Get all local IP addresses (excluding loopback) - local local_ips - local_ips=$(hostname -I) - - # Check if the IP matches any local IP - if echo "$local_ips" | grep -qw "$ip_to_check"; then - return 0 # IP is local - else - return 1 # IP is not local - fi -} - -OMNIA_BASE_DIR="/opt/omnia" -OMNIA_INPUT_DIR="/opt/omnia/input" -OMNIA_BACKUPS_DIR="/opt/omnia/backups" -OMNIA_METADATA_DIR="/opt/omnia/.data" -OMNIA_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml" - -update_metadata_upgrade_backup_dir() { - local backup_dir="$1" - - if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then - echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running" - return 1 - fi - - podman exec -u root omnia_core bash -c " - set -e - if [ ! 
-f '$OMNIA_METADATA_FILE' ]; then - echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2 - exit 1 - fi - if grep -q '^upgrade_backup_dir:' '$OMNIA_METADATA_FILE'; then - sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$OMNIA_METADATA_FILE' - else - echo 'upgrade_backup_dir: ${backup_dir}' >> '$OMNIA_METADATA_FILE' - fi - " -} - - - -check_internal_nfs_export() { - nfs_server_ip=$1 - nfs_server_share_path=$2 - - if is_local_ip "$nfs_server_ip"; then - echo "The provided NFS server IP ($nfs_server_ip) belongs to the current system." - else - echo "The provided NFS server IP ($nfs_server_ip) is NOT the current system's IP." - exit 1 - fi - - # Query the remote server for exports - exports=$(showmount -e "$nfs_server_ip" 2>/dev/null) - - if [[ $? -ne 0 ]]; then - echo -e "${RED}ERROR: Unable to contact NFS server at $nfs_server_ip. Ensure NFS and rpcbind are running, and firewall allows access.${NC}" - exit 1 - fi - - # Check if path is in the export list - if echo "$exports" | awk '{print $1}' | grep -Fxq "$nfs_server_share_path"; then - echo -e "${GREEN}Path $nfs_server_share_path is exported by $nfs_server_ip.${NC}" - else - echo -e "${RED}ERROR: Path $nfs_server_share_path is NOT exported by $nfs_server_ip.${NC}" - exit 1 - fi -} - -display_supported_use_cases() { - # Color definitions - BLUE='\033[1;34m' - YELLOW='\033[1;33m' - GREEN='\033[1;32m' - NC='\033[0m' # No Color - - # Introductory Guidance - echo -e "${BLUE} ----------------- Omnia Shared Path Configuration ---------------- ${NC}" - echo -e "${BLUE} Please choose the type of Omnia shared path in Omnia Infrastructure Manager (OIM): ${NC}" - echo -e "${BLUE} It is recommended to use a external NFS share for the Omnia shared path. ${NC}" - echo -e "${BLUE} If you are not using NFS, make sure enough space is available on the disk. 
${NC}" - echo -e "${YELLOW} Using a Extrenal NFS share is mandatory for Omnia shared path if you are planning to have high availability in OIM or require K8s service cluster. ${NC}" - echo -e "\nSupported Use Cases:\n" - - # Table content - { - echo -e "Share Option\tType\tDescription\tAdditional Info" - echo -e "${GREEN}NFS\tExternal\tExternal NFS server(outside OIM) created by user\tMust be reachable from OIM and service nodes. Mounts on OIM. Recommended for HA and hierarchical clusters.${NC}" - echo -e "NFS\tInternal\tNFS server created by user in OIM\tUsed only for flat provisioning. No HA or k8s service cluster support. No mount performed." - echo -e "Local\tDisk\tDisk storage in OIM\tUsed only for flat provisioning. No HA or hierarchical support." - } | column -t -s $'\t' -} - - -# This function is responsible for initializing the Omnia core container -# It prompts the user for the Omnia shared path and the root password. -# It checks if the Omnia shared path exists. -setup_omnia_core() { - # Validate the system environment - validate_oim - - # Initialize the container configuration - init_container_config - - # Setup the container - setup_container - - # Post container setup configuration - post_setup_config - - # Start the container - start_container_session -} - - -# This function is responsible for cleaning up the Omnia core container. -# It removes the container and performs the necessary cleanup steps. -cleanup_omnia_core() { - # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') - if [ -n "$critical_running" ]; then - echo -e "${RED}Failed to intiatiate omnia_core container cleanup. 
There are other critical service containers still running:${NC}" - echo "$critical_running" - echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}" - exit 1 - fi - - echo -e "${RED} WARNING: This will remove Omnia core container and all files in Omnia Shared Path.${NC}" - echo -e "${GREEN} You can abort and take backup if you want.${NC}" - read -p " Are you sure you want to continue with the cleanup? (y/n): " confirm - if [ "$confirm" = "n" ] || [ "$confirm" = "N" ]; then - echo -e "${GREEN}Aborting.${NC}" - exit 0 - elif [ "$confirm" = "y" ] || [ "$confirm" = "Y" ]; then - - # Fetch the configuration from the Omnia core container. - fetch_config - - # Remove the container - remove_container - - # Perform the necessary cleanup steps - cleanup_config - fi -} - - -# This function is responsible for cleaning up the Omnia core container configuration. -# It removes the public key from the authorized_keys file. -# It removes the private key. -# It removes the ssh key from the known_hosts file. -# It removes the Omnia core configuration. -# -cleanup_config(){ - - # Set the path to the ssh public key. - ssh_key_file="$HOME/.ssh/oim_rsa.pub" - - # Remove the public key from the authorized_keys file. - if [ -f "$ssh_key_file" ]; then - # Remove the line from the authorized_keys file. - sed -i "\|^$(cat $ssh_key_file)$|d" $HOME/.ssh/authorized_keys - echo -e "${GREEN} Public key has been removed from authorized_keys.${NC}" - else - echo -e "${RED} Public key file not found.${NC}" - fi - - # Remove the SSH key pair. - ssh_key_file="$HOME/.ssh/oim_rsa" - ssh_key_file_pub="${ssh_key_file}.pub" - if [ -f "$ssh_key_file" ] && [ -f "$ssh_key_file_pub" ]; then - rm -f "$ssh_key_file" "$ssh_key_file_pub" - echo -e "${GREEN} SSH key pair have been removed.${NC}" - else - echo -e "${RED} SSH key file not found.${NC}" - fi - - # Remove the ssh key from the known_hosts file. 
- echo -e "${BLUE} Removing ssh key from known_hosts file.${NC}" - ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1 - - - # Remove the host entry from the config file in .ssh folder. - ssh_config_file="$HOME/.ssh/config" - if [ -f "$ssh_config_file" ]; then - sed -i '/Host omnia_core/,+5d' "$ssh_config_file" - echo -e "${GREEN} Host entry has been removed from config file.${NC}" - else - echo -e "${RED} Config file not found.${NC}" - fi - - # Remove the Omnia core configuration. - echo -e "${BLUE} Removing Omnia core configuration.${NC}" - rm -rf $omnia_path/omnia/{hosts,input,log,pulp,provision,pcs,ssh_config,tmp,.data} - - # Unmount the NFS shared path if the share option is NFS. - if [ "$share_option" = "NFS" ] && [ "$nfs_type" = "external" ]; then - umount "$omnia_path" - if [ $? -eq 0 ]; then - echo -e "${GREEN} NFS shared path has been unmounted.${NC}" - else - echo -e "${RED} Failed to unmount NFS shared path.${NC}" - fi - # Remove the entry from /etc/fstab - fstab_file="/etc/fstab" - if [ -f "$fstab_file" ]; then - # Create a backup of the fstab file. - cp "$fstab_file" "$fstab_file.bak" - - # Remove the line from the fstab file. - sed -i "\#$omnia_path#d" "$fstab_file" - if [ $? -ne 0 ]; then - echo -e "${RED} Failed to remove the entry from /etc/fstab.${NC}" - fi - fi - fi - - echo -e "${GREEN} Omnia core configuration has been cleaned up.${NC}" -} - -# This function is responsible for removing the Omnia core container. -# -# It removes the container using the 'podman rm -f' command. -# If the container is removed successfully, it prints a success message. -# Otherwise, it prints an error message. -remove_container() { - # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') - if [ -n "$critical_running" ]; then - echo -e "${RED}Failed to intiatiate omnia_core container cleanup. 
There are other critical service containers still running:${NC}" - echo "$critical_running" - echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}" - exit 1 - fi - - # Remove the container. - echo -e "${BLUE} Removing the Omnia core container.${NC}" - if systemctl stop omnia_core.service; then - echo -e "${GREEN} Omnia core container has been removed.${NC}" - # Remove the systemd generator symlinks. - echo -e "${GREEN} Cleaning up systemd generator symlinks.${NC}" - rm -f /run/systemd/generator/omnia_core.service - rm -f /run/systemd/generator/multi-user.target.wants/omnia_core.service - rm -f /run/systemd/generator/default.target.wants/omnia_core.service - - echo -e "${GREEN} Cleaning up omnia_core.container.${NC}" - rm -f /etc/containers/systemd/omnia_core.container - - # Remove the omnia_core.service file. - rm -f /etc/systemd/system/omnia_core.service - systemctl daemon-reload - systemctl reset-failed omnia_core.service - # check if service is removed - if systemctl status omnia_core.service >/dev/null 2>&1; then - echo -e "${RED} Failed to remove Omnia core service.${NC}" - else - echo -e "${GREEN} Omnia core service has been removed.${NC}" - fi - else - echo -e "${RED} Failed to remove Omnia core container.${NC}" - fi - - # Remove the container image. - # if podman rmi omnia_core; then - # echo -e "${GREEN} Omnia core image has been removed.${NC}" - # else - # echo -e "${RED} Failed to remove Omnia core image.${NC}" - # fi -} - - -# This function is responsible for initializing the Omnia core container. -# -# It prompts the user for the Omnia shared path and the root -# password. It then checks if the Omnia shared path exists. -# -# The function generates the ssh key pair and copies the private -# key to the Omnia shared path. -# -# The function also copies the ssh public key to the -# authorized_keys file. -# -# The function creates the necessary log directories. 
-init_container_config() { - - share_option="" - # Display the supported use cases - display_supported_use_cases - - # Display the choices for the user - echo -e "${BLUE} Choose the type of Omnia shared path:${NC}" - options=( "NFS (recommended)" "Local" ) - - PS3="Select the option number: " - - select opt in "${options[@]}"; do - case $opt in - "NFS (recommended)") - share_option="NFS" - break - ;; - "Local") - share_option="Local" - break - ;; - *) - echo -e "${RED} Invalid option.${NC}" - continue - esac - done - - case $share_option in - "Local") - # Prompt the user for the Omnia shared path. - echo -e "${BLUE} Please provide Omnia shared path:${NC}" - read -p "Omnia shared path: " omnia_path - - # Check if the Omnia shared path is absolute path and path exists. - if [[ "$omnia_path" != /* ]] || [ ! -d "$omnia_path" ]; then - echo -e "${RED} Omnia shared path is not an absolute path or does not exist! Please re-run omnia.sh --install with valid Omnia shared path.${NC}" - exit 1 - fi - ;; - "NFS") - echo -e "${BLUE} Select NFS type:${NC}" - select nfs_type in "External (Recommended)" "Internal"; do - case $nfs_type in - "External (Recommended)") - echo -e "${BLUE} Please provide the external NFS server IP:${NC}" - read -p "External NFS server IP: " nfs_server_ip - - echo -e "${BLUE} Please provide the external NFS server share path:${NC}" - read -p "External NFS share path: " nfs_server_share_path - - echo -e "${BLUE} Please provide the OIM client share path (mount target):${NC}" - read -p "Omnia shared path: " omnia_path - - # Validate Omnia shared path is absolute - if [[ "$omnia_path" != /* ]]; then - echo -e "${RED}Omnia shared path must be an absolute path.${NC}" - exit 1 - fi - - nfs_type="external" - break - ;; - "Internal") - echo -e "${BLUE} Please provide the OIM server IP:${NC}" - read -p "OIM server IP: " nfs_server_ip - - echo -e "${BLUE} Please provide the OIM server share path:${NC}" - read -p "OIM server share path: " nfs_server_share_path - - 
echo -e "${BLUE} Checking if the OIM server share path is mounted${NC}" - check_internal_nfs_export "$nfs_server_ip" "$nfs_server_share_path" - - # Note: No mounting performed here - echo -e "${YELLOW}Note: Internal NFS does not support HA OIM or hierarchical cluster. Proceeding...${NC}" - nfs_type="internal" - omnia_path="$nfs_server_share_path" - break - ;; - *) - echo -e "${RED}Invalid option. Please choose 1 or 2.${NC}" - ;; - esac - done - ;; - esac - - - # Prompt the user for the Omnia core root password. - echo -e "${BLUE} Please provide Omnia core root password for accessing container:${NC}" - - read -p " Enter: " -s passwd - - # Prompt the user for the Omnia core root password confirmation. - echo -e "\n${BLUE} Please confirm password:${NC}" - read -s -p " Enter: " cnf_passwd - - # Check if the provided passwords match. - if [ "$passwd" != "$cnf_passwd" ]; then - echo -e "${RED} Invalid Omnia core root password, passwords do not match!${NC}" - exit 1 - fi - - # Check if the password contains any of the invalid characters - invalid_chars='[\\|&;`"><*?!$(){}[\]]' - if [[ "$passwd" =~ $invalid_chars ]]; then - echo -e "${RED} Invalid password, passwords must not contain any of these special characters: [\\|&;\`\"><*?!$(){}[\]]${NC}" - exit 1 - fi - - # Install NFS client package if option NFS is selected - if [[ "$share_option" == "NFS" ]]; then - # Install NFS client package - echo -e "${BLUE} Installing NFS client package.${NC}" - dnf install -y nfs-utils nfs4-acl-tools - - # Create omnia_path directory if it does not exist - echo -e "${BLUE} Creating omnia shared path directory if it does not exist.${NC}" - mkdir -p $omnia_path - - # Mount NFS server share path in Omnia share path - if [[ "$nfs_type" == "external" ]]; then - - if is_local_ip "$nfs_server_ip"; then - echo -e "${RED} Error: NFS server $nfs_server_ip is a local IP.${NC}" - echo -e "${RED} Please provide an external NFS server IP or re-run omnia.sh --install with valid options.${NC}" - exit 1 
- fi - - # Validate if NFS server is reachable - echo -e "${BLUE} Validating if NFS server is reachable.${NC}" - ping -c1 -W1 $nfs_server_ip > /dev/null - if [ $? -ne 0 ]; then - echo -e "${RED} NFS server $nfs_server_ip is not reachable.${NC}" - exit 1 - fi - - echo -e "${BLUE} Mounting NFS server share path in Omnia share path.${NC}" - mount -t nfs -o nosuid,rw,sync,hard,intr,timeo=30 "$nfs_server_ip:$nfs_server_share_path" "$omnia_path" - if [[ $? -ne 0 ]]; then - echo -e "${RED} Failed to mount NFS. Please check the IP and path.${NC}" - exit 1 - fi - # Validate if NFS server share path is mounted - echo -e "${BLUE} Validating if NFS server share path is mounted.${NC}" - # strip the trailing slash from nfs_server_share_path - nfs_server_share_path="${nfs_server_share_path%/}" - if grep -qs "$nfs_server_ip:$nfs_server_share_path" /proc/mounts; then - echo -e "${GREEN} NFS server share path is mounted.${NC}" - else - echo -e "${RED} NFS server share path is not mounted. Provide valid NFS server details. ${NC}" - exit 1 - fi - # Add NFS server share to /etc/fstab to mount on startup - echo "$nfs_server_ip:$nfs_server_share_path $omnia_path nfs nosuid,rw,sync,hard,intr" >> /etc/fstab - else - echo -e "${BLUE} Using internal NFS path without mounting.${NC}" - fi - - fi - - hashed_passwd=$(openssl passwd -1 $passwd) - ssh_key_file="/root/.ssh/oim_rsa" - ssh_port=2222 - - # Generate a new ssh key pair. - if [ -f "$ssh_key_file" ]; then - echo -e "\n${BLUE} Skipping generating new ssh key pair.${NC}" - else - echo -e "\n${GREEN} Generating a new ssh key pair.${NC}" - ssh-keygen -t rsa -b 4096 -C "omnia_oim" -q -N '' -f /root/.ssh/oim_rsa - { - echo "Host omnia_core" - echo " Hostname localhost" - echo " Port $ssh_port" - echo " User root" - echo " IdentityFile ~/.ssh/oim_rsa" - echo " IdentitiesOnly yes" - } >> $HOME/.ssh/config - fi - - # Create the ssh configuration directory if it does not exist. 
- echo -e "${GREEN} Creating the ssh configuration directory if it does not exist.${NC}" - mkdir -p "$omnia_path/omnia/ssh_config/.ssh" - - # Copy the omnia_core ssh config to the shared path. - echo -e "${GREEN} Copying the omnia_core ssh config to the omnia shared path.${NC}" - cp "$HOME/.ssh/config" "$omnia_path/omnia/ssh_config/.ssh/config" - - # Copy the oim_rsa ssh key to the shared path. - echo -e "${GREEN} Copying the oim_rsa ssh key to the omnia shared path.${NC}" - cp "$HOME/.ssh/oim_rsa" "$omnia_path/omnia/ssh_config/.ssh/oim_rsa" - - # Copy the ssh private key to the omnia shared path. - echo -e "${GREEN} Copying the ssh private key to the omnia shared path.${NC}" - cp $ssh_key_file "$omnia_path/omnia/ssh_config/.ssh/id_rsa" - - # Copy the ssh public key to the omnia shared path. - echo -e "${GREEN} Copying the ssh public key to the omnia shared path.${NC}" - cp $ssh_key_file.pub "$omnia_path/omnia/ssh_config/.ssh/id_rsa.pub" - - # Get the ssh public key. - ssh_public_key="$(cat /root/.ssh/oim_rsa.pub)" - - validate_nfs_server - - # Add ssh public key to the authorized_keys. - echo -e "${GREEN} Adding ssh public key to the authorized_keys.${NC}" - if grep -q "$ssh_public_key" $HOME/.ssh/authorized_keys; then - echo -e "${BLUE} Skipping adding ssh public key to the authorized_keys.${NC}" - else - echo "$ssh_public_key" >> $HOME/.ssh/authorized_keys - chmod 600 $HOME/.ssh/authorized_keys - fi - - # Add ssh public key to the authorized_keys in the ssh_config directory. 
- echo -e "${GREEN} Adding ssh public key to the authorized_keys in the Omnia ssh_config directory.${NC}" - if [ -f "$omnia_path/omnia/ssh_config/.ssh/authorized_keys" ] && grep -q "$ssh_public_key" "$omnia_path/omnia/ssh_config/.ssh/authorized_keys"; then - echo -e "${BLUE} Skipping adding ssh public key to the authorized_keys in the Omnia ssh_config directory.${NC}" - else - echo "$ssh_public_key" >> "$omnia_path/omnia/ssh_config/.ssh/authorized_keys" - chmod 600 "$omnia_path/omnia/ssh_config/.ssh/authorized_keys" - fi - - # Create the log directory if it does not exist. - echo -e "${GREEN} Creating the log directory if it does not exist.${NC}" - mkdir -p "$omnia_path/omnia/log/core/container" - mkdir -p "$omnia_path/omnia/log/core/playbooks" - - # Create the hosts file for cluster in $omnia_path/omnia/hosts - echo -e "${GREEN} Creating the hosts file for cluster.${NC}" - touch "$omnia_path/omnia/hosts" - - # Create the pulp_ha directory if it does not exist. - echo -e "${GREEN} Creating the pulp HA directory if it does not exist.${NC}" - mkdir -p "$omnia_path/omnia/pulp/pulp_ha" -} - - -# This function is responsible for fetching the configuration from the Omnia core. -# It uses podman exec to run a command in the Omnia core container. -# The command retrieves the metadata from the oim_metadata.yml file. -# The metadata is then parsed and the required configuration is extracted. -fetch_config() { - - # Fetch the metadata from the oim_metadata.yml file. - echo -e "${GREEN} Fetching the metadata from the oim_metadata.yml file.${NC}" - core_config=$(podman exec -ti omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml') - - # Split the metadata into separate lines. - IFS=$'\n' read -r -d '' -a config_lines <<<"$core_config" - - # Loop through the lines and extract the required configuration. - for line in "${config_lines[@]}"; do - # Extract the key and value from the line. 
- key=$(echo "$line" | awk -F ':' '{print $1}') - value=$(echo "$line" | awk -F ':' '{print $2}') - - # Check the key and assign the value to the corresponding variable. - case $key in - oim_shared_path) - # Assign the shared path. - omnia_path=$(echo "$value" | tr -d '[:space:]') - ;; - omnia_core_hashed_passwd) - # Assign the hashed password. - hashed_passwd=$(echo "$value" | tr -d '[:space:]') - ;; - nfs_server_ip) - # Assign the nfs server ip. - nfs_server_ip=$(echo "$value" | tr -d '[:space:]') - ;; - nfs_server_share_path) - # Assign the nfs server share path. - nfs_server_share_path=$(echo "$value" | tr -d '[:space:]') - ;; - omnia_share_option) - # Assign the share option. - share_option=$(echo "$value" | tr -d '[:space:]') - ;; - nfs_type) - # Assign the share option. - nfs_type=$(echo "$value" | tr -d '[:space:]') - ;; - esac - done - # Check if the required configuration is extracted successfully. - if [ -z "$omnia_path" ] || [ -z "$hashed_passwd" ]; then - echo -e "${RED} Failed to fetch data from metadata file.${NC}" - exit 1 - else - echo -e "${GREEN} Successfully fetched data from metadata file.${NC}" - fi -} - -# Validates the OIM (Omnia Infrastructure Manager) by checking if the hostname is -# configured with a domain name, checking if Podman is installed, enabling and -# starting the Podman socket. -validate_oim() { - # Check if the hostname is set - hostname_value=$(hostname) - if [[ -z "$hostname_value" ]]; then - echo -e "${RED}Hostname is not set!${NC}" - exit 1 - fi - - # Check if the hostname is static - static_hostname=$(hostnamectl --static) - current_hostname=$(hostname) - if [[ "$static_hostname" != "$current_hostname" ]]; then - echo -e "${RED}Static Hostname is unset. 
Current: '$current_hostname', Static: '$static_hostname'${NC}" - echo -e "${RED}Please set the static hostname and try again.${NC}" - echo -e "${BLUE}Command to set hostname: hostnamectl set-hostname ${NC}" - echo -e "${RED}Exiting...${NC}" - exit 1 - fi - - # Check if the hostname is configured with a domain name. - domain_name=$(hostname -d) - if [[ -n "$domain_name" ]]; then - echo -e "${BLUE}Hostname is configured with a domain name: $domain_name${NC}" - else - echo -e "${RED}Invalid hostname, hostname is not configured with a domain name!${NC}" - exit 1 - fi - - # Detect OIM timezone from systemd in a stable, case‑independent way - oim_timezone=$(timedatectl show -p Timezone --value 2>/dev/null) - - # Fallbacks if needed (non‑systemd or old timedatectl) - if [[ -z "$oim_timezone" ]]; then - if [[ -f /etc/timezone ]]; then - # Debian/Ubuntu style - oim_timezone=$(< /etc/timezone) - elif [[ -L /etc/localtime ]]; then - # Derive from /etc/localtime symlink - oim_timezone=$(readlink -f /etc/localtime | sed -n 's|^.*zoneinfo/||p') - fi - fi - - podman --version - - # Capture the exit status - if [ $? -eq 0 ]; then - echo -e "${BLUE} Podman is installed. Version: $(podman --version)${NC}" - else - echo -e "${RED} Podman is not installed.${NC}" - exit 1 - fi - - # Enable the podman socket to start at boot - echo -e "${BLUE} Enabling podman.socket...${NC}" - systemctl enable podman.socket - - # Start the podman socket now - echo -e "${BLUE} Starting podman.socket...${NC}" - systemctl start podman.socket - - # Print a success message after enabling and starting the podman socket - echo -e "${GREEN} Podman socket has been enabled and started.${NC}" -} - -# Checks if the required directories for Omnia are present. -# This function iterates over a list of required directories/files and checks if each one exists. 
-check_required_directories() { - required_paths=( - "$omnia_path/omnia" - "$omnia_path/omnia/ssh_config/.ssh" - "$omnia_path/omnia/log/core/container" - "$omnia_path/omnia/hosts" - "$omnia_path/omnia/pulp/pulp_ha" - ) - - missing_paths=() - - for path in "${required_paths[@]}"; do - if [ ! -e "$path" ]; then # Checks both files and directories - missing_paths+=("$path") - fi - done - - if [ "${#missing_paths[@]}" -ne 0 ]; then - echo -e "${RED}Error: The following required files or directories are missing:${NC}" - echo -e "${RED}${missing_paths[*]}${NC}" - echo -e "User can not Retain Existing configuration" - echo - echo -e "${YELLOW}Instructions:${NC}" - echo -e "${YELLOW}* Backup any existing files if required${NC}" - echo -e "${YELLOW}* Run ./omnia.sh --install and choose:${NC}" - echo -e "${YELLOW} Options:${NC}" - echo -e "${YELLOW} -> Reinstall the container${NC}" - echo -e "${YELLOW} -> Overwrite and create new configuration${NC}" - exit 1 - fi -} - -# Sets up the Omnia core container. -# This function pulls the Omnia core Podman image and runs the container. -# Creates a Quadlet service for the container and also creates a metadata file. -# It defines the container options and runs the container. 
-setup_container() { - container_name="omnia_core" - echo "==> Setting up $container_name container" - - # SELinux option handling - selinux_option=":z" - if [ "$share_option" = "NFS" ] && [ "$nfs_type" = "external" ]; then - selinux_option="" - fi - - # Check if RHEL subscription is enabled - subscription_enabled=false - if [ -d "/etc/pki/entitlement" ] && [ "$(ls -A /etc/pki/entitlement/*.pem 2>/dev/null)" ]; then - subscription_enabled=true - fi - - # --- Generate Quadlet container file --- - cat > /etc/containers/systemd/${container_name}.container <> /etc/containers/systemd/${container_name}.container <> /etc/containers/systemd/${container_name}.container <> "$oim_metadata_file" - if [ "$share_option" = "NFS" ]; then - { - echo "nfs_server_ip: $nfs_server_ip" - echo "nfs_server_share_path: $nfs_server_share_path" - echo "nfs_type: $nfs_type" - } >> "$oim_metadata_file" - fi - fi - - # --- Remove old service if exists --- - if systemctl list-unit-files | grep -q "${container_name}.service"; then - systemctl stop ${container_name}.service - systemctl disable ${container_name}.service - rm -f /etc/systemd/system/${container_name}.service - fi - - # --- Reload systemd so Quadlet generates the service --- - systemctl daemon-reexec - systemctl daemon-reload - systemctl start ${container_name}.service - - # --- Start the container via Quadlet --- - echo "==> ${container_name} container deployed and starting via Quadlet" - - # --- Wait for container to be running --- - echo "Waiting for $container_name container to start..." - for i in {1..30}; do - if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then - echo "$container_name container is running." - break - else - sleep 1 - fi - done - - if ! 
podman ps --format '{{.Names}}' | grep -qw "$container_name"; then - echo -e "${RED}Error: $container_name container failed to start.${NC}" - rm -rf "$OMNIA_METADATA_FILE" - exit 1 - fi - - systemctl start firewalld - systemctl enable firewalld - firewall-cmd --permanent --zone=public --add-port=2222/tcp - firewall-cmd --reload -} - -# This function sets up the configuration for the Omnia core. -# post_setup_config is a function that sets up the configuration for the Omnia core. -# It creates the necessary directories and files, copies input files from the Omnia container, -# and creates the oim_metadata.yml file. -post_setup_config() { - - # Create the ansible tmp directory if it does not exist. - mkdir -p "$omnia_path/omnia/tmp/.ansible/tmp" - chmod 757 "$omnia_path/omnia/tmp/.ansible/tmp" - # Create the input directory if it does not exist. - echo -e "${GREEN} Creating the input directory if it does not exist.${NC}" - mkdir -p "$OMNIA_INPUT_DIR/" - - # Create the default.yml file if it does not exist. - # This file contains the name of the project. - if [ ! -f "$OMNIA_INPUT_DIR/default.yml" ]; then - echo -e "${BLUE} Creating default.yml file.${NC}" - { - echo "# This file defines the project name." - echo "# The name of the project should be set in a directory under input." 
- echo "project_name: project_default" - } >> "$OMNIA_INPUT_DIR/default.yml" - fi - - # Copy input files from /omnia to /opt/omnia/project_default/ inside omnia_core container - podman exec -u root omnia_core bash -c "cd /omnia && git pull" - echo -e "${BLUE} Moving input files from /omnia dir to project_default folder.${NC}" - podman exec -u root omnia_core bash -c " - mkdir -p /opt/omnia/input/project_default - cp -r /omnia/input/* /opt/omnia/input/project_default - rm -rf /omnia/input - rm -rf /omnia/omnia.sh" - - init_ssh_config -} - -validate_nfs_server() { - - # Validate NFS server permission - if [ "$share_option" = "NFS" ]; then - # Create a temporary file inside $omnia_path - temp_file="$omnia_path/temp_file" - touch "$temp_file" - # Check if the file can be chown to root - if chown root:root "$temp_file"; then - rm "$temp_file" - else - echo "Error: Unable to chown file to root in $omnia_path. NFS server permission validation failed. Please ensure no_root_squash option is enabled in the NFS export configuration." - exit 1 - fi - if [ "`ls -ld $omnia_path/omnia/ssh_config/.ssh/id_rsa | awk '{print $3 ":" $4}'`" != "root:root" ]; then - echo "Error: The $omnia_path/omnia/ssh_config/.ssh/id_rsa file should be owned by root:root. NFS server permission validation failed. Please verify the NFS export configuration." - exit 1 - fi - fi - -} - -init_ssh_config() { - touch $HOME/.ssh/known_hosts - # Add entry to /root/.ssh/known_hosts file to prevent errors caused by Known host - ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1 # Remove existing entry if it exists - ssh-keyscan -p 2222 localhost 2>/dev/null | grep -v "^#" >> $HOME/.ssh/known_hosts # Scan and add the new key -} - -start_container_session() { - - echo -e "${GREEN} - ------------------------------------------------------------------------------------------------------------------------------------------ - Omnia Core container running successfully. 
- - Entering the container from Omnia Infrastructure Manager(OIM): - Through podman: - # podman exec -it -u root omnia_core bash - - Direct SSH: - # ssh omnia_core - - You are now in the Omnia environment. - - The following are the main directories available in the Omnia core container: - - - The shared directory, which is mapped to $omnia_path in OIM: /opt/omnia - - The input directory: /opt/omnia/input - - The Omnia source code directory: /omnia - - The Omnia playbooks logs directory: /opt/omnia/log/core/playbooks - - It's important to note: - - Files placed in the shared directory should not be manually deleted. - - Use the playbook /omnia/utils/oim_cleanup.yml to safely remove the shared directory and Omnia containers (except the core container). - - If you need to delete the core container, please run the omnia.sh script with --uninstall option. - - If you need to redeploy the core container with new input configs, please rerun the omnia.sh script with --install option. - - Provide any file paths (ISO, mapping files, etc.) that are mentioned in input files in the /opt/omnia directory. - - The domain name that will be used for Omnia is $domain_name, if you wish to change the domain name please cleanup Omnia, - change the Omnia Infrastructure Manager's domain name and rerun omnia.sh script with --install option. 
- - -------------------------------------------------------------------------------------------------------------------------------------------------- - ${NC}" - - # Entering Omnia-core container - ssh omnia_core -} - -show_help() { - echo "Usage: $0 [--install | --uninstall | --upgrade | --version | --help]" - echo " -i, --install Install and start the Omnia core container" - echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" - echo " --upgrade Upgrade the Omnia core container from image tag 1.0 to 1.1" - echo " -v, --version Display Omnia version information" - echo " -h, --help More information about usage" -} - -install_omnia_core() { - local omnia_core_tag="1.1" - local omnia_core_registry="" - - # Check if local omnia_core:1.1 exists - if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" - # Check if latest exists for backward compatibility - elif podman inspect omnia_core:latest >/dev/null 2>&1; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}" - # Tag it as 1.1 for consistency - podman tag omnia_core:latest omnia_core:${omnia_core_tag} - else - echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}" - echo -e "${YELLOW}Omnia no longer pulls images from Docker Hub. Build/load the image locally and retry.${NC}" - echo "" - echo -e "${YELLOW}One way to build the image locally:${NC}" - echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container" - echo -e "2. Navigate to the repository directory:" - echo -e " cd omnia-artifactory" - echo -e "3. 
Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core omnia_branch=" - echo "" - echo -e "${YELLOW}Then re-run:${NC}" - echo -e " ./omnia.sh --install" - exit 1 - fi - - # Check if any other containers with 'omnia' in their name are running - other_containers=$(podman ps -a --format '{{.Names}}' | grep -E 'omnia' | grep -v 'omnia_core') - - # If there are any, exit - if [ -n "$other_containers" ]; then - echo -e "${RED} Failed to intiatiate omnia_core container cleanup. There are other omnia container running.${NC}" - echo -e "${GREEN} Execute oim_cleanup.yml first to cleanup all containers.${NC}" - ssh omnia_core - exit 1 - fi - - # Check if the omnia_core container is already running - running_containers=$(podman ps -a --format '{{.Names}} {{.State}}' | grep -E 'omnia_core') - - # If yes, set the variable to true - if [ -n "$running_containers" ]; then - core_container_status=true - fi - - # If core container is running - if [ "$core_container_status" = true ]; then - if [ -n "$(echo "$running_containers" | grep -E 'running')" ]; then - echo -e "${GREEN} Omnia core container is already running.${NC}" - echo -e "${GREEN} Do you want to:${NC}" - PS3="Select the option number: " - - select opt in "Enter omnia_core container" "Reinstall the container" "Exit"; do - case $opt in - "Enter omnia_core container") - choice=1 - break - ;; - "Reinstall the container") - choice=2 - break - ;; - "Exit") - echo "Exiting the script." - exit 0 - ;; - *) - echo "Invalid choice. Please try again." 
- continue - ;; - esac - done - - # If the user wants to enter omnia_core container - if [ "$choice" = "1" ]; then - start_container_session - fi - # If the user wants to reinstall, call the remove_container function, and then call the setup_omnia_core function - if [ "$choice" = "2" ]; then - # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') - if [ -n "$critical_running" ]; then - echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}" - echo "$critical_running" - echo -e "${GREEN}Run oim_cleanup.yml first to cleanup all containers.${NC}" - exit 1 - fi - echo -e "${GREEN} What configuration do you want to use for reinstallation:${NC}" - - PS3="Select the option number: " - - select opt in "Retain Existing configuration" "Overwrite and create new configuration" "Exit"; do - case $opt in - "Retain Existing configuration") - choice=1 - break - ;; - "Overwrite and create new configuration") - choice=2 - break - ;; - "Exit") - echo "Exiting the script." - exit 0 - ;; - *) - echo "Invalid choice. Please try again." 
- continue - ;; - esac - done - - # If the user wants to retain existing configuration, call the remove_container function - if [ "$choice" = "1" ]; then - fetch_config - check_required_directories - remove_container - setup_container - init_ssh_config - start_container_session - # If the user wants to overwrite and create new configuration, call the cleanup_omnia_core function - elif [ "$choice" = "2" ]; then - cleanup_omnia_core - setup_omnia_core - fi - fi - else - # If omnia_core container exists and is not running call the remove_container function - - echo -e "${RED} The Omnia Core container is present but not in running state.${NC}" - echo -e "${GREEN} Only the core container can be cleanup can be performed.${NC}" - echo -e "${GREEN} Container Configurations in the shared directory will not be cleaned up.${NC}" - echo -e "${GREEN} Do you want to perform cleanup:${NC}" - echo -e "${GREEN} 1. Yes.${NC}" - echo -e "${GREEN} 2. No. ${NC}" - read -p " Enter your choice (1 or 2): " choice - if [ "$choice" = "1" ]; then - remove_container - elif [ "$choice" = "2" ]; then - exit - fi - fi - - # If core container is not present - else - - # Start the container setup - echo -e "${GREEN}Starting Omnia core container setup.${NC}" - setup_omnia_core - fi -} - -# Check if Omnia core container is running -check_container_status() { - # Check if the Omnia core container is running - if ! 
podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then - echo -e "${RED}ERROR: Omnia core container is not running.${NC}" - exit 1 - fi -} - -# Function to display version information -display_version() { - # Check if metadata file exists and Omnia core container is running - check_container_status - - # Fetch the metadata from the oim_metadata.yml file in the container - echo -e "${GREEN} Fetching metadata from omnia_core container...${NC}" - core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml') - - # Extract Omnia version from metadata file - omnia_version=$(echo "$core_config" | grep "omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r') - - # Display version information - echo "Omnia version: $omnia_version" - - # Return exit code 0 on success - exit 0 -} - -phase1_validate() { - local current_image - local core_config - local previous_omnia_version - local shared_path - - echo "[INFO] [ORCHESTRATOR] Phase 1: Pre-Upgrade Validation" - - if [ "$(id -u)" -ne 0 ]; then - if ! sudo -n true >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: run as root or configure passwordless sudo" - return 1 - fi - fi - - if ! 
podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then - echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running" - return 1 - fi - - core_config=$(podman exec omnia_core /bin/bash -c 'cat /opt/omnia/.data/oim_metadata.yml' 2>/dev/null) - if [ -z "$core_config" ]; then - echo "[ERROR] [ORCHESTRATOR] Unable to read oim_metadata.yml from omnia_core container" - return 1 - fi - - previous_omnia_version=$(echo "$core_config" | grep "^omnia_version:" | cut -d':' -f2 | tr -d ' \t\n\r') - if [ -z "$previous_omnia_version" ]; then - echo "[ERROR] [ORCHESTRATOR] omnia_version not found in oim_metadata.yml" - return 1 - fi - - if [ "$previous_omnia_version" != "2.0.0.0" ]; then - echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version" - return 1 - fi - - shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r') - if [ -z "$shared_path" ]; then - echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml" - return 1 - fi - - omnia_path="$shared_path" - - if [ ! -d "$omnia_path" ]; then - echo "[ERROR] [ORCHESTRATOR] Shared path from metadata does not exist on host: $omnia_path" - return 1 - fi - - if [ ! -w "$omnia_path" ]; then - echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on shared path: $omnia_path" - return 1 - fi - - current_image=$(podman inspect omnia_core --format '{{.ImageName}}' 2>/dev/null) - if [ -z "$current_image" ]; then - echo "[ERROR] [ORCHESTRATOR] Unable to inspect omnia_core container image" - return 1 - fi - - if ! echo "$current_image" | grep -qE '(:|@)1\.0(\b|$)'; then - echo "[ERROR] [ORCHESTRATOR] Container version mismatch: expected 1.0, got: $current_image" - return 1 - fi - - echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0)" - - if [ ! 
-d "$OMNIA_BASE_DIR" ]; then - echo "[ERROR] [ORCHESTRATOR] Mount/path invalid: expected directory not found: $OMNIA_BASE_DIR" - echo "[ERROR] [ORCHESTRATOR] Fix: ensure /opt/omnia exists and is mounted (if using external mount)" - return 1 - fi - - if [ ! -w "$OMNIA_BASE_DIR" ]; then - echo "[ERROR] [ORCHESTRATOR] Permission denied: no write permission on $OMNIA_BASE_DIR" - echo "[ERROR] [ORCHESTRATOR] Fix: run as root or fix permissions on /opt/omnia" - return 1 - fi - - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" - echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." - return 1 - fi - - echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed" - return 0 -} - -phase2_approval() { - local backup_base default_backup_dir - - echo "[INFO] [ORCHESTRATOR] Phase 2: Approval Gate" - echo "============================================" - echo "OMNIA UPGRADE SUMMARY" - echo "============================================" - echo "Current Container Tag: 1.0" - echo "Target Container Tag: 1.1" - echo "Current Omnia Release: 2.0.0.0" - echo "Target Omnia Release: 2.1.0.0" - echo "New Features:" - echo " - Add and remove node for slurm cluster" - echo " - Additional Package Installation" - echo "============================================" - - default_backup_dir="$OMNIA_BACKUPS_DIR/upgrade" - backup_base="$default_backup_dir" - - echo "[INFO] [ORCHESTRATOR] Backup destination: $backup_base" - - if ! update_metadata_upgrade_backup_dir "$backup_base"; then - echo "[ERROR] [ORCHESTRATOR] Failed to update upgrade backup directory in metadata" - return 1 - fi - - read -p "Proceed with upgrade? 
(y/N): " confirm - if [ "$confirm" != "y" ] && [ "$confirm" != "Y" ]; then - echo "[INFO] [ORCHESTRATOR] Upgrade cancelled by user" - return 1 - fi - - OMNIA_UPGRADE_BACKUP_PATH="$backup_base" - export OMNIA_UPGRADE_BACKUP_PATH - - echo "[INFO] [ORCHESTRATOR] Phase 2: Approval granted" - return 0 -} - -generate_backup_manifest() { - local backup_path="$1" - local manifest_file="$backup_path/manifest.txt" - - { - echo "backup_version: 1.0" - echo "timestamp: $(date -Iseconds)" - echo "source_container_tag: 1.0" - echo "target_container_tag: 1.1" - echo "source_omnia_release: 2.0.x" - echo "target_omnia_release: 2.1.0.0" - echo "hostname: $(hostname)" - echo "" - echo "files:" - find "$backup_path" -type f ! -name "manifest.txt" -exec echo " - {}" \; - } > "$manifest_file" -} - -verify_backup_integrity() { - local backup_path="$1" - - [ -d "$backup_path" ] || return 1 - [ -d "$backup_path/input" ] || return 1 - [ -d "$backup_path/metadata" ] || return 1 - [ -d "$backup_path/configs" ] || return 1 - [ -f "$backup_path/metadata/oim_metadata.yml" ] || return 1 - [ -f "$backup_path/manifest.txt" ] || return 1 - - return 0 -} - -create_backup() { - local backup_path="$1" - - echo "[INFO] [ORCHESTRATOR] Phase 3: Backup Creation" - - if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then - echo "[ERROR] [ORCHESTRATOR] Cannot create backup because omnia_core is not running" - return 1 - fi - - if ! podman exec -u root omnia_core bash -c " - set -e - mkdir -p '$backup_path/input' '$backup_path/metadata' '$backup_path/configs' - - if [ -d '$OMNIA_INPUT_DIR' ]; then - cp -a '$OMNIA_INPUT_DIR' '$backup_path/' - fi - - if [ ! 
-f '$OMNIA_METADATA_FILE' ]; then - echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2 - exit 1 - fi - cp -a '$OMNIA_METADATA_FILE' '$backup_path/metadata/oim_metadata.yml' - - ts=\"\$(date -Iseconds)\" - hn=\"\$(hostname)\" - { - echo 'backup_version: 1.0' - echo \"timestamp: \$ts\" - echo 'source_container_tag: 1.0' - echo 'target_container_tag: 1.1' - echo 'source_omnia_release: 2.0.x' - echo 'target_omnia_release: 2.1.0.0' - echo \"hostname: \$hn\" - } > '$backup_path/manifest.txt' - "; then - echo "[ERROR] [ORCHESTRATOR] Failed to create backup inside omnia_core container" - return 1 - fi - - if [ -f "/etc/containers/systemd/omnia_core.container" ]; then - if ! podman cp "/etc/containers/systemd/omnia_core.container" "omnia_core:$backup_path/configs/omnia_core.container" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Failed to backup quadlet container file into container backup path" - return 1 - fi - fi - - echo "[INFO] [ORCHESTRATOR] Backup created at: $backup_path" - echo "[INFO] [ORCHESTRATOR] Phase 3: Backup completed" - return 0 -} - -wait_for_container_health() { - local timeout="${1:-60}" - local i - - for i in $(seq 1 "$timeout"); do - if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then - return 0 - fi - sleep 1 - done - return 1 -} - -update_metadata_version() { - local metadata_file="$OMNIA_METADATA_FILE" - - if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then - echo "[ERROR] [ORCHESTRATOR] omnia_core container is not running" - return 1 - fi - - podman exec -u root omnia_core bash -c " - set -e - if [ ! -f '$OMNIA_METADATA_FILE' ]; then - echo '[ERROR] Metadata file not found inside container: $OMNIA_METADATA_FILE' >&2 - exit 1 - fi - if grep -q '^omnia_version:' '$OMNIA_METADATA_FILE'; then - sed -i 's/^omnia_version:.*/omnia_version: 2.1.0.0/' '$OMNIA_METADATA_FILE' - else - echo 'omnia_version: 2.1.0.0' >> '$OMNIA_METADATA_FILE' - fi - " -} - -sync_input_to_shared_path() { - if ! 
podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then - echo "[ERROR] [ORCHESTRATOR] Cannot sync input because omnia_core is not running" - return 1 - fi - - if ! podman exec -u root omnia_core bash -c " - set -e - if [ -d /omnia/input ]; then - mkdir -p /opt/omnia/input/project_default - cp -r /omnia/input/* /opt/omnia/input/project_default - rm -rf /omnia/input - fi - "; then - echo "[ERROR] [ORCHESTRATOR] Failed to copy /omnia/input to /opt/omnia/input/project_default" - return 1 - fi - return 0 -} - -phase4_container_swap() { - echo "[INFO] [ORCHESTRATOR] Phase 4: Container Swap" - - if systemctl list-unit-files | grep -q "omnia_core.service"; then - systemctl stop omnia_core.service >/dev/null 2>&1 || true - fi - - if [ -z "${omnia_path}" ]; then - echo "[ERROR] [ORCHESTRATOR] Shared path (omnia_path) is empty. Phase 1 validation may not have run." - return 1 - fi - - if [ ! -f "/etc/containers/systemd/omnia_core.container" ]; then - echo "[ERROR] [ORCHESTRATOR] Quadlet file not found: /etc/containers/systemd/omnia_core.container" - echo "[ERROR] [ORCHESTRATOR] Cannot proceed with upgrade container swap" - return 1 - fi - - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" - echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." - return 1 - fi - - if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' /etc/containers/systemd/omnia_core.container; then - echo "[ERROR] [ORCHESTRATOR] Failed to update Image in quadlet file" - return 1 - fi - - escaped_omnia_path=$(printf '%s\n' "$omnia_path" | sed 's/[\/&]/\\\\&/g') - if grep -q '^Volume=/omnia\(/\|:\)' /etc/containers/systemd/omnia_core.container; then - if ! 
sed -i "s|^Volume=/omnia\(/\|:\)|Volume=${escaped_omnia_path}\\1|g" /etc/containers/systemd/omnia_core.container; then - echo "[ERROR] [ORCHESTRATOR] Failed to update Volume paths in quadlet file" - return 1 - fi - fi - - systemctl daemon-reload || return 1 - if ! systemctl restart omnia_core.service; then - echo "[ERROR] [ORCHESTRATOR] Failed to restart omnia_core.service" - systemctl status omnia_core.service --no-pager -l || true - journalctl -xeu omnia_core.service --no-pager | tail -n 120 || true - return 1 - fi - - if ! wait_for_container_health 60; then - echo "[ERROR] [ORCHESTRATOR] Container failed health check after swap" - return 1 - fi - - if ! update_metadata_version; then - return 1 - fi - - if ! sync_input_to_shared_path; then - return 1 - fi - - init_ssh_config - - echo "[INFO] [ORCHESTRATOR] Phase 4: Container swap completed" - return 0 -} - -upgrade_omnia_core() { - local lock_file="/var/lock/omnia_core_upgrade.lock" - local backup_path - - if [ -e "$lock_file" ]; then - echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. Another upgrade may be running.${NC}" - exit 1 - fi - - mkdir -p "$(dirname "$lock_file")" 2>/dev/null || true - echo "$$" > "$lock_file" || { - echo -e "${RED}ERROR: Failed to create lock file: $lock_file${NC}" - exit 1 - } - trap 'rm -f "$lock_file"' EXIT - - if ! phase1_validate; then - echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" - exit 1 - fi - - if ! phase2_approval; then - exit 0 - fi - - backup_path="$OMNIA_UPGRADE_BACKUP_PATH" - if [ -z "$backup_path" ]; then - echo "[ERROR] [ORCHESTRATOR] Backup path is empty" - exit 1 - fi - - if ! create_backup "$backup_path"; then - echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 3" - exit 1 - fi - - if ! 
phase4_container_swap; then - echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4" - exit 1 - fi - - echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" - echo "[INFO] [ORCHESTRATOR] Backup location: $backup_path" - exit 0 -} - -# Main function to check if omnia_core container is already running. -# If yes, ask the user if they want to enter the container or reinstall. -# If no, set it up. -main() { - case "$1" in - --install|-i) - install_omnia_core - ;; - --uninstall|-u) - cleanup_omnia_core - ;; - --upgrade) - upgrade_omnia_core - ;; - --version|-v) - display_version - ;; - --help|-h|"") - show_help - ;; - *) - echo "Unknown option: $1" - show_help - exit 1 - ;; - esac -} - -# Call the main function -main "$1" From 66339a9754810d6f7cc98815791b0d9c50a521dd Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Mon, 9 Feb 2026 13:13:16 +0000 Subject: [PATCH 088/172] custom slurm confs --- ...-group-login_compiler_node_aarch64.yaml.j2 | 28 ++-- ...i-group-login_compiler_node_x86_64.yaml.j2 | 28 ++-- .../ci-group-login_node_aarch64.yaml.j2 | 28 ++-- .../ci-group-login_node_x86_64.yaml.j2 | 27 ++-- ...ci-group-slurm_control_node_x86_64.yaml.j2 | 9 +- .../ci-group-slurm_node_aarch64.yaml.j2 | 46 ++++-- .../ci-group-slurm_node_x86_64.yaml.j2 | 45 ++++-- discovery/roles/slurm_config/tasks/confs.yml | 20 ++- .../tasks/extract_path_overrides.yml | 147 ++++++++++++++++++ .../tasks/validate_path_overrides.yml | 83 ++++++++++ 10 files changed, 379 insertions(+), 82 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/extract_path_overrides.yml create mode 100644 discovery/roles/slurm_config/tasks/validate_path_overrides.yml diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index de236ed958..c273b54f90 100644 --- 
a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -204,11 +204,10 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab @@ -221,17 +220,22 @@ - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - 
/usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - - mkdir -p /var/spool/slurmd - - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd +{% for epath in slurm_epilog_custom_paths %} + - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi' +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + - bash -c 'if [ ! 
-f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi' +{% endfor %} + - mkdir -p {{ slurm_slurmd_spool_dir_effective }} + - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 3195fad9e3..b7b23c1d33 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -214,11 +214,10 @@ # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab @@ -233,17 +232,22 @@ - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - - chown -R {{ slurm_user }}:{{ slurm_user 
}} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - - mkdir -p /var/spool/slurmd - - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd +{% for epath in slurm_epilog_custom_paths %} + - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi' +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + - bash -c 'if [ ! 
-f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi' +{% endfor %} + - mkdir -p {{ slurm_slurmd_spool_dir_effective }} + - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index f869d7d8fe..8b3d771592 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -116,11 +116,10 @@ runcmd: - /usr/local/bin/set-ssh.sh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ 
slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab @@ -134,17 +133,22 @@ - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - - mkdir -p /var/spool/slurmd - - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd +{% for epath in slurm_epilog_custom_paths %} + - bash -c 'if [ ! 
-f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi' +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + - bash -c 'if [ ! -f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi' +{% endfor %} + - mkdir -p {{ slurm_slurmd_spool_dir_effective }} + - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 82646da1c6..4e68ba8d81 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -123,10 +123,10 @@ - /usr/local/bin/set-ssh.sh # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd /var/spool/slurmd nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab @@ -144,18 +144,23 @@ - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + - chown -R {{ 
slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + - chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh - - mkdir -p /var/spool/slurmd - - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd +{% for epath in slurm_epilog_custom_paths %} + - bash -c 'if [ ! -f "{{ epath }}" ]; then mkdir -p "$(dirname "{{ epath }}")"; printf "#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n" > "{{ epath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}"; chmod {{ file_mode_755 }} "{{ epath }}"; fi' +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + - bash -c 'if [ ! 
-f "{{ ppath }}" ]; then mkdir -p "$(dirname "{{ ppath }}")"; printf "#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n" > "{{ ppath }}"; chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}"; chmod {{ file_mode_755 }} "{{ ppath }}"; fi' +{% endfor %} + - mkdir -p {{ slurm_slurmd_spool_dir_effective }} + - chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 2f2721d7eb..a8c3b8d88c 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -469,15 +469,18 @@ # slurm user and group created in the users module # Create directories for nfs and mount all - - mkdir -p /var/log/slurm /etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + - mkdir -p {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }} {{ slurm_ctld_pid_dir_effective }} {{ slurmdbd_pid_dir_effective }} {{ slurm_state_save_location_effective }} {% if slurm_sched_log_dir_effective %}{{ slurm_sched_log_dir_effective }} {% endif %}/etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm /etc/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path 
}}/$(hostname -s)/etc/my.cnf.d /etc/my.cnf.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/mariadb /var/log/mariadb nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_ctld_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab +{% if slurmdbd_log_dir_effective != slurm_ctld_log_dir_effective %} + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurmdbd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab +{% endif %} {% if powervault_config is not defined %} - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/lib/mysql /var/lib/mysql nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmctld /var/spool/slurmctld nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmctld {{ slurm_state_save_location_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab {% endif %} - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index cc784bdd10..a81d564ba6 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -237,13 +237,12 @@ echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge (aarch64) =====" echo "[INFO] Creating base directories for Slurm and Munge" - mkdir -pv /var/log/slurm /var/run/slurm 
/var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab @@ -274,22 +273,43 @@ bash /usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + chown -R {{ 
slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] Setting permissions for Slurm directories" - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] Ensuring Slurm epilog directory and logout script permissions" chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh +{% for epath in slurm_epilog_custom_paths %} + + echo "[INFO] Checking custom epilog script: {{ epath }}" + if [ ! -f "{{ epath }}" ]; then + echo "[INFO] Creating stub epilog script at {{ epath }}" + mkdir -p "$(dirname '{{ epath }}')" + printf '#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n' > "{{ epath }}" + chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}" + chmod {{ file_mode_755 }} "{{ epath }}" + fi +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + + echo "[INFO] Checking custom prolog script: {{ ppath }}" + if [ ! 
-f "{{ ppath }}" ]; then + echo "[INFO] Creating stub prolog script at {{ ppath }}" + mkdir -p "$(dirname '{{ ppath }}')" + printf '#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n' > "{{ ppath }}" + chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}" + chmod {{ file_mode_755 }} "{{ ppath }}" + fi +{% endfor %} - echo "[INFO] Creating and configuring /var/spool/slurmd" - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd + echo "[INFO] Creating and configuring slurmd spool directory" + mkdir -p {{ slurm_slurmd_spool_dir_effective }} + chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] ===== Completed slurmd setup (aarch64) =====" diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 5128aee1d1..5d930bef47 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -256,12 +256,12 @@ # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) mkdir -p {{ client_mount_path }}/slurm/ssh echo "[INFO] Creating base directories for Pulp cert, Slurm and Munge" - mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd /var/spool/slurmd nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab @@ -292,22 +292,43 @@ bash /usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - chown -R {{ slurm_user }}:{{ slurm_user }} 
/var/lib/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] Setting permissions for Slurm directories" - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm + chmod {{ file_mode_755 }} {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] Ensuring Slurm epilog directory and logout script permissions" chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh +{% for epath in slurm_epilog_custom_paths %} + + echo "[INFO] Checking custom epilog script: {{ epath }}" + if [ ! -f "{{ epath }}" ]; then + echo "[INFO] Creating stub epilog script at {{ epath }}" + mkdir -p "$(dirname '{{ epath }}')" + printf '#!/bin/bash\n# Custom epilog script placeholder\n# Add your epilog commands here\n' > "{{ epath }}" + chown {{ slurm_user }}:{{ slurm_user }} "{{ epath }}" + chmod {{ file_mode_755 }} "{{ epath }}" + fi +{% endfor %} +{% for ppath in slurm_prolog_custom_paths %} + + echo "[INFO] Checking custom prolog script: {{ ppath }}" + if [ ! 
-f "{{ ppath }}" ]; then + echo "[INFO] Creating stub prolog script at {{ ppath }}" + mkdir -p "$(dirname '{{ ppath }}')" + printf '#!/bin/bash\n# Custom prolog script placeholder\n# Add your prolog commands here\n' > "{{ ppath }}" + chown {{ slurm_user }}:{{ slurm_user }} "{{ ppath }}" + chmod {{ file_mode_755 }} "{{ ppath }}" + fi +{% endfor %} - echo "[INFO] Creating and configuring /var/spool/slurmd" - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd + echo "[INFO] Creating and configuring slurmd spool directory" + mkdir -p {{ slurm_slurmd_spool_dir_effective }} + chmod {{ file_mode_755 }} {{ slurm_slurmd_spool_dir_effective }} + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_spool_dir_effective }} echo "[INFO] ===== Completed slurmd setup =====" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 1ff30acf34..641efc7ab9 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -112,6 +112,12 @@ slurm_conf_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurm') | first).conf_dict }}" when: "'slurm' in conf_merge_dict" +- name: Extract effective path parameters from merged configs + ansible.builtin.include_tasks: extract_path_overrides.yml + +- name: Validate path parameters are absolute + ansible.builtin.include_tasks: validate_path_overrides.yml + - name: Get nodes from normal partition and compare with cmpt_list ansible.builtin.set_fact: normal_partition: "{{ slurm_conf_dict.PartitionName | default([]) | selectattr('PartitionName', 'equalto', slurm_partition_name) | first | default({}) }}" @@ -134,17 +140,17 @@ - nodes_in_normal_not_in_cmpt is defined - nodes_in_normal_not_in_cmpt | length > 0 -- name: Create directories from conf values +- name: Create directories from conf values (NFS server-side always uses defaults) 
ansible.builtin.include_tasks: exist_dir.yml loop: - "{{ ctld_list - | product([slurm_conf_dict.get('StateSaveLocation', '/var/spool/slurmctld'), - (slurm_conf_dict.get('SlurmctldLogFile', '/var/log/slurmctld.log') | dirname), - (slurm_conf_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid') | dirname)]) }}" + | product(['/var/spool/slurmctld', + '/var/log/slurm', + '/var/run']) }}" - "{{ (cmpt_list + login_list + compiler_login_list) - | product([slurm_conf_dict.get('SlurmdSpoolDir', '/var/spool/slurmd'), - (slurm_conf_dict.get('SlurmdLogFile', '/var/log/slurmd.log') | dirname), - (slurm_conf_dict.get('SlurmdPidFile', '/var/run/slurmd.pid') | dirname)]) }}" + | product(['/var/spool/slurmd', + '/var/log/slurm', + '/var/run']) }}" loop_control: loop_var: product diff --git a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml new file mode 100644 index 0000000000..45565dc4e7 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml @@ -0,0 +1,147 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +# ── Extract merged dicts ────────────────────────────────────────────── + +- name: Extract slurm.conf merged dict + ansible.builtin.set_fact: + slurm_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurm') | first).conf_dict }}" + when: "'slurm' in conf_merge_dict" + +- name: Extract slurmdbd.conf merged dict + ansible.builtin.set_fact: + slurmdbd_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurmdbd') | first).conf_dict }}" + when: "'slurmdbd' in conf_merge_dict" + +- name: Extract cgroup.conf merged dict + ansible.builtin.set_fact: + cgroup_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'cgroup') | first).conf_dict }}" + when: "'cgroup' in conf_merge_dict" + +# ── slurm.conf: controller path params ──────────────────────────────── + +- name: Extract effective controller directories from slurm.conf + ansible.builtin.set_fact: + slurm_ctld_log_dir_effective: "{{ (slurm_merged_dict.get('SlurmctldLogFile', ['/var/log/slurm/slurmctld.log']) | first if slurm_merged_dict.get('SlurmctldLogFile') is iterable and slurm_merged_dict.get('SlurmctldLogFile') is not string else slurm_merged_dict.get('SlurmctldLogFile', '/var/log/slurm/slurmctld.log')) | dirname }}" + slurm_state_save_location_effective: "{{ (slurm_merged_dict.get('StateSaveLocation', ['/var/spool/slurmctld']) | first if slurm_merged_dict.get('StateSaveLocation') is iterable and slurm_merged_dict.get('StateSaveLocation') is not string else slurm_merged_dict.get('StateSaveLocation', '/var/spool/slurmctld')) }}" + slurm_ctld_pid_dir_effective: "{{ (slurm_merged_dict.get('SlurmctldPidFile', ['/var/run/slurmctld.pid']) | first if slurm_merged_dict.get('SlurmctldPidFile') is iterable and slurm_merged_dict.get('SlurmctldPidFile') is not string else slurm_merged_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid')) | dirname }}" + slurm_sched_log_dir_effective: "{{ ((slurm_merged_dict.get('SlurmSchedLogFile', ['']) | first if 
slurm_merged_dict.get('SlurmSchedLogFile') is iterable and slurm_merged_dict.get('SlurmSchedLogFile') is not string else slurm_merged_dict.get('SlurmSchedLogFile', '')) | default('', true) | dirname | default('', true)) }}" + when: slurm_merged_dict is defined + +# ── slurm.conf: compute path params ────────────────────────────────── + +- name: Extract effective compute directories from slurm.conf + ansible.builtin.set_fact: + slurm_slurmd_log_dir_effective: "{{ (slurm_merged_dict.get('SlurmdLogFile', ['/var/log/slurm/slurmd.log']) | first if slurm_merged_dict.get('SlurmdLogFile') is iterable and slurm_merged_dict.get('SlurmdLogFile') is not string else slurm_merged_dict.get('SlurmdLogFile', '/var/log/slurm/slurmd.log')) | dirname }}" + slurm_slurmd_spool_dir_effective: "{{ (slurm_merged_dict.get('SlurmdSpoolDir', ['/var/spool/slurmd']) | first if slurm_merged_dict.get('SlurmdSpoolDir') is iterable and slurm_merged_dict.get('SlurmdSpoolDir') is not string else slurm_merged_dict.get('SlurmdSpoolDir', '/var/spool/slurmd')) }}" + slurm_slurmd_pid_dir_effective: "{{ (slurm_merged_dict.get('SlurmdPidFile', ['/var/run/slurmd.pid']) | first if slurm_merged_dict.get('SlurmdPidFile') is iterable and slurm_merged_dict.get('SlurmdPidFile') is not string else slurm_merged_dict.get('SlurmdPidFile', '/var/run/slurmd.pid')) | dirname }}" + slurm_epilog_dir_effective: "{{ (slurm_merged_dict.get('Epilog', ['/etc/slurm/epilog.d/logout_user.sh']) | first if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string else slurm_merged_dict.get('Epilog', '/etc/slurm/epilog.d/logout_user.sh')) | dirname }}" + slurm_prolog_dir_effective: "{{ ((slurm_merged_dict.get('Prolog', ['']) | first if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string else slurm_merged_dict.get('Prolog', '')) | default('', true) | dirname | default('', true)) }}" + when: slurm_merged_dict is defined + +# ── slurm.conf: all epilog/prolog 
dirs and custom file paths ───────── + +- name: Extract all epilog paths from merged Epilog list + ansible.builtin.set_fact: + slurm_epilog_paths_all: >- + {{ (slurm_merged_dict.get('Epilog', []) if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string + else [slurm_merged_dict.get('Epilog', '')]) + | reject('equalto', '') | list }} + slurm_epilog_dirs_all: >- + {{ (slurm_merged_dict.get('Epilog', []) if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string + else [slurm_merged_dict.get('Epilog', '')]) + | map('dirname') | unique | reject('equalto', '') | list }} + when: slurm_merged_dict is defined + +- name: Extract custom epilog paths (non-default) + ansible.builtin.set_fact: + slurm_epilog_custom_paths: >- + {{ slurm_epilog_paths_all | reject('search', '^/etc/slurm/epilog\\.d/') | list }} + when: slurm_merged_dict is defined + +- name: Extract all prolog paths from merged Prolog list + ansible.builtin.set_fact: + slurm_prolog_paths_all: >- + {{ (slurm_merged_dict.get('Prolog', []) if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string + else [slurm_merged_dict.get('Prolog', '')]) + | reject('equalto', '') | list }} + slurm_prolog_dirs_all: >- + {{ (slurm_merged_dict.get('Prolog', []) if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string + else [slurm_merged_dict.get('Prolog', '')]) + | map('dirname') | unique | reject('equalto', '') | list }} + when: slurm_merged_dict is defined + +- name: Extract custom prolog paths (non-default) + ansible.builtin.set_fact: + slurm_prolog_custom_paths: >- + {{ slurm_prolog_paths_all | list }} + when: slurm_merged_dict is defined + +# ── slurm.conf: plugin dir (both controller and compute) ───────────── + +- name: Extract effective plugin directory from slurm.conf + ansible.builtin.set_fact: + slurm_plugin_dir_effective: "{{ (slurm_merged_dict.get('PluginDir', 
['/usr/lib64/slurm']) | first if slurm_merged_dict.get('PluginDir') is iterable and slurm_merged_dict.get('PluginDir') is not string else slurm_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }}" + when: slurm_merged_dict is defined + +# ── slurmdbd.conf path params ──────────────────────────────────────── + +- name: Extract effective directories from slurmdbd.conf + ansible.builtin.set_fact: + slurmdbd_log_dir_effective: "{{ (slurmdbd_merged_dict.get('LogFile', ['/var/log/slurm/slurmdbd.log']) | first if slurmdbd_merged_dict.get('LogFile') is iterable and slurmdbd_merged_dict.get('LogFile') is not string else slurmdbd_merged_dict.get('LogFile', '/var/log/slurm/slurmdbd.log')) | dirname }}" + slurmdbd_pid_dir_effective: "{{ (slurmdbd_merged_dict.get('PidFile', ['/var/run/slurmdbd.pid']) | first if slurmdbd_merged_dict.get('PidFile') is iterable and slurmdbd_merged_dict.get('PidFile') is not string else slurmdbd_merged_dict.get('PidFile', '/var/run/slurmdbd.pid')) | dirname }}" + slurmdbd_plugin_dir_effective: "{{ (slurmdbd_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) | first if slurmdbd_merged_dict.get('PluginDir') is iterable and slurmdbd_merged_dict.get('PluginDir') is not string else slurmdbd_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }}" + when: slurmdbd_merged_dict is defined + +# ── cgroup.conf path params ────────────────────────────────────────── + +- name: Extract effective cgroup mountpoint from cgroup.conf + ansible.builtin.set_fact: + slurm_cgroup_mountpoint_effective: "{{ ((cgroup_merged_dict.get('CgroupMountpoint', ['']) | first if cgroup_merged_dict.get('CgroupMountpoint') is iterable and cgroup_merged_dict.get('CgroupMountpoint') is not string else cgroup_merged_dict.get('CgroupMountpoint', '')) | default('', true)) }}" + when: cgroup_merged_dict is defined + +# ── Defaults when confs are not merged ──────────────────────────────── + +- name: Set default effective directories if slurm.conf not merged + ansible.builtin.set_fact: + 
slurm_ctld_log_dir_effective: "/var/log/slurm" + slurm_slurmd_log_dir_effective: "/var/log/slurm" + slurm_state_save_location_effective: "/var/spool/slurmctld" + slurm_slurmd_spool_dir_effective: "/var/spool/slurmd" + slurm_ctld_pid_dir_effective: "/var/run" + slurm_slurmd_pid_dir_effective: "/var/run" + slurm_epilog_dir_effective: "/etc/slurm/epilog.d" + slurm_prolog_dir_effective: "" + slurm_sched_log_dir_effective: "" + slurm_plugin_dir_effective: "/usr/lib64/slurm" + slurm_epilog_dirs_all: ["/etc/slurm/epilog.d"] + slurm_epilog_paths_all: ["/etc/slurm/epilog.d/logout_user.sh"] + slurm_epilog_custom_paths: [] + slurm_prolog_dirs_all: [] + slurm_prolog_paths_all: [] + slurm_prolog_custom_paths: [] + when: slurm_merged_dict is not defined + +- name: Set default effective directories if slurmdbd.conf not merged + ansible.builtin.set_fact: + slurmdbd_log_dir_effective: "/var/log/slurm" + slurmdbd_pid_dir_effective: "/var/run" + slurmdbd_plugin_dir_effective: "/usr/lib64/slurm" + when: slurmdbd_merged_dict is not defined + +- name: Set default effective cgroup mountpoint if cgroup.conf not merged + ansible.builtin.set_fact: + slurm_cgroup_mountpoint_effective: "" + when: cgroup_merged_dict is not defined diff --git a/discovery/roles/slurm_config/tasks/validate_path_overrides.yml b/discovery/roles/slurm_config/tasks/validate_path_overrides.yml new file mode 100644 index 0000000000..140b1d4bda --- /dev/null +++ b/discovery/roles/slurm_config/tasks/validate_path_overrides.yml @@ -0,0 +1,83 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# ── slurm.conf path validation ─────────────────────────────────────── + +- name: Validate slurm.conf path parameters are absolute + ansible.builtin.fail: + msg: "slurm.conf {{ item }} must be an absolute path (start with /). Current value: {{ slurm_merged_dict.get(item) }}" + when: + - slurm_merged_dict is defined + - slurm_merged_dict.get(item) is defined + - slurm_merged_dict.get(item) is not none + - (slurm_merged_dict.get(item) is string and slurm_merged_dict.get(item) | length > 0) or (slurm_merged_dict.get(item) is iterable and slurm_merged_dict.get(item) | list | length > 0) + - not ((slurm_merged_dict.get(item) is string and slurm_merged_dict.get(item) | regex_search('^/')) or (slurm_merged_dict.get(item) is iterable and (slurm_merged_dict.get(item) | first) | regex_search('^/'))) + loop: + - SlurmctldLogFile + - SlurmdLogFile + - StateSaveLocation + - SlurmdSpoolDir + - SlurmctldPidFile + - SlurmdPidFile + - Epilog + - Prolog + - EpilogSlurmctld + - PrologSlurmctld + - SlurmSchedLogFile + - PluginDir + - PlugStackConfig + - SrunEpilog + - SrunProlog + - TaskEpilog + - TaskProlog + - HealthCheckProgram + - RebootProgram + - UnkillableStepProgram + - ResvEpilog + - ResvProlog + - TmpFS + - JobCompLoc + - JobCredentialPrivateKey + - JobCredentialPublicCertificate + +# ── slurmdbd.conf path validation ──────────────────────────────────── + +- name: Validate slurmdbd.conf path parameters are absolute + ansible.builtin.fail: + msg: "slurmdbd.conf {{ item }} must be an absolute path (start with /). 
Current value: {{ slurmdbd_merged_dict.get(item) }}" + when: + - slurmdbd_merged_dict is defined + - slurmdbd_merged_dict.get(item) is defined + - slurmdbd_merged_dict.get(item) is not none + - (slurmdbd_merged_dict.get(item) is string and slurmdbd_merged_dict.get(item) | length > 0) or (slurmdbd_merged_dict.get(item) is iterable and slurmdbd_merged_dict.get(item) | list | length > 0) + - not ((slurmdbd_merged_dict.get(item) is string and slurmdbd_merged_dict.get(item) | regex_search('^/')) or (slurmdbd_merged_dict.get(item) is iterable and (slurmdbd_merged_dict.get(item) | first) | regex_search('^/'))) + loop: + - LogFile + - PidFile + - PluginDir + +# ── cgroup.conf path validation ────────────────────────────────────── + +- name: Validate cgroup.conf path parameters are absolute + ansible.builtin.fail: + msg: "cgroup.conf {{ item }} must be an absolute path (start with /). Current value: {{ cgroup_merged_dict.get(item) }}" + when: + - cgroup_merged_dict is defined + - cgroup_merged_dict.get(item) is defined + - cgroup_merged_dict.get(item) is not none + - (cgroup_merged_dict.get(item) is string and cgroup_merged_dict.get(item) | length > 0) or (cgroup_merged_dict.get(item) is iterable and cgroup_merged_dict.get(item) | list | length > 0) + - not ((cgroup_merged_dict.get(item) is string and cgroup_merged_dict.get(item) | regex_search('^/')) or (cgroup_merged_dict.get(item) is iterable and (cgroup_merged_dict.get(item) | first) | regex_search('^/'))) + loop: + - CgroupMountpoint From a68829764b761d53282d1c7db523029a2a03da60 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Mon, 9 Feb 2026 18:58:32 +0530 Subject: [PATCH 089/172] Fixed issue where pulp repo resynced crashed before pub/dist creation Signed-off-by: pullan1 --- common/library/modules/process_rpm_config.py | 91 ++++++++++- common/library/modules/pulp_cleanup.py | 153 ++++++++++++++++--- local_repo/pulp_cleanup.yml | 6 +- 3 files changed, 219 insertions(+), 31 deletions(-) diff --git 
a/common/library/modules/process_rpm_config.py b/common/library/modules/process_rpm_config.py index 002923d50c..89a8f0e1ca 100644 --- a/common/library/modules/process_rpm_config.py +++ b/common/library/modules/process_rpm_config.py @@ -467,6 +467,27 @@ def check_publication_exists(repo_name, log): log.error("Error checking publication for '%s': %s", repo_name, str(e)) return False +def check_distribution_exists(repo_name, log): + """ + Check if a distribution exists for the repository. + + Args: + repo_name (str): The name of the repository. + log (logging.Logger): Logger instance for logging. + + Returns: + bool: True if distribution exists, False otherwise. + """ + try: + command = pulp_rpm_commands["check_distribution"] % repo_name + log.info("Checking if distribution exists for repository '%s'", repo_name) + result = execute_command(command, log) + return bool(result) + except Exception as e: + log.error("Error checking distribution for '%s': %s", repo_name, str(e)) + return False + + def delete_old_publications(repo_name, log): """ Delete all existing publications for a repository. @@ -792,9 +813,43 @@ def process_sync_results(sync_results, rpm_config, resync_repos, log): version_changed_repos = [name for success, name, actually_synced, version_changed in sync_results if success and actually_synced and version_changed] log.info(f"Repos with version change: {len(version_changed_repos)} - {version_changed_repos}") - # If no versions changed, skip publication and distribution entirely + # If no versions changed, check for missing publication/distribution + # This handles the crash recovery case: process failed after sync but before pub/dist if not version_changed_repos: - log.info("No version changes detected. Skipping publication and distribution.") + log.info("No version changes detected. 
Checking for missing publication/distribution.") + + # Check all synced repos (including previously synced) for missing pub/dist + repos_missing_pub_dist = [] + all_repo_names = [] + for repo in rpm_config: + repo_name = repo["package"] + version = repo.get("version") + if version and version != "null": + repo_name = f"{repo_name}_{version}" + all_repo_names.append(repo_name) + + # If resync_repos is a specific list, only check those repos + if resync_repos and resync_repos != "all": + resync_list = resync_repos if isinstance(resync_repos, list) else [r.strip() for r in resync_repos.split(",")] + if repo_name not in resync_list: + continue + + pub_exists = check_publication_exists(repo_name, log) + dist_exists = check_distribution_exists(repo_name, log) + + if not pub_exists or not dist_exists: + log.info(f"{repo_name} missing publication={not pub_exists}, distribution={not dist_exists}. Including for pub/dist creation.") + repo_copy = repo.copy() + repo_copy["_version_changed"] = False + repos_missing_pub_dist.append(repo_copy) + + if repos_missing_pub_dist: + missing_names = [r["package"] for r in repos_missing_pub_dist] + log.info(f"Found {len(repos_missing_pub_dist)} repo(s) missing publication/distribution: {missing_names}") + return repos_missing_pub_dist, False, "" + + # All repos have publication and distribution - safe to skip + log.info("All repos have existing publication and distribution. Skipping.") if actually_synced_repos: # Repos were synced but no metadata change synced_list = ", ".join(actually_synced_repos) @@ -820,9 +875,37 @@ def process_sync_results(sync_results, rpm_config, resync_repos, log): repos_for_pub_dist.append(repo_copy) return repos_for_pub_dist, False, "" else: - # If no repos were actually synced, skip publication and distribution + # If no repos were actually synced, check for missing pub/dist (crash recovery) if not actually_synced_repos: - log.info("No repos were actually synced. 
Skipping publication and distribution.") + log.info("No repos were actually synced. Checking for missing publication/distribution.") + repos_missing_pub_dist = [] + for repo in rpm_config: + repo_name = repo["package"] + version = repo.get("version") + if version and version != "null": + repo_name = f"{repo_name}_{version}" + + # If resync_repos is a specific list, only check those repos + if resync_repos and resync_repos != "all": + resync_list = resync_repos if isinstance(resync_repos, list) else [r.strip() for r in resync_repos.split(",")] + if repo_name not in resync_list: + continue + + pub_exists = check_publication_exists(repo_name, log) + dist_exists = check_distribution_exists(repo_name, log) + + if not pub_exists or not dist_exists: + log.info(f"{repo_name} missing publication={not pub_exists}, distribution={not dist_exists}. Including for pub/dist creation.") + repo_copy = repo.copy() + repo_copy["_version_changed"] = False + repos_missing_pub_dist.append(repo_copy) + + if repos_missing_pub_dist: + missing_names = [r["package"] for r in repos_missing_pub_dist] + log.info(f"Found {len(repos_missing_pub_dist)} repo(s) missing publication/distribution: {missing_names}") + return repos_missing_pub_dist, False, "" + + log.info("All repos have existing publication and distribution. 
No updates required.") return [], True, "All repositories already synced - no updates required" # Filter rpm_config to only include repos with version change diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index 91b863144a..217ca9b308 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -216,6 +216,16 @@ def file_exists_in_status(name: str, base_path: str, logger) -> bool: except Exception: return False +def get_all_repositories(logger) -> List[str]: + """Get all RPM repository names from Pulp.""" + cmd = pulp_rpm_commands["list_repositories"] + result = run_cmd(cmd, logger) + if result["rc"] != 0: + logger.error(f"Failed to list repositories: {result['stderr']}") + return [] + repos = safe_json_parse(result["stdout"]) + return [r.get('name', '') for r in repos if r.get('name')] + # ============================================================================= # CLEANUP FUNCTIONS @@ -708,11 +718,12 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: return {} -def mark_software_partial(affected_software: Dict[str, List[str]], base_path: str, logger, artifact_type: str = None): +def mark_software_partial(affected_software, base_path: str, logger, artifact_type: str = None): """Mark software entries as partial in software.csv. 
Args: - affected_software: Dict mapping architecture to list of affected software names + affected_software: Either a List[str] of software names (from remove_rpms_from_repository) + or a Dict[str, List[str]] mapping arch to software names (from remove_from_status_files) base_path: Base path for software.csv logger: Logger instance artifact_type: Type of artifact being removed (for logging purposes) @@ -721,39 +732,119 @@ def mark_software_partial(affected_software: Dict[str, List[str]], base_path: st if not affected_software: logger.info("No affected software to mark as partial") return + + # Normalize input: if a flat list is passed, apply to all architectures + if isinstance(affected_software, list): + arch_software_map = {arch: affected_software for arch in ARCH_SUFFIXES} + else: + arch_software_map = affected_software try: - # Only mark architectures where artifacts were actually removed - for arch, software_names in affected_software.items(): - logger.info(f"Processing arch: {arch}, software_names: {software_names}") + for arch, software_names in arch_software_map.items(): if not software_names: continue software_file = f"{base_path}/{arch}/software.csv" logger.info(f"Looking for software file: {software_file}") - if os.path.exists(software_file): - rows = [] - updated = False - with open(software_file, 'r') as f: - reader = csv.DictReader(f) - fieldnames = reader.fieldnames - for row in reader: - logger.info(f"Checking row: {row}") - if row.get('name') in software_names: - row['status'] = 'partial' - updated = True - logger.info(f"Marked '{row.get('name')}' as {GREEN}partial{RESET} in {arch}/software.csv ({artifact_type} cleanup)") - rows.append(row) + if not os.path.exists(software_file): + logger.warning(f"Software file not found: {software_file}") + continue - if fieldnames and rows: - with open(software_file, 'w', newline='') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(rows) - 
logger.info(f"Successfully wrote updated software.csv for {arch}") + rows = [] + updated = False + with open(software_file, 'r') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + if row.get('name') in software_names: + row['status'] = 'partial' + updated = True + logger.info(f"Marked '{row.get('name')}' as partial in {arch}/software.csv ({artifact_type} cleanup)") + rows.append(row) + + if fieldnames and rows and updated: + with open(software_file, 'w', newline='') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + logger.info(f"Successfully wrote updated software.csv for {arch}") except Exception as e: logger.error(f"Failed to update software.csv: {e}") +def software_has_rpms(software_name: str, arch: str, base_path: str, logger) -> bool: + """Check if a software has any RPM dependencies in its status.csv. + + Args: + software_name: Name of the software + arch: Architecture (x86_64 or aarch64) + base_path: Base path for status files + logger: Logger instance + + Returns: + True if software has RPM entries, False otherwise + """ + status_file = f"{base_path}/{arch}/{software_name}/status.csv" + if not os.path.exists(status_file): + return False + + try: + with open(status_file, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + if row.get('type', '').lower() == 'rpm': + return True + return False + except Exception as e: + logger.error(f"Error checking RPMs for {software_name}: {e}") + return False + + +def mark_all_software_partial(base_path: str, logger): + """Mark software entries as partial in software.csv for all architectures. + + This is called when cleanup_repos=all to mark software as partial + since all RPM repositories are being deleted. + Only marks software that actually has RPM dependencies. 
+ + Args: + base_path: Base path for software.csv files + logger: Logger instance + """ + logger.info("Marking software with RPM dependencies as partial (cleanup_repos=all)") + try: + for arch in ARCH_SUFFIXES: + software_file = f"{base_path}/{arch}/software.csv" + logger.info(f"Processing software file: {software_file}") + + if not os.path.exists(software_file): + logger.info(f"Software file not found: {software_file}") + continue + + rows = [] + updated = False + with open(software_file, 'r') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + software_name = row.get('name', '') + if row.get('status') == 'success': + # Only mark as partial if software has RPM dependencies + if software_has_rpms(software_name, arch, base_path, logger): + row['status'] = 'partial' + updated = True + logger.info(f"Marked '{software_name}' as partial in {arch}/software.csv (has RPM deps)") + else: + logger.info(f"Skipping '{software_name}' - no RPM dependencies") + rows.append(row) + + if fieldnames and rows and updated: + with open(software_file, 'w', newline='') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + logger.info(f"Successfully updated {software_file}") + except Exception as e: + logger.error(f"Failed to mark all software as partial: {e}") def write_cleanup_status(results: List[Dict], base_path: str): """Write cleanup results to status file.""" @@ -794,6 +885,16 @@ def run_module(): os.makedirs(base_path, exist_ok=True) logger = setup_standard_logger(log_dir) + # Handle 'all' keyword for repositories only + cleanup_all_repos = cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all' + #if cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all': + if cleanup_all_repos: + logger.info("cleanup_repos='all' - fetching all repositories from Pulp") + cleanup_repos = get_all_repositories(logger) + if not cleanup_repos: + 
module.fail_json(msg="Failed to retrieve repository list from Pulp. Please check if Pulp services are running.") + logger.info(f"Found {len(cleanup_repos)} repositories to cleanup: {cleanup_repos}") + logger.info(f"Starting cleanup - repos: {cleanup_repos}, containers: {cleanup_containers}, files: {cleanup_files}") all_results = [] @@ -804,6 +905,10 @@ def run_module(): all_results.append(result) logger.info(f"Repository {repo}: {result['status']} - {result['message']}") + # If cleanup_repos=all, mark software with RPM dependencies as partial + if cleanup_all_repos and any(r['status'] == 'Success' for r in all_results if r['type'] == 'repository'): + mark_all_software_partial(base_path, logger) + # Process containers for container in cleanup_containers: result = cleanup_container(container, base_path, logger) diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml index 123b3a481f..f999b3a2dc 100644 --- a/local_repo/pulp_cleanup.yml +++ b/local_repo/pulp_cleanup.yml @@ -40,9 +40,9 @@ # Step 2: User Confirmation - name: Parse cleanup lists ansible.builtin.set_fact: - repo_list: "{{ (cleanup_repos.split(',') if cleanup_repos != 'all' else []) if cleanup_repos is defined else [] }}" - container_list: "{{ (cleanup_containers.split(',') if cleanup_containers is string else cleanup_containers) | default([]) }}" - file_list: "{{ (cleanup_files.split(',') if cleanup_files is string else cleanup_files) | default([]) }}" + repo_list: "{{ cleanup_repos.split(',') | map('trim') | list if cleanup_repos is string else (cleanup_repos | default([])) }}" + container_list: "{{ cleanup_containers.split(',') | map('trim') | list if cleanup_containers is string else (cleanup_containers | default([])) }}" + file_list: "{{ cleanup_files.split(',') | map('trim') | list if cleanup_files is string else (cleanup_files | default([])) }}" - name: Display cleanup summary ansible.builtin.debug: From 64e12f90d65203221dfdac424baee6b8750bc2b9 Mon Sep 17 00:00:00 2001 From: 
sakshi-singla-1735 Date: Mon, 9 Feb 2026 13:30:36 +0000 Subject: [PATCH 090/172] code fix Signed-off-by: sakshi-singla-1735 --- ...i-group-login_compiler_node_x86_64.yaml.j2 | 64 +++++++------- .../ci-group-slurm_node_x86_64.yaml.j2 | 2 +- .../hpc_tools/configure_nvhpc_env.sh.j2 | 12 +-- .../hpc_tools/configure_ucx_openmpi_env.sh.j2 | 15 +--- .../hpc_tools/export_nvhpc_env.sh.j2 | 21 +---- .../hpc_tools/install_nvhpc_sdk.sh.j2 | 36 ++++---- .../templates/hpc_tools/install_openmpi.sh.j2 | 85 ++++++++++++++++--- .../templates/hpc_tools/install_ucx.sh.j2 | 61 ++++++++++--- .../templates/hpc_tools/setup_nvhpc_sdk.sh.j2 | 33 ++----- 9 files changed, 183 insertions(+), 146 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index a1f8a55f50..79e50eb774 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -254,6 +254,39 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} + # Add NFS entry and mount + - mkdir -p {{ client_mount_path }} + - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab + - mount -a +{% endif %} + +{% if hostvars['localhost']['ucx_support'] %} + - echo "===== UCX Setup =====" + - echo "UCX support is enabled." + - /usr/local/bin/install_ucx.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_ucx.sh" + # - echo "NFS must be mounted at {{ client_mount_path }} before running." 
+{% endif %} + +{% if hostvars['localhost']['openmpi_support'] %} + - echo "===== OpenMPI Setup =====" + - echo "OpenMPI support is enabled." + - /usr/local/bin/install_openmpi.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_openmpi.sh" + # - echo "Run UCX installation first if UCX support is enabled." + # - echo "NFS must be mounted at {{ client_mount_path }} before running." +{% endif %} + +{% if hostvars['localhost']['ldms_support'] %} + - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log + + - /root/ldms_sampler.sh +{% endif %} + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh @@ -315,37 +348,6 @@ {% endif %} -{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} - # Add NFS entry and mount - - mkdir -p {{ client_mount_path }} - - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab - - mount -a -{% endif %} - -{% if hostvars['localhost']['ucx_support'] %} - - echo "===== UCX Setup =====" - - echo "UCX support is enabled." - - /usr/local/bin/install_ucx.sh - # - echo "Build script available at" - # - echo " /usr/local/bin/install_ucx.sh" - # - echo "NFS must be mounted at {{ client_mount_path }} before running." -{% endif %} - -{% if hostvars['localhost']['openmpi_support'] %} - - echo "===== OpenMPI Setup =====" - - echo "OpenMPI support is enabled." - - /usr/local/bin/install_openmpi.sh - # - echo "Build script available at" - # - echo " /usr/local/bin/install_openmpi.sh" - # - echo "Run UCX installation first if UCX support is enabled." - # - echo "NFS must be mounted at {{ client_mount_path }} before running." 
-{% endif %} - -{% if hostvars['localhost']['ldms_support'] %} - - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - - - /root/ldms_sampler.sh -{% endif %} # nvidia sdk install - /usr/local/bin/install_nvhpc_sdk.sh diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 64315adf38..84440bbdec 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -474,7 +474,7 @@ - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab - mount -a - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." - - echo "Shared NFS mount is available at: {{ client_mount_path }}" + # - echo "Shared NFS mount is available at: {{ client_mount_path }}" - /usr/local/bin/configure_ucx_openmpi_env.sh # - echo "" # - echo "IMPORTANT:" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 index 3c7efbc88b..dfc30520b3 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 @@ -50,8 +50,6 @@ if [ -f "$PROFILE_FILE" ]; then grep -q "nvhpc.sh" /etc/bashrc || echo "source $PROFILE_FILE" >> /etc/bashrc fi -# NVHPC marker file path -MARKER_TARGET="{{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}/.nvhpc_env_ready" if ! grep -q "{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" /etc/fstab; then echo "[ERROR] NVHPC NFS path not found in /etc/fstab" @@ -60,12 +58,4 @@ fi echo "[INFO] NVHPC NFS entry found in /etc/fstab" -if [ ! 
-d "{{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}" ]; then - echo "[ERROR] Marker directory missing: {{ nvhpc_local_mount | default('/shared-nvhpc-sdk/nvhpc') }}" - exit 1 -fi - -touch "$MARKER_TARGET" -echo "[SUCCESS] NVHPC marker created: $MARKER_TARGET" - -echo "===== NVHPC environment configuration completed successfully =====" \ No newline at end of file +echo "===== NVHPC environment configuration completed successfully =====" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 index 4064eddbb1..0fa20205c5 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_ucx_openmpi_env.sh.j2 @@ -17,8 +17,6 @@ if ! mountpoint -q "$CLIENT_MOUNT"; then fi # ---------------- UCX ---------------- -if [ -d "$UCX_PREFIX/bin" ]; then - echo "[INFO] UCX detected at $UCX_PREFIX" cat > "$PROFILE_DIR/ucx.sh" < "$PROFILE_DIR/openmpi.sh" </dev/null"; then - echo "[ERROR] nvc verification failed" - exit 1 -fi - -# Verify nvfortran -if ! 
bash -lc "command -v nvfortran && nvfortran --version >/dev/null"; then - echo "[ERROR] nvfortran verification failed" - exit 1 -fi echo "[SUCCESS] NVHPC environment exported successfully" echo "[INFO] Environment file configured in $PROFILE_FILE" -echo "===== NVHPC export completed =====" \ No newline at end of file +echo "===== NVHPC export completed =====" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 index 26f3fd1775..75478a470e 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 @@ -2,9 +2,8 @@ set -e LOGFILE="/var/log/nvhpc_sdk_install.log" -exec > >(tee -a "$LOGFILE") 2>&1 -echo "===== Starting NVIDIA HPC SDK installation =====" +echo "===== Starting NVIDIA HPC SDK installation =====" | tee -a "$LOGFILE" NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('nvhpc_2025_2511_Linux_x86_64_cuda_13.0') }}" NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" @@ -16,49 +15,49 @@ NVHPC_EXTRACT_DIR="$NVHPC_MOUNT/${NVHPC_PKG_NAME}" # Skip if already mounted if mountpoint -q "$NVHPC_LOCAL_MOUNT"; then - echo "[INFO] $NVHPC_LOCAL_MOUNT already mounted. Skipping installation." + echo "[INFO] $NVHPC_LOCAL_MOUNT already mounted. Skipping installation." | tee -a "$LOGFILE" exit 0 fi # Skip if local directory exists if [ -d "$NVHPC_LOCAL_MOUNT" ]; then - echo "[INFO] $NVHPC_LOCAL_MOUNT exists. Assuming installed. Skipping." + echo "[INFO] $NVHPC_LOCAL_MOUNT exists. Assuming installed. Skipping." | tee -a "$LOGFILE" exit 0 fi mkdir -p "$NVHPC_MOUNT" -mount -t nfs "$NVHPC_EXPORT" "$NVHPC_MOUNT" +mount -t nfs "$NVHPC_EXPORT" "$NVHPC_MOUNT" >> "$LOGFILE" 2>&1 # Check tarball -echo "[INFO] Checking NVIDIA HPC SDK tarball at $NVHPC_TARBALL..." +echo "[INFO] Checking NVIDIA HPC SDK tarball at $NVHPC_TARBALL..." | tee -a "$LOGFILE" if [ ! 
-f "$NVHPC_TARBALL" ]; then - echo "[ERROR] NVIDIA HPC SDK tarball not found. Skipping installation." + echo "[ERROR] NVIDIA HPC SDK tarball not found. Skipping installation." | tee -a "$LOGFILE" exit 0 fi # Extract if needed EXTRACT_SIZE_GB=$(du -sBG "$NVHPC_EXTRACT_DIR" 2>/dev/null | cut -f1 | tr -d 'G') if [ -d "$NVHPC_EXTRACT_DIR" ] && [ "$EXTRACT_SIZE_GB" -ge 13 ] && [ -f "$NVHPC_EXTRACT_DIR/install" ]; then - echo "[INFO] NVHPC already extracted. Skipping." + echo "[INFO] NVHPC already extracted. Skipping." | tee -a "$LOGFILE" else - echo "[INFO] Extracting NVIDIA HPC SDK tarball..." + echo "[INFO] Extracting NVIDIA HPC SDK tarball..." | tee -a "$LOGFILE" tar -xzf "$NVHPC_TARBALL" -C "$NVHPC_MOUNT" \ --checkpoint=2000 \ - --checkpoint-action=echo="[INFO] Extracting NVHPC... please wait" + --checkpoint-action=echo="[INFO] Extracting NVHPC... please wait" >> "$LOGFILE" 2>&1 fi mkdir -p "$NVHPC_INSTALL_DIR_NFS" INSTALL_BIN_DIR="$NVHPC_INSTALL_DIR_NFS/Linux_x86_64/25.11/compilers/bin" if [ -x "$INSTALL_BIN_DIR/nvc" ]; then - echo "[INFO] NVHPC already installed. Skipping installer." + echo "[INFO] NVHPC already installed. Skipping installer." | tee -a "$LOGFILE" else - echo "[INFO] Running NVIDIA HPC SDK installer..." + echo "[INFO] Running NVIDIA HPC SDK installer..." | tee -a "$LOGFILE" cd "$NVHPC_EXTRACT_DIR" - NVHPC_SILENT=true NVHPC_INSTALL_DIR="$NVHPC_INSTALL_DIR_NFS" NVHPC_INSTALL_TYPE=auto ./install + NVHPC_SILENT=true NVHPC_INSTALL_DIR="$NVHPC_INSTALL_DIR_NFS" NVHPC_INSTALL_TYPE=auto ./install >> "$LOGFILE" 2>&1 fi -echo "[SUCCESS] NVIDIA HPC SDK installation completed." +echo "[SUCCESS] NVIDIA HPC SDK installation completed." | tee -a "$LOGFILE" # Mount NVHPC locally mkdir -p "$NVHPC_LOCAL_MOUNT" @@ -66,10 +65,11 @@ NVHPC_INSTALL_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc" FSTAB_ENTRY="$NVHPC_INSTALL_EXPORT $NVHPC_LOCAL_MOUNT nfs defaults,_netdev 0 0" if ! 
grep -qE "^[^#].*$NVHPC_INSTALL_EXPORT[[:space:]]+$NVHPC_LOCAL_MOUNT[[:space:]]+nfs" /etc/fstab; then - echo "[INFO] Adding NVHPC mount to /etc/fstab" + echo "[INFO] Adding NVHPC mount to /etc/fstab" | tee -a "$LOGFILE" echo "$FSTAB_ENTRY" >> /etc/fstab fi -echo "[INFO] Mounting $NVHPC_LOCAL_MOUNT..." -mount "$NVHPC_LOCAL_MOUNT" -echo "[INFO] NVHPC successfully mounted at $NVHPC_LOCAL_MOUNT" \ No newline at end of file +echo "[INFO] Mounting $NVHPC_LOCAL_MOUNT..." | tee -a "$LOGFILE" +mount "$NVHPC_LOCAL_MOUNT" >> "$LOGFILE" 2>&1 +echo "[INFO] NVHPC successfully mounted at $NVHPC_LOCAL_MOUNT" | tee -a "$LOGFILE" +echo "CLOUD-INIT: NVIDIA HPC SDK installation completed successfully" | tee -a "$LOGFILE" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 index 44e1a786b7..9adde78472 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 @@ -5,6 +5,18 @@ CLIENT_MOUNT="{{ client_mount_path }}" OPENMPI_PREFIX="{{ client_mount_path }}/slurm/hpc_tools/benchmarks/openmpi" OPENMPI_BUILD="{{ client_mount_path }}/slurm/hpc_tools/compile/openmpi" +# Comprehensive logging +LOGFILE="/var/log/openmpi_installation.log" + +# Redirect all output to log file +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== OpenMPI Installation Started =====" +echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')" +echo "Installation Prefix: $OPENMPI_PREFIX" +echo "Build Directory: $OPENMPI_BUILD" +echo "Log File: $LOGFILE" | tee -a "$LOGFILE" + # Check that NFS is mounted if ! mountpoint -q "$CLIENT_MOUNT"; then echo "[ERROR] $CLIENT_MOUNT is not mounted." @@ -14,49 +26,65 @@ fi echo "===== OpenMPI build started =====" -mkdir -p "$OPENMPI_BUILD" "$OPENMPI_PREFIX" +mkdir -p "$OPENMPI_BUILD" cd "$OPENMPI_BUILD" if [ ! 
-f openmpi.tar.gz ]; then + echo "[INFO] Downloading OpenMPI source code..." wget --no-check-certificate \ https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz \ - -O openmpi.tar.gz \ - >> "$OPENMPI_PREFIX/openmpi_tar_output.log" 2>&1 + -O openmpi.tar.gz >> "$LOGFILE" 2>&1 + echo "[INFO] OpenMPI download completed" else - echo "openmpi.tar.gz already exists, skipping download." \ - >> "$OPENMPI_PREFIX/openmpi_tar_output.log" + echo "[INFO] openmpi.tar.gz already exists, skipping download." fi -tar xzf openmpi.tar.gz +echo "[INFO] Extracting OpenMPI source code..." +tar xzf openmpi.tar.gz >> "$LOGFILE" 2>&1 cd openmpi-* +echo "[INFO] OpenMPI source extracted to $(pwd)" + +echo "[INFO] Creating build directory..." mkdir -p build # Slurm detection +echo "[INFO] Detecting Slurm integration..." if sinfo >/dev/null 2>&1; then SLURM_FLAG="--with-slurm=yes --with-munge=/usr" + echo "[INFO] Slurm detected - enabling Slurm integration" else SLURM_FLAG="--with-slurm=no" + echo "[INFO] Slurm not detected - disabling Slurm integration" fi # UCX detection +echo "[INFO] Detecting UCX integration..." if [ -x "{{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx/bin/ucx_info" ]; then UCX_FLAG="--with-ucx={{ client_mount_path }}/slurm/hpc_tools/benchmarks/ucx" + echo "[INFO] UCX detected - enabling UCX integration" else UCX_FLAG="" + echo "[INFO] UCX not detected - proceeding without UCX" fi cd build +echo "[INFO] Configuring OpenMPI build..." 
+echo "[INFO] Configure flags: --prefix=$OPENMPI_PREFIX --enable-mpi1-compatibility --enable-prte-prefix-by-default $SLURM_FLAG $UCX_FLAG" ../configure --prefix="$OPENMPI_PREFIX" \ --enable-mpi1-compatibility \ --enable-prte-prefix-by-default \ - $SLURM_FLAG $UCX_FLAG + $SLURM_FLAG $UCX_FLAG >> "$LOGFILE" 2>&1 -make -j {{ openmpi_build_threads | default(8) }} -make install +echo "[INFO] Building OpenMPI with {{ openmpi_build_threads | default(8) }} threads..." +make -j {{ openmpi_build_threads | default(8) }} >> "$LOGFILE" 2>&1 + +echo "[INFO] Installing OpenMPI..." +make install >> "$LOGFILE" 2>&1 # Configure OpenMPI environment variables system-wide OPENMPI_ENV_FILE="/etc/profile.d/openmpi.sh" +echo "[INFO] Setting up OpenMPI environment variables in $OPENMPI_ENV_FILE..." cat > "$OPENMPI_ENV_FILE" <> "$UCX_PREFIX/ucx_tar_output.log" 2>&1 + -O ucx.tar.gz >> "$LOGFILE" 2>&1 + echo "[INFO] UCX download completed" else - echo "ucx.tar.gz already exists, skipping download." \ - >> "$UCX_PREFIX/ucx_tar_output.log" + echo "[INFO] ucx.tar.gz already exists, skipping download." fi -tar xzf ucx.tar.gz +echo "[INFO] Extracting UCX source code..." +tar xzf ucx.tar.gz >> "$LOGFILE" 2>&1 cd ucx-* +echo "[INFO] UCX source extracted to $(pwd)" + +echo "[INFO] Creating build directory..." mkdir -p build cd build -../contrib/configure-release --prefix="$UCX_PREFIX" -make -j {{ ucx_build_threads | default(8) }} -make install +echo "[INFO] Configuring UCX build..." +../contrib/configure-release --prefix="$UCX_PREFIX" >> "$LOGFILE" 2>&1 + +echo "[INFO] Building UCX with {{ ucx_build_threads | default(8) }} threads..." +make -j {{ ucx_build_threads | default(8) }} >> "$LOGFILE" 2>&1 + +echo "[INFO] Installing UCX..." +make install >> "$LOGFILE" 2>&1 # Configure UCX environment variables system-wide UCX_ENV_FILE="/etc/profile.d/ucx.sh" +echo "[INFO] Setting up UCX environment variables in $UCX_ENV_FILE..." 
cat > "$UCX_ENV_FILE" <> /etc/fstab - echo "[INFO] NVHPC fstab entry added" + echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT none bind,_netdev 0 0" >> /etc/fstab + echo "[INFO] NVHPC bind-mount fstab entry added" else echo "[INFO] NVHPC fstab entry already present" fi @@ -56,7 +37,7 @@ fi mkdir -p "$NVHPC_LOCAL_MOUNT" if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then - mount "$NVHPC_LOCAL_MOUNT" + mount --bind "$NVHPC_NFS_SHARE" "$NVHPC_LOCAL_MOUNT" fi if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then @@ -65,4 +46,4 @@ if ! mountpoint -q "$NVHPC_LOCAL_MOUNT"; then fi echo "[SUCCESS] NVHPC SDK mounted at $NVHPC_LOCAL_MOUNT" -echo "===== NVHPC setup completed =====" \ No newline at end of file +echo "===== NVHPC setup completed =====" From ee524cd4fe42064ee8d97486fb822ca3087be51a Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Mon, 9 Feb 2026 19:01:11 +0530 Subject: [PATCH 091/172] Added validation for all confs Supported configuration files are: slurm.conf slurmdbd.conf cgroup.conf gres.conf acct_gather.conf helpers.conf job_container.conf mpi.conf oci.conf topology.conf burst_buffer.conf --- .../common_utils/slurm_conf_utils.py | 13 +++++-- .../validation_flows/common_validation.py | 16 ++++++--- common/library/modules/slurm_conf.py | 8 +++-- .../roles/slurm_config/defaults/main.yml | 7 ++++ discovery/roles/slurm_config/tasks/confs.yml | 14 +++++++- .../slurm_config/tasks/handle_extra_confs.yml | 35 +++++++++++++++++++ discovery/roles/slurm_config/vars/main.yml | 28 +++++++++++++++ 7 files changed, 110 insertions(+), 11 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/handle_extra_confs.yml diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 20d61afc98..26f24762aa 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ 
b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -763,6 +763,10 @@ class SlurmParserEnum(str, Enum): def validate_config_types(conf_dict, conf_name, module): """Validate configuration keys and value types based on SlurmParserEnum.""" current_conf = all_confs.get(conf_name, {}) + if not current_conf: + return {'invalid_keys': [], 'type_errors': []} + # module.fail_json(msg=f"Invalid configuration name: {conf_name}", conf_dict=conf_dict, current_conf=current_conf) + module.warn(conf_name) invalid_keys = list( set(conf_dict.keys()).difference(set(current_conf.keys()))) type_errors = [] @@ -839,6 +843,7 @@ def parse_slurm_conf(file_path, conf_name, validate): """Parses the slurm.conf file and returns it as a dictionary.""" current_conf = all_confs.get(conf_name, {}) slurm_dict = OrderedDict() + dup_keys = [] if not os.path.exists(file_path): raise FileNotFoundError(f"{file_path} not found.") @@ -878,9 +883,11 @@ def parse_slurm_conf(file_path, conf_name, validate): slurm_dict[skey] = list(slurm_dict.get( skey, [])) + list(tmp_dict.values()) else: - slurm_dict.update(tmp_dict) - - return slurm_dict + if skey in slurm_dict: + dup_keys.append(skey) + else: + slurm_dict.update(tmp_dict) + return slurm_dict, dup_keys def expand_hostlist(expr): diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index ae4e693b9e..198c527440 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1062,24 +1062,30 @@ def validate_omnia_config( f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") for 
cfg_path_dict in cnfg_src: for k,v in cfg_path_dict.items(): conf_dict = None if isinstance(v, str): if not os.path.exists(v): errors.append( - create_error_msg(input_file_path, "slurm_cluster config_sources", + create_error_msg('omnia_config.yml', "slurm_cluster config_sources", f"provided conf path for {k} - {v} does not exist")) continue else: # path exists - conf_dict = parse_slurm_conf(v, k, False) + if not skip_conf_validation: + conf_dict, duplicate_keys = parse_slurm_conf(v, k, False) + if duplicate_keys: + errors.append( + create_error_msg('omnia_config.yml', "slurm_cluster->config_sources", + f"duplicate keys found in {k}.conf - {','.join(duplicate_keys)}")) else: conf_dict = v - if conf_dict: + if conf_dict and not skip_conf_validation: validation_result = validate_config_types(conf_dict, k, module) - if validation_result['type_errors']: + if validation_result.get('type_errors'): errors.extend(validation_result['type_errors']) - if validation_result['invalid_keys']: + if validation_result.get('invalid_keys'): errors.append( create_error_msg('omnia_config.yml', "slurm_cluster->config_sources", f"{k}.conf invalid keys found - {','.join(validation_result['invalid_keys'])}")) diff --git a/common/library/modules/slurm_conf.py b/common/library/modules/slurm_conf.py index dcacbcae2f..78a4315244 100644 --- a/common/library/modules/slurm_conf.py +++ b/common/library/modules/slurm_conf.py @@ -234,7 +234,9 @@ def run_module(): replace = module.params['replace'] # Parse the slurm.conf file if module.params['op'] == 'parse': - s_dict = parse_slurm_conf(module.params['path'], conf_name, validate) + s_dict, dup_keys = parse_slurm_conf(module.params['path'], conf_name, validate) + if dup_keys: + module.fail_json(msg=f"Duplicate keys found in {module.params['path']}: {dup_keys}") result['conf_dict'] = s_dict elif module.params['op'] == 'render': s_list = read_dict2ini(module.params['conf_map']) @@ -247,7 +249,9 @@ def run_module(): elif isinstance(conf_source, str): if 
not os.path.exists(conf_source): raise FileNotFoundError(f"File {conf_source} does not exist") - s_dict = parse_slurm_conf(conf_source, conf_name, validate) + s_dict, dup_keys = parse_slurm_conf(conf_source, conf_name, validate) + if dup_keys: + module.fail_json(msg=f"Duplicate keys found in {conf_source}: {dup_keys}") conf_dict_list.append(OrderedDict(s_dict)) else: raise TypeError(f"Invalid type for conf_source: {type(conf_source)}") diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index a8fbc8e9c8..ad7ab09058 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -87,3 +87,10 @@ __default_config: DbdPort: "{{ slurm_dbd_port }}" gres: AutoDetect: nvml + acct_gather: {} + helpers: {} + job_container: {} + mpi: {} + oci: {} + topology: {} + burst_buffer: {} diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index fdf461f88c..91de036c3f 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -50,6 +50,7 @@ - configs_input is defined - configs_input - item.value is string + - item.key in conf_files - name: Build parsed_configs_input dictionary from parsed files ansible.builtin.set_fact: @@ -148,7 +149,7 @@ loop_control: loop_var: product -- name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS +- name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS /etc/default/slurmd ansible.builtin.set_fact: conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}" @@ -162,6 +163,17 @@ remote_src: "{{ copy_from_oim }}" loop: "{{ merged_conf.results }}" register: ctld_conf_files + when: + - item.ini_lines + +- name: Add extra confs which are not handled + ansible.builtin.include_tasks: handle_extra_confs.yml + when: + - 
configs_input is defined + - configs_input.keys() | difference(conf_files) | length > 0 + loop: "{{ configs_input.keys() | difference(conf_files) }}" + loop_control: + loop_var: extra_conf - name: Check if cluster running ansible.builtin.include_tasks: check_ctld_running.yml diff --git a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml new file mode 100644 index 0000000000..c7f1ae5bd5 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml @@ -0,0 +1,35 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Add extra confs which are not handled + slurm_conf: + op: merge + conf_sources: "{{ [configs_input[extra_conf]] }}" + conf_name: "{{ extra_conf }}" + register: ex_conf + delegate_to: localhost + when: + - "'.' not in extra_conf" + +- name: Write merged .conf + ansible.builtin.copy: + content: "{{ ex_conf.ini_lines | join('\n') }}\n" + dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ extra_conf }}.conf" + mode: "{{ conf_file_mode }}" + owner: "{{ slurm_user }}" + group: "{{ slurm_user_group }}" + remote_src: "{{ copy_from_oim }}" + when: + - "'.' 
not in extra_conf" + - ex_conf is success diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 939e3ac204..89166b1f12 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -21,6 +21,34 @@ conf_files: # Must match this MASTER list - slurmdbd - cgroup - gres + - acct_gather + - helpers + - job_container + - mpi + - oci + - topology + - burst_buffer + +# Supported configuration files are: + # slurm.conf + # slurmdbd.conf + # cgroup.conf + # gres.conf + # acct_gather.conf + # helpers.conf + # job_container.conf + # mpi.conf + # oci.conf + # topology.conf + # burst_buffer.conf + +# Non Conf files + # topology.yaml + # namespace.yaml + # plugstack.conf + # scrun.lua + # cli_filter.lua + copy_from_oim: false common_dir: - /etc/munge From 7db818aa3a7619cdf074335008a081ab7fe18972 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Mon, 9 Feb 2026 19:06:57 +0530 Subject: [PATCH 092/172] Lint fix --- discovery/roles/slurm_config/tasks/handle_extra_confs.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml index c7f1ae5bd5..307ca01723 100644 --- a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml +++ b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml @@ -19,7 +19,7 @@ conf_name: "{{ extra_conf }}" register: ex_conf delegate_to: localhost - when: + when: - "'.' not in extra_conf" - name: Write merged .conf @@ -30,6 +30,6 @@ owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" - when: + when: - "'.' 
not in extra_conf" - ex_conf is success From 919b5d379bf4dab07a7962f82b9bfa231d4ec8fc Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 9 Feb 2026 19:33:26 +0530 Subject: [PATCH 093/172] fix for variable scope --- .../roles/validate_input/tasks/main.yml | 14 +++++++------- .../roles/validate_input/vars/main.yml | 9 ++++++--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/input_validation/roles/validate_input/tasks/main.yml b/input_validation/roles/validate_input/tasks/main.yml index 6a1c773ee5..de6e9f48e9 100644 --- a/input_validation/roles/validate_input/tasks/main.yml +++ b/input_validation/roles/validate_input/tasks/main.yml @@ -17,12 +17,12 @@ omnia_run_tags: "{{ ansible_run_tags | default([]) }}" when: omnia_run_tags is not defined +- name: Set validation messages + ansible.builtin.set_fact: + validation_success_msg: "{{ messages.validation_success }}" + validation_error_msg: "{{ messages.validation_error }}" + - name: Validate omnia input config - vars: - # Note: When running a specific playbook without tags ansible run tags will default to ["all"], thus if two or more tags are present - # then the "all" tag should be removed so that only the config files related to that playbook are validated. 
- input_validate_tags: "{{ omnia_run_tags | default([]) | difference(['all']) if (omnia_run_tags | length) >= 2 - else omnia_run_tags | default([]) }}" block: - name: Run validation validate_input: @@ -35,8 +35,8 @@ - name: Debug validation status ansible.builtin.debug: - msg: "{{ messages.validation_success }}" + msg: "{{ validation_success_msg }}" rescue: - name: Failed due to validation failure ansible.builtin.fail: - msg: "{{ messages.validation_error }}" + msg: "{{ validation_error_msg }}" diff --git a/input_validation/roles/validate_input/vars/main.yml b/input_validation/roles/validate_input/vars/main.yml index 4655e7b25a..698eb4da29 100644 --- a/input_validation/roles/validate_input/vars/main.yml +++ b/input_validation/roles/validate_input/vars/main.yml @@ -16,8 +16,11 @@ input_dir: "{{ hostvars['localhost']['input_project_dir'] }}" project_name: "{{ hostvars['localhost']['project_name'] }}" +# Note: When running a specific playbook without tags ansible run tags will default to ["all"], thus if two or more tags are present +# then the "all" tag should be removed so that only the config files related to that playbook are validated. +input_validate_tags: "{{ omnia_run_tags | default([]) | difference(['all']) if (omnia_run_tags | length) >= 2 + else omnia_run_tags | default([]) }}" + messages: validation_success: "Successfully validated Omnia input config file(s)" - validation_error: > - Input validation failed. - For detailed validation errors, see: {{ ansible_failed_result.log_file }} + validation_error: "Input validation failed. Please check the validation output above for detailed error information." 
From eb0ce8e93f6145e084769224403c8a2e3d0503a3 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Mon, 9 Feb 2026 20:27:31 +0530 Subject: [PATCH 094/172] Update main.yml --- utils/roles/external_kafka_connect_details/tasks/main.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml index 207d93bfe6..6a387b1b46 100644 --- a/utils/roles/external_kafka_connect_details/tasks/main.yml +++ b/utils/roles/external_kafka_connect_details/tasks/main.yml @@ -19,6 +19,14 @@ changed_when: false failed_when: kubectl_check.rc != 0 +- name: Delete Kafka output directory (clean start) + ansible.builtin.file: + path: "{{ kafka_output_dir }}" + state: absent + delegate_to: localhost + connection: local + run_once: true + - name: Get Kafka pod status ansible.builtin.command: >- kubectl get pods -n {{ kafka_namespace }} From 42e552c6e61e1076231b4ba6ecd8a9b20717ecd5 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Mon, 9 Feb 2026 20:36:52 +0530 Subject: [PATCH 095/172] pylint fixes Signed-off-by: pullan1 --- common/library/modules/pulp_cleanup.py | 192 ++++++++++++------------- 1 file changed, 96 insertions(+), 96 deletions(-) diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index 217ca9b308..72fd11d692 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -28,8 +28,8 @@ import glob import json import subprocess -import time -from datetime import datetime +#import time +#from datetime import datetime from typing import Dict, List, Any, Tuple from ansible.module_utils.basic import AnsibleModule @@ -61,7 +61,7 @@ def format_pretty_table(results: List[Dict[str, Any]]) -> str: return "No cleanup results to display" headers = ["Name", "Type", "Status", "Message"] - + # Calculate column widths widths = [len(h) for h in headers] for r in results: @@ -73,9 +73,9 @@ def 
format_pretty_table(results: List[Dict[str, Any]]) -> str: # Build table border = "+" + "+".join("-" * (w + 2) for w in widths) + "+" header_row = "|" + "|".join(f" {h.ljust(w)} " for h, w in zip(headers, widths)) + "|" - + lines = [border, header_row, border] - + for r in results: msg = str(r.get('message', ''))[:40] row = "|" + "|".join([ @@ -86,7 +86,7 @@ def format_pretty_table(results: List[Dict[str, Any]]) -> str: f" {msg.ljust(widths[3])} " ]) + "|" lines.append(row) - + lines.append(border) return "\n".join(lines) @@ -100,7 +100,7 @@ def run_cmd(cmd: str, logger) -> Dict[str, Any]: try: result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=300) return {"rc": result.returncode, "stdout": result.stdout, "stderr": result.stderr} - except Exception as e: + except (subprocess.SubprocessError, OSError) as e: logger.error(f"Command failed: {cmd} - {e}") return {"rc": 1, "stdout": "", "stderr": str(e)} @@ -112,7 +112,7 @@ def safe_json_parse(data: str, default: Any = None) -> Any: """ if not data or not isinstance(data, str): return default if default is not None else [] - + try: decoder = json.JSONDecoder() parsed, _ = decoder.raw_decode(data.strip()) @@ -135,19 +135,19 @@ def validate_container_format(image_name: str) -> Tuple[bool, str]: """ if not image_name: return False, "Container image name cannot be empty" - + # Must contain at least one '/' to indicate registry/image format if '/' not in image_name: return False, f"Invalid format '{image_name}'. Must include registry (e.g., registry.k8s.io/pause, docker.io/library/busybox)" - + # Must have a registry part (contains '.' or is a known registry) parts = image_name.split('/') registry = parts[0] - + # Check if registry looks valid (contains dot or is localhost) if '.' not in registry and registry != 'localhost' and ':' not in registry: return False, f"Invalid registry '{registry}' in '{image_name}'. 
Registry must be a domain (e.g., docker.io, registry.k8s.io)" - + return True, "" @@ -209,7 +209,7 @@ def file_exists_in_status(name: str, base_path: str, logger) -> bool: """Check if file artifact exists in status files.""" try: for status_file in glob.glob(f"{base_path}/x86_64/*/status.csv"): - with open(status_file, 'r') as f: + with open(status_file, 'r', encoding='utf-8') as f: if name in f.read(): return True return False @@ -234,12 +234,12 @@ def get_all_repositories(logger) -> List[str]: def cleanup_repository(name: str, base_path: str, logger) -> Dict[str, Any]: """Cleanup a single RPM repository.""" result = {"name": name, "type": "repository", "status": "Failed", "message": ""} - + # Check existence if not repo_exists(name, logger): result["message"] = "Repository not found" return result - + try: # Delete distributions dist_list = run_cmd(pulp_rpm_commands["list_distributions"], logger) @@ -248,20 +248,20 @@ def cleanup_repository(name: str, base_path: str, logger) -> Dict[str, Any]: for d in dists: if d.get('name', '') == name or name in d.get('name', ''): run_cmd(pulp_rpm_commands["delete_distribution"] % d.get('name', ''), logger) - + # Delete publications pub_list = run_cmd(pulp_rpm_commands["list_publications"] % name, logger) if pub_list["rc"] == 0: pubs = safe_json_parse(pub_list["stdout"]) for p in pubs: run_cmd(pulp_rpm_commands["delete_publication"] % p.get('pulp_href', ''), logger) - + # Delete remote run_cmd(pulp_rpm_commands["delete_remote"] % name, logger) - + # Delete repository del_result = run_cmd(pulp_rpm_commands["delete_repository"] % name, logger) - + if del_result["rc"] == 0: result["status"] = "Success" result["message"] = "Repository deleted" @@ -271,10 +271,10 @@ def cleanup_repository(name: str, base_path: str, logger) -> Dict[str, Any]: mark_software_partial(affected, base_path, logger, 'repository') else: result["message"] = f"Delete failed: {del_result['stderr']}" - + except Exception as e: result["message"] = f"Error: 
{str(e)}" - + return result @@ -285,21 +285,21 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any] user_input: User-provided image name (e.g., registry.k8s.io/pause) """ result = {"name": user_input, "type": "container", "status": "Failed", "message": ""} - + # Validate format is_valid, error_msg = validate_container_format(user_input) if not is_valid: result["message"] = error_msg return result - + # Convert to Pulp naming convention pulp_name = convert_to_pulp_container_name(user_input) - + # Check existence if not container_exists(pulp_name, logger): result["message"] = f"Container not found in Pulp (looked for: {pulp_name})" return result - + try: # Delete distributions dist_list = run_cmd(pulp_container_commands["list_distributions"], logger) @@ -308,10 +308,10 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any] for d in dists: if d.get('name', '') == pulp_name: run_cmd(pulp_container_commands["delete_distribution"] % d.get('name', ''), logger) - + # Delete repository del_result = run_cmd(pulp_container_commands["delete_repository"] % pulp_name, logger) - + if del_result["rc"] == 0: result["status"] = "Success" result["message"] = "Container deleted" @@ -320,10 +320,10 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any] mark_software_partial(affected, base_path, logger, 'image') else: result["message"] = f"Delete failed: {del_result['stderr']}" - + except Exception as e: result["message"] = f"Error: {str(e)}" - + return result @@ -338,7 +338,7 @@ def file_exists_in_pulp(name: str, logger) -> Tuple[bool, str, str]: repo_list = run_cmd(pulp_file_commands["list_repositories"], logger) if repo_list["rc"] != 0: return False, "", "" - + repos = safe_json_parse(repo_list["stdout"]) for repo in repos: repo_name = repo.get('name', '') @@ -351,9 +351,9 @@ def file_exists_in_pulp(name: str, logger) -> Tuple[bool, str, str]: contents = safe_json_parse(content_list["stdout"]) if 
contents: return True, repo_name, contents[0].get('pulp_href', '') - + return False, "", "" - except Exception: + except (OSError, ValueError): return False, "", "" @@ -365,7 +365,7 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) """ try: messages = [] - + # 1. Remove content from repository if content_href: remove_result = run_cmd( @@ -380,7 +380,7 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) f"pulp file repository content modify --repository {repo_name} --remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'", logger ) - + # 2. Delete distribution if exists dist_result = run_cmd(pulp_file_commands["list_distributions"], logger) if dist_result["rc"] == 0: @@ -389,14 +389,14 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) if d.get('name', '') == name or name in d.get('name', ''): run_cmd(pulp_file_commands["delete_distribution"] % d.get('name', ''), logger) messages.append("Distribution deleted") - + # 3. 
Try to delete the file repository if it's named after the artifact repo_del = run_cmd(pulp_file_commands["delete_repository"] % name, logger) if repo_del["rc"] == 0: messages.append("Repository deleted") - + return True, "; ".join(messages) if messages else "Removed from Pulp" - + except Exception as e: return False, f"Pulp deletion error: {str(e)}" @@ -410,7 +410,7 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: result = {"name": name, "type": "pip_module", "status": "Failed", "message": ""} messages = [] pulp_deleted = False - + try: # Pulp Python repo name format: pip_module # User input could be "cffi==1.17.1" or "pip_modulecffi==1.17.1" @@ -418,24 +418,24 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: pulp_repo_name = name else: pulp_repo_name = f"pip_module{name}" - + logger.info(f"Looking for Python repository: {pulp_repo_name}") - + # Check if repository exists repo_check = run_cmd(pulp_python_commands["show_repository"] % pulp_repo_name, logger) - + if repo_check["rc"] == 0: # Delete distribution first dist_del = run_cmd(pulp_python_commands["delete_distribution"] % pulp_repo_name, logger) if dist_del["rc"] == 0: messages.append("Distribution deleted") - + # Delete repository repo_del = run_cmd(pulp_python_commands["delete_repository"] % pulp_repo_name, logger) if repo_del["rc"] == 0: pulp_deleted = True messages.append("Repository deleted") - + # Run orphan cleanup if pulp_deleted: logger.info("Running orphan cleanup...") @@ -451,33 +451,33 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: repo_name = repo.get('name', '') if name in repo_name or repo_name == pulp_repo_name: logger.info(f"Found matching Python repository: {repo_name}") - + dist_del = run_cmd(pulp_python_commands["delete_distribution"] % repo_name, logger) if dist_del["rc"] == 0: messages.append("Distribution deleted") - + repo_del = run_cmd(pulp_python_commands["delete_repository"] % repo_name, 
logger) if repo_del["rc"] == 0: pulp_deleted = True messages.append("Repository deleted") break - + # Update status files if file_exists_in_status(name, base_path, logger): affected = remove_from_status_files(name, 'pip_module', base_path, logger) if affected: messages.append("Status files updated") mark_software_partial(affected, base_path, logger, 'pip_module') - + if pulp_deleted: result["status"] = "Success" result["message"] = "; ".join(messages) if messages else "Cleaned up" else: result["message"] = f"pip_module '{name}' not found in Pulp" - + except Exception as e: result["message"] = f"Error: {str(e)}" - + return result @@ -505,21 +505,21 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - messages = [] pulp_deleted = False status_removed = False - + try: # Get the expected Pulp repository name pulp_repo_name = get_pulp_file_repo_name(name, file_type) logger.info(f"Looking for {file_type} repository: {pulp_repo_name}") - + # Check if repository exists directly repo_check = run_cmd(pulp_file_commands["show_repository"] % pulp_repo_name, logger) - + if repo_check["rc"] == 0: # Found exact match - delete distribution and repository dist_del = run_cmd(pulp_file_commands["delete_distribution"] % pulp_repo_name, logger) if dist_del["rc"] == 0: messages.append("Distribution deleted") - + repo_del = run_cmd(pulp_file_commands["delete_repository"] % pulp_repo_name, logger) if repo_del["rc"] == 0: pulp_deleted = True @@ -533,17 +533,17 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - repo_name = repo.get('name', '') if name in repo_name or repo_name == pulp_repo_name: logger.info(f"Found matching repository: {repo_name}") - + dist_del = run_cmd(pulp_file_commands["delete_distribution"] % repo_name, logger) if dist_del["rc"] == 0: messages.append("Distribution deleted") - + repo_del = run_cmd(pulp_file_commands["delete_repository"] % repo_name, logger) if repo_del["rc"] == 0: pulp_deleted = True 
messages.append("Repository deleted") break - + # Run orphan cleanup to remove actual content files if pulp_deleted: logger.info("Running orphan cleanup to remove content files...") @@ -552,7 +552,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - messages.append("Orphan cleanup completed") else: logger.warning(f"Orphan cleanup warning: {orphan_result['stderr']}") - + # Update status files if file_exists_in_status(name, base_path, logger): affected = remove_from_status_files(name, file_type, base_path, logger) @@ -560,17 +560,17 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - status_removed = True messages.append("Status files updated") mark_software_partial(affected, base_path, logger, file_type) - + # Determine overall result if pulp_deleted or status_removed: result["status"] = "Success" result["message"] = "; ".join(messages) if messages else "Cleaned up" else: result["message"] = f"{file_type} '{name}' not found in Pulp or status files" - + except Exception as e: result["message"] = f"Error: {str(e)}" - + return result @@ -582,11 +582,11 @@ def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]: - tarball, git, manifest, ansible_galaxy_collection: Pulp File repository """ file_type = detect_file_type(name) - + # Handle pip modules separately - they use Python repositories if file_type == "pip_module": return cleanup_pip_module(name, base_path, logger) - + # All other file types use Pulp File repository return cleanup_file_repository(name, file_type, base_path, logger) @@ -616,20 +616,20 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[ rows = [] removed = False has_repo_column = False - + # Check if file has repo_name column - with open(status_file, 'r') as f: + with open(status_file, 'r', encoding='utf-8') as f: header = f.readline().strip().lower() has_repo_column = "repo_name" in header - - with open(status_file, 'r') as f: + + with 
open(status_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) fieldnames = reader.fieldnames for row in reader: name = row.get('name', '') row_type = row.get('type', '') rpm_repo = row.get('repo_name', '') - + logger.info(f"Processing row: {row}") # For RPMs, check if they belong to the deleted repository if row_type == 'rpm': @@ -640,18 +640,18 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[ rows.append(row) else: rows.append(row) - + if removed and fieldnames: - with open(status_file, 'w', newline='') as f: + with open(status_file, 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(rows) - + # Track affected software software_name = os.path.basename(os.path.dirname(status_file)) if software_name not in affected_software: affected_software.append(software_name) - + return affected_software except Exception as e: logger.error(f"Failed to remove RPMs from repository {repo_name}: {e}") @@ -676,7 +676,7 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"): rows = [] removed = False - with open(status_file, 'r') as f: + with open(status_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) fieldnames = reader.fieldnames for row in reader: @@ -696,13 +696,13 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: logger.info(f"Removing '{name}' from {status_file}") else: rows.append(row) - + if removed and fieldnames: - with open(status_file, 'w', newline='') as f: + with open(status_file, 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(rows) - + # Track affected software software_name = os.path.basename(os.path.dirname(status_file)) if software_name not in arch_affected: @@ -710,10 +710,10 @@ def remove_from_status_files(artifact_name: 
str, artifact_type: str, base_path: if arch_affected: affected_software[arch] = arch_affected - - logger.info(f"remove_from_status_files returning: {affected_software}") + + logger.info(f"remove_from_status_files returning: {affected_software}") return affected_software - except Exception as e: + except OSError as e: logger.error(f"Failed to remove from status files: {e}") return {} @@ -738,7 +738,7 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty arch_software_map = {arch: affected_software for arch in ARCH_SUFFIXES} else: arch_software_map = affected_software - + try: for arch, software_names in arch_software_map.items(): if not software_names: @@ -752,7 +752,7 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty rows = [] updated = False - with open(software_file, 'r') as f: + with open(software_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) fieldnames = reader.fieldnames for row in reader: @@ -761,14 +761,14 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty updated = True logger.info(f"Marked '{row.get('name')}' as partial in {arch}/software.csv ({artifact_type} cleanup)") rows.append(row) - + if fieldnames and rows and updated: - with open(software_file, 'w', newline='') as f: + with open(software_file, 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(rows) logger.info(f"Successfully wrote updated software.csv for {arch}") - except Exception as e: + except OSError as e: logger.error(f"Failed to update software.csv: {e}") def software_has_rpms(software_name: str, arch: str, base_path: str, logger) -> bool: @@ -786,15 +786,15 @@ def software_has_rpms(software_name: str, arch: str, base_path: str, logger) -> status_file = f"{base_path}/{arch}/{software_name}/status.csv" if not os.path.exists(status_file): return False - + try: - with open(status_file, 'r') as f: + with 
open(status_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: if row.get('type', '').lower() == 'rpm': return True return False - except Exception as e: + except OSError as e: logger.error(f"Error checking RPMs for {software_name}: {e}") return False @@ -815,14 +815,14 @@ def mark_all_software_partial(base_path: str, logger): for arch in ARCH_SUFFIXES: software_file = f"{base_path}/{arch}/software.csv" logger.info(f"Processing software file: {software_file}") - + if not os.path.exists(software_file): logger.info(f"Software file not found: {software_file}") continue - + rows = [] updated = False - with open(software_file, 'r') as f: + with open(software_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) fieldnames = reader.fieldnames for row in reader: @@ -836,26 +836,26 @@ def mark_all_software_partial(base_path: str, logger): else: logger.info(f"Skipping '{software_name}' - no RPM dependencies") rows.append(row) - + if fieldnames and rows and updated: - with open(software_file, 'w', newline='') as f: + with open(software_file, 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(rows) logger.info(f"Successfully updated {software_file}") - except Exception as e: + except OSError as e: logger.error(f"Failed to mark all software as partial: {e}") def write_cleanup_status(results: List[Dict], base_path: str): """Write cleanup results to status file.""" status_file = f"{base_path}/cleanup_status.csv" os.makedirs(os.path.dirname(status_file), exist_ok=True) - - with open(status_file, 'w', newline='') as f: + + with open(status_file, 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=['name', 'type', 'status', 'message']) writer.writeheader() writer.writerows(results) - + return status_file @@ -884,7 +884,7 @@ def run_module(): log_dir = os.path.join(base_path, "cleanup") os.makedirs(base_path, exist_ok=True) logger = 
setup_standard_logger(log_dir) - + # Handle 'all' keyword for repositories only cleanup_all_repos = cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all' #if cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all': From 2f452e9f543e10a8c1abbbb457f87e43f12f1a56 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Mon, 9 Feb 2026 22:02:38 +0530 Subject: [PATCH 096/172] kafka and victoria update --- .../tasks/main.yml | 30 +++++++----- .../vars/main.yml | 12 +++++ .../tasks/main.yml | 46 ++++++++++++++----- .../vars/main.yml | 19 ++++++++ 4 files changed, 83 insertions(+), 24 deletions(-) diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml index 6a387b1b46..a99fba33b3 100644 --- a/utils/roles/external_kafka_connect_details/tasks/main.yml +++ b/utils/roles/external_kafka_connect_details/tasks/main.yml @@ -181,21 +181,27 @@ [ 'Kafka connection details written to: ' ~ kafka_output_file, '', - 'Kafka external endpoint: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port, + '[IMPORTANT] Kafka external endpoint: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port, '', - 'TLS:', - ' CA: ' ~ kafka_output_dir ~ '/ca.crt', + '[IMPORTANT] TLS files (on OIM host):', + ' CA (server certificate for OME): ' ~ kafka_output_dir ~ '/ca.crt', ' client cert: ' ~ kafka_output_dir ~ '/user.crt', - ' client key: ' ~ kafka_output_dir ~ '/user.key', + ' client key: ' ~ kafka_output_dir ~ '/user.key', '', - 'OME note (mTLS):', - ' Use ca.crt as the server certificate in OME.', - ' Create a client certificate in .pfx format (provide a passphrase when prompted):', - ' cd ' ~ kafka_output_dir, - ' openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt', - ' Use user.pfx as the client certificate in OME.', - ' If you are using the OME UI from a different system than the OIM host,', - ' copy ca.crt and user.pfx from the OIM host to that system before 
selecting/uploading them in the UI.', + 'OME steps (mTLS):', + ' [STEP 1] Create client certificate in .pfx format (passphrase required):', + ' cd ' ~ kafka_output_dir, + ' openssl pkcs12 -export -out user.pfx -inkey user.key -in user.crt', + ' [STEP 2] ' ~ kafka_ome_cross_machine_note_line1, + ' ' ~ kafka_ome_cross_machine_note_line2, + ' [STEP 3] In the OME UI, navigate to:', + ' ' ~ kafka_ome_ui_navigation_line1, + ' ' ~ kafka_ome_ui_navigation_line2, + ' [STEP 4] Click: ' ~ kafka_ome_ui_enable_label, + ' [STEP 5] Set Kafka Bootstrap Server to: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port, + ' [STEP 6] Set Authentication Mode to: ' ~ kafka_ome_auth_mode_value, + ' [STEP 7] ' ~ kafka_ome_server_cert_note, + ' [STEP 8] ' ~ kafka_ome_client_cert_note, '' ] }} diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml index d0bd070d47..295a9ad5df 100644 --- a/utils/roles/external_kafka_connect_details/vars/main.yml +++ b/utils/roles/external_kafka_connect_details/vars/main.yml @@ -28,3 +28,15 @@ kafka_err_pods_not_ready: "One or more Kafka pods are not Ready." kafka_err_external_ip_missing: >- Failed to fetch Kafka LoadBalancer external IP. Ensure service '{{ kafka_lb_service_name }}' exists in namespace '{{ kafka_namespace }}' and has an external IP assigned. + +kafka_ome_ui_navigation_line1: "Configuration -> Remote Connectivity" +kafka_ome_ui_navigation_line2: "Remote Telemetry Configuration -> Kafka Connectivity" +kafka_ome_ui_enable_label: "Enable Kafka Connectivity" +kafka_ome_auth_mode_value: "SSL" + +kafka_ome_server_cert_note: "Upload ca.crt as the server certificate in OME." +kafka_ome_client_cert_note: "Upload user.pfx as the client certificate in OME (mTLS)." +kafka_ome_cross_machine_note_line1: >- + If OME UI is accessed from a different system than the OIM host, +kafka_ome_cross_machine_note_line2: >- + copy ca.crt and user.pfx to that system before uploading them in the UI. 
diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 38b0ce3045..14e83f7688 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -164,8 +164,7 @@ vminsert_port: "{{ (vminsert_lb_port.stdout | trim) | default('') }}" vmselect_port: "{{ (vmselect_lb_port.stdout | trim) | default('') }}" victoria_tls_ca: "{{ victoria_tls_cert_dir }}/ca.crt" - victoria_tls_cert: "{{ victoria_tls_cert_dir }}/server.crt" - victoria_tls_key: "{{ victoria_tls_cert_dir }}/server.key" + - name: Fail when LoadBalancer IPs are not available ansible.builtin.fail: @@ -238,12 +237,22 @@ query_endpoint: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query" ui_url: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui" tls: - server_crt: "{{ victoria_tls_cert }}" + ca_crt: "{{ victoria_tls_ca }}" notes: sfm: vminsert_write_url: "{{ victoria_vminsert_write_url }}" hosts_entry: "{{ victoria_sfm_hosts_entry }}" hosts_entry_vmselect: "{{ victoria_sfm_hosts_entry_vmselect }}" + ui_navigation: "{{ victoria_sfm_ui_navigation }}" + remote_write_target_name: "{{ victoria_sfm_remote_write_target_name }}" + remote_write_message_version: "{{ victoria_sfm_remote_write_message_version }}" + remote_write_enable_value: "{{ victoria_sfm_remote_write_enable_value }}" + tls_server_cert_file_name: "{{ victoria_sfm_tls_server_cert_file_name }}" + tls_server_cert_file_path: "{{ victoria_tls_ca }}" + ssh_note: "{{ victoria_sfm_ssh_note }}" + hosts_scope_note: "{{ victoria_sfm_hosts_scope_note }}" + pod_shell_command_example: "{{ victoria_sfm_pod_shell_command_example }}" + hosts_restart_note: "{{ victoria_sfm_hosts_restart_note }}" - name: Ensure output directory exists ansible.builtin.file: @@ -273,20 +282,33 @@ 'Mode: ' ~ victoria_deployment_mode, '', 'Endpoints:', - ' vminsert write: ' ~ 
victoria_vminsert_write_url, + ' [IMPORTANT] vminsert write: ' ~ victoria_vminsert_write_url, ' vmselect query: ' ~ victoria_vmselect_query_url, ' vmselect UI: ' ~ victoria_vmselect_ui_url, '', 'TLS:', - ' server.crt: ' ~ victoria_tls_cert, + ' ca.crt: ' ~ victoria_tls_ca, '', - 'SFM note:', - ' Use vminsert write URL for SFM: ' ~ victoria_vminsert_write_url, - ' Add these entries to /etc/hosts on the SFM server:', - ' ' ~ victoria_sfm_hosts_entry_vminsert_display, - ' ' ~ victoria_sfm_hosts_entry_vmselect_display, - ' If you are using the SFM UI from a different system than the OIM host,', - ' copy server.crt from the OIM host to that system before selecting/uploading it in the UI.' + 'SFM steps (TLS):', + ' [STEP 1] ' ~ victoria_sfm_cross_machine_tls_note_line1, + ' ' ~ victoria_sfm_cross_machine_tls_note_line2, + ' [STEP 2] In the SFM UI, update the vminsert URL:', + ' ' ~ victoria_sfm_ui_navigation, + ' Edit target: ' ~ victoria_sfm_remote_write_target_name, + ' Set Enable to: ' ~ victoria_sfm_remote_write_enable_value, + ' Set URL to: ' ~ victoria_vminsert_write_url, + ' Set Message Version to: ' ~ victoria_sfm_remote_write_message_version, + ' TLS Config: Upload ' ~ victoria_sfm_tls_server_cert_file_name, + ' as ' ~ victoria_sfm_tls_server_cert_file_label ~ ': ' ~ victoria_tls_ca, + ' [STEP 3] ' ~ victoria_sfm_ssh_note, + ' [STEP 4] Update /etc/hosts only inside the SFM Prometheus pod:', + ' ' ~ victoria_sfm_hosts_scope_note, + ' ' ~ victoria_sfm_pod_shell_command_example, + ' Add these entries inside the pod:', + ' ' ~ victoria_sfm_hosts_entry_vminsert_display, + ' ' ~ victoria_sfm_hosts_entry_vmselect_display, + ' [NOTE] ' ~ victoria_sfm_hosts_restart_note, + '' ] }} delegate_to: localhost diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml index ea1c083deb..c033adaa1c 100644 --- a/utils/roles/external_victoria_connect_details/vars/main.yml +++ 
b/utils/roles/external_victoria_connect_details/vars/main.yml @@ -29,3 +29,22 @@ victoria_err_pods_not_ready: "One or more Victoria pods are not Ready." victoria_err_lb_missing: >- Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect' exist in namespace '{{ victoria_namespace }}' and have external IPs assigned. + +victoria_sfm_ui_navigation: "Observability -> Settings -> Prometheus Remote Write" +victoria_sfm_remote_write_target_name: "victoria" +victoria_sfm_remote_write_message_version: "v1" +victoria_sfm_remote_write_enable_value: "ON" + +victoria_sfm_ssh_note: "SSH to the SFM IP with admin credentials." +victoria_sfm_hosts_scope_note: >- + /etc/hosts update is required only inside the SFM Prometheus pod (not on the SFM server host). +victoria_sfm_pod_shell_command_example: >- + kubectl exec -it sfm-prometheus-deployment-xxxxx-xx -n sfm-1 -- /bin/sh +victoria_sfm_hosts_restart_note: "Repeat /etc/hosts update if the SFM pod restarts." +victoria_sfm_cross_machine_tls_note_line1: >- + If using the SFM UI from a different system than the OIM host, +victoria_sfm_cross_machine_tls_note_line2: >- + copy ca.crt to that system before uploading it in the UI. 
+ +victoria_sfm_tls_server_cert_file_label: "Server Certificate File" +victoria_sfm_tls_server_cert_file_name: "ca.crt" From 24734898f1b9987dcff6ab8ef2f370a3f03c7cba Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Mon, 9 Feb 2026 22:05:03 +0530 Subject: [PATCH 097/172] update kafka --- utils/roles/external_kafka_connect_details/tasks/main.yml | 1 - utils/roles/external_kafka_connect_details/vars/main.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml index a99fba33b3..96c6d0ca5f 100644 --- a/utils/roles/external_kafka_connect_details/tasks/main.yml +++ b/utils/roles/external_kafka_connect_details/tasks/main.yml @@ -196,7 +196,6 @@ ' ' ~ kafka_ome_cross_machine_note_line2, ' [STEP 3] In the OME UI, navigate to:', ' ' ~ kafka_ome_ui_navigation_line1, - ' ' ~ kafka_ome_ui_navigation_line2, ' [STEP 4] Click: ' ~ kafka_ome_ui_enable_label, ' [STEP 5] Set Kafka Bootstrap Server to: ' ~ kafka_external_ip ~ ':' ~ kafka_external_port, ' [STEP 6] Set Authentication Mode to: ' ~ kafka_ome_auth_mode_value, diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml index 295a9ad5df..ff257328a8 100644 --- a/utils/roles/external_kafka_connect_details/vars/main.yml +++ b/utils/roles/external_kafka_connect_details/vars/main.yml @@ -30,7 +30,6 @@ kafka_err_external_ip_missing: >- exists in namespace '{{ kafka_namespace }}' and has an external IP assigned. 
kafka_ome_ui_navigation_line1: "Configuration -> Remote Connectivity" -kafka_ome_ui_navigation_line2: "Remote Telemetry Configuration -> Kafka Connectivity" kafka_ome_ui_enable_label: "Enable Kafka Connectivity" kafka_ome_auth_mode_value: "SSL" From 3db2b320e90abebd38fdc6d46fa77a363d7d4a13 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Mon, 9 Feb 2026 22:10:52 +0530 Subject: [PATCH 098/172] Update main.yml --- utils/roles/external_victoria_connect_details/tasks/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 14e83f7688..e06b061828 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -176,7 +176,7 @@ ansible.builtin.set_fact: victoria_sfm_hosts_entry: >- {{ - 'echo "' ~ (vminsert_lb_ip.stdout | trim) ~ ' vminsert.' ~ victoria_namespace ~ '.svc.cluster.local" >> /etc/hosts' + 'echo ' ~ (vminsert_lb_ip.stdout | trim) ~ ' vminsert.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts' if (vminsert_lb_ip.stdout | trim | length) > 0 else '' }} @@ -185,7 +185,7 @@ ansible.builtin.set_fact: victoria_sfm_hosts_entry_vmselect: >- {{ - 'echo "' ~ (vmselect_lb_ip.stdout | trim) ~ ' vmselect.' ~ victoria_namespace ~ '.svc.cluster.local" >> /etc/hosts' + 'echo ' ~ (vmselect_lb_ip.stdout | trim) ~ ' vmselect.' 
~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts' if (vmselect_lb_ip.stdout | trim | length) > 0 else '' }} From 6cfcb7b7fd67f8633480bd359b19e95c59b77532 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Tue, 10 Feb 2026 09:56:35 +0530 Subject: [PATCH 099/172] copyright info updated Signed-off-by: pullan1 --- common/library/modules/process_rpm_config.py | 2 +- common/library/modules/pulp_cleanup.py | 4 +--- local_repo/pulp_cleanup.yml | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/common/library/modules/process_rpm_config.py b/common/library/modules/process_rpm_config.py index 89a8f0e1ca..550d0c078f 100644 --- a/common/library/modules/process_rpm_config.py +++ b/common/library/modules/process_rpm_config.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index 72fd11d692..6f80e82f83 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,8 +28,6 @@ import glob import json import subprocess -#import time -#from datetime import datetime from typing import Dict, List, Any, Tuple from ansible.module_utils.basic import AnsibleModule diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml index f999b3a2dc..5d409bbc1f 100644 --- a/local_repo/pulp_cleanup.yml +++ b/local_repo/pulp_cleanup.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. 
All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 7790c2d1b238464db4fb41a62a7fcd61172df965 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Tue, 10 Feb 2026 05:39:01 +0000 Subject: [PATCH 100/172] fixed lint issues --- .../tasks/extract_path_overrides.yml | 118 ++++++++++++++---- 1 file changed, 96 insertions(+), 22 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml index 45565dc4e7..ab1bf17aa6 100644 --- a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml +++ b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml @@ -34,21 +34,64 @@ - name: Extract effective controller directories from slurm.conf ansible.builtin.set_fact: - slurm_ctld_log_dir_effective: "{{ (slurm_merged_dict.get('SlurmctldLogFile', ['/var/log/slurm/slurmctld.log']) | first if slurm_merged_dict.get('SlurmctldLogFile') is iterable and slurm_merged_dict.get('SlurmctldLogFile') is not string else slurm_merged_dict.get('SlurmctldLogFile', '/var/log/slurm/slurmctld.log')) | dirname }}" - slurm_state_save_location_effective: "{{ (slurm_merged_dict.get('StateSaveLocation', ['/var/spool/slurmctld']) | first if slurm_merged_dict.get('StateSaveLocation') is iterable and slurm_merged_dict.get('StateSaveLocation') is not string else slurm_merged_dict.get('StateSaveLocation', '/var/spool/slurmctld')) }}" - slurm_ctld_pid_dir_effective: "{{ (slurm_merged_dict.get('SlurmctldPidFile', ['/var/run/slurmctld.pid']) | first if slurm_merged_dict.get('SlurmctldPidFile') is iterable and slurm_merged_dict.get('SlurmctldPidFile') is not string else slurm_merged_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid')) | dirname }}" - slurm_sched_log_dir_effective: "{{ ((slurm_merged_dict.get('SlurmSchedLogFile', ['']) | 
first if slurm_merged_dict.get('SlurmSchedLogFile') is iterable and slurm_merged_dict.get('SlurmSchedLogFile') is not string else slurm_merged_dict.get('SlurmSchedLogFile', '')) | default('', true) | dirname | default('', true)) }}" + slurm_ctld_log_dir_effective: >- + {{ (slurm_merged_dict.get('SlurmctldLogFile', ['/var/log/slurm/slurmctld.log']) + | first if slurm_merged_dict.get('SlurmctldLogFile') is iterable + and slurm_merged_dict.get('SlurmctldLogFile') is not string + else slurm_merged_dict.get('SlurmctldLogFile', '/var/log/slurm/slurmctld.log')) + | dirname }} + slurm_state_save_location_effective: >- + {{ (slurm_merged_dict.get('StateSaveLocation', ['/var/spool/slurmctld']) + | first if slurm_merged_dict.get('StateSaveLocation') is iterable + and slurm_merged_dict.get('StateSaveLocation') is not string + else slurm_merged_dict.get('StateSaveLocation', '/var/spool/slurmctld')) }} + slurm_ctld_pid_dir_effective: >- + {{ (slurm_merged_dict.get('SlurmctldPidFile', ['/var/run/slurmctld.pid']) + | first if slurm_merged_dict.get('SlurmctldPidFile') is iterable + and slurm_merged_dict.get('SlurmctldPidFile') is not string + else slurm_merged_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid')) + | dirname }} + slurm_sched_log_dir_effective: >- + {{ ((slurm_merged_dict.get('SlurmSchedLogFile', ['']) + | first if slurm_merged_dict.get('SlurmSchedLogFile') is iterable + and slurm_merged_dict.get('SlurmSchedLogFile') is not string + else slurm_merged_dict.get('SlurmSchedLogFile', '')) + | default('', true) | dirname | default('', true)) }} when: slurm_merged_dict is defined # ── slurm.conf: compute path params ────────────────────────────────── - name: Extract effective compute directories from slurm.conf ansible.builtin.set_fact: - slurm_slurmd_log_dir_effective: "{{ (slurm_merged_dict.get('SlurmdLogFile', ['/var/log/slurm/slurmd.log']) | first if slurm_merged_dict.get('SlurmdLogFile') is iterable and slurm_merged_dict.get('SlurmdLogFile') is not string else 
slurm_merged_dict.get('SlurmdLogFile', '/var/log/slurm/slurmd.log')) | dirname }}" - slurm_slurmd_spool_dir_effective: "{{ (slurm_merged_dict.get('SlurmdSpoolDir', ['/var/spool/slurmd']) | first if slurm_merged_dict.get('SlurmdSpoolDir') is iterable and slurm_merged_dict.get('SlurmdSpoolDir') is not string else slurm_merged_dict.get('SlurmdSpoolDir', '/var/spool/slurmd')) }}" - slurm_slurmd_pid_dir_effective: "{{ (slurm_merged_dict.get('SlurmdPidFile', ['/var/run/slurmd.pid']) | first if slurm_merged_dict.get('SlurmdPidFile') is iterable and slurm_merged_dict.get('SlurmdPidFile') is not string else slurm_merged_dict.get('SlurmdPidFile', '/var/run/slurmd.pid')) | dirname }}" - slurm_epilog_dir_effective: "{{ (slurm_merged_dict.get('Epilog', ['/etc/slurm/epilog.d/logout_user.sh']) | first if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string else slurm_merged_dict.get('Epilog', '/etc/slurm/epilog.d/logout_user.sh')) | dirname }}" - slurm_prolog_dir_effective: "{{ ((slurm_merged_dict.get('Prolog', ['']) | first if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string else slurm_merged_dict.get('Prolog', '')) | default('', true) | dirname | default('', true)) }}" + slurm_slurmd_log_dir_effective: >- + {{ (slurm_merged_dict.get('SlurmdLogFile', ['/var/log/slurm/slurmd.log']) + | first if slurm_merged_dict.get('SlurmdLogFile') is iterable + and slurm_merged_dict.get('SlurmdLogFile') is not string + else slurm_merged_dict.get('SlurmdLogFile', '/var/log/slurm/slurmd.log')) + | dirname }} + slurm_slurmd_spool_dir_effective: >- + {{ (slurm_merged_dict.get('SlurmdSpoolDir', ['/var/spool/slurmd']) + | first if slurm_merged_dict.get('SlurmdSpoolDir') is iterable + and slurm_merged_dict.get('SlurmdSpoolDir') is not string + else slurm_merged_dict.get('SlurmdSpoolDir', '/var/spool/slurmd')) }} + slurm_slurmd_pid_dir_effective: >- + {{ (slurm_merged_dict.get('SlurmdPidFile', ['/var/run/slurmd.pid']) 
+ | first if slurm_merged_dict.get('SlurmdPidFile') is iterable + and slurm_merged_dict.get('SlurmdPidFile') is not string + else slurm_merged_dict.get('SlurmdPidFile', '/var/run/slurmd.pid')) + | dirname }} + slurm_epilog_dir_effective: >- + {{ (slurm_merged_dict.get('Epilog', ['/etc/slurm/epilog.d/logout_user.sh']) + | first if slurm_merged_dict.get('Epilog') is iterable + and slurm_merged_dict.get('Epilog') is not string + else slurm_merged_dict.get('Epilog', '/etc/slurm/epilog.d/logout_user.sh')) + | dirname }} + slurm_prolog_dir_effective: >- + {{ ((slurm_merged_dict.get('Prolog', ['']) + | first if slurm_merged_dict.get('Prolog') is iterable + and slurm_merged_dict.get('Prolog') is not string + else slurm_merged_dict.get('Prolog', '')) + | default('', true) | dirname | default('', true)) }} when: slurm_merged_dict is defined # ── slurm.conf: all epilog/prolog dirs and custom file paths ───────── @@ -56,12 +99,16 @@ - name: Extract all epilog paths from merged Epilog list ansible.builtin.set_fact: slurm_epilog_paths_all: >- - {{ (slurm_merged_dict.get('Epilog', []) if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string - else [slurm_merged_dict.get('Epilog', '')]) + {{ (slurm_merged_dict.get('Epilog', []) + if slurm_merged_dict.get('Epilog') is iterable + and slurm_merged_dict.get('Epilog') is not string + else [slurm_merged_dict.get('Epilog', '')]) | reject('equalto', '') | list }} slurm_epilog_dirs_all: >- - {{ (slurm_merged_dict.get('Epilog', []) if slurm_merged_dict.get('Epilog') is iterable and slurm_merged_dict.get('Epilog') is not string - else [slurm_merged_dict.get('Epilog', '')]) + {{ (slurm_merged_dict.get('Epilog', []) + if slurm_merged_dict.get('Epilog') is iterable + and slurm_merged_dict.get('Epilog') is not string + else [slurm_merged_dict.get('Epilog', '')]) | map('dirname') | unique | reject('equalto', '') | list }} when: slurm_merged_dict is defined @@ -74,12 +121,16 @@ - name: Extract all prolog 
paths from merged Prolog list ansible.builtin.set_fact: slurm_prolog_paths_all: >- - {{ (slurm_merged_dict.get('Prolog', []) if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string - else [slurm_merged_dict.get('Prolog', '')]) + {{ (slurm_merged_dict.get('Prolog', []) + if slurm_merged_dict.get('Prolog') is iterable + and slurm_merged_dict.get('Prolog') is not string + else [slurm_merged_dict.get('Prolog', '')]) | reject('equalto', '') | list }} slurm_prolog_dirs_all: >- - {{ (slurm_merged_dict.get('Prolog', []) if slurm_merged_dict.get('Prolog') is iterable and slurm_merged_dict.get('Prolog') is not string - else [slurm_merged_dict.get('Prolog', '')]) + {{ (slurm_merged_dict.get('Prolog', []) + if slurm_merged_dict.get('Prolog') is iterable + and slurm_merged_dict.get('Prolog') is not string + else [slurm_merged_dict.get('Prolog', '')]) | map('dirname') | unique | reject('equalto', '') | list }} when: slurm_merged_dict is defined @@ -93,23 +144,46 @@ - name: Extract effective plugin directory from slurm.conf ansible.builtin.set_fact: - slurm_plugin_dir_effective: "{{ (slurm_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) | first if slurm_merged_dict.get('PluginDir') is iterable and slurm_merged_dict.get('PluginDir') is not string else slurm_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }}" + slurm_plugin_dir_effective: >- + {{ (slurm_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) + | first if slurm_merged_dict.get('PluginDir') is iterable + and slurm_merged_dict.get('PluginDir') is not string + else slurm_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }} when: slurm_merged_dict is defined # ── slurmdbd.conf path params ──────────────────────────────────────── - name: Extract effective directories from slurmdbd.conf ansible.builtin.set_fact: - slurmdbd_log_dir_effective: "{{ (slurmdbd_merged_dict.get('LogFile', ['/var/log/slurm/slurmdbd.log']) | first if slurmdbd_merged_dict.get('LogFile') is iterable and 
slurmdbd_merged_dict.get('LogFile') is not string else slurmdbd_merged_dict.get('LogFile', '/var/log/slurm/slurmdbd.log')) | dirname }}" - slurmdbd_pid_dir_effective: "{{ (slurmdbd_merged_dict.get('PidFile', ['/var/run/slurmdbd.pid']) | first if slurmdbd_merged_dict.get('PidFile') is iterable and slurmdbd_merged_dict.get('PidFile') is not string else slurmdbd_merged_dict.get('PidFile', '/var/run/slurmdbd.pid')) | dirname }}" - slurmdbd_plugin_dir_effective: "{{ (slurmdbd_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) | first if slurmdbd_merged_dict.get('PluginDir') is iterable and slurmdbd_merged_dict.get('PluginDir') is not string else slurmdbd_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }}" + slurmdbd_log_dir_effective: >- + {{ (slurmdbd_merged_dict.get('LogFile', ['/var/log/slurm/slurmdbd.log']) + | first if slurmdbd_merged_dict.get('LogFile') is iterable + and slurmdbd_merged_dict.get('LogFile') is not string + else slurmdbd_merged_dict.get('LogFile', '/var/log/slurm/slurmdbd.log')) + | dirname }} + slurmdbd_pid_dir_effective: >- + {{ (slurmdbd_merged_dict.get('PidFile', ['/var/run/slurmdbd.pid']) + | first if slurmdbd_merged_dict.get('PidFile') is iterable + and slurmdbd_merged_dict.get('PidFile') is not string + else slurmdbd_merged_dict.get('PidFile', '/var/run/slurmdbd.pid')) + | dirname }} + slurmdbd_plugin_dir_effective: >- + {{ (slurmdbd_merged_dict.get('PluginDir', ['/usr/lib64/slurm']) + | first if slurmdbd_merged_dict.get('PluginDir') is iterable + and slurmdbd_merged_dict.get('PluginDir') is not string + else slurmdbd_merged_dict.get('PluginDir', '/usr/lib64/slurm')) }} when: slurmdbd_merged_dict is defined # ── cgroup.conf path params ────────────────────────────────────────── - name: Extract effective cgroup mountpoint from cgroup.conf ansible.builtin.set_fact: - slurm_cgroup_mountpoint_effective: "{{ ((cgroup_merged_dict.get('CgroupMountpoint', ['']) | first if cgroup_merged_dict.get('CgroupMountpoint') is iterable and 
cgroup_merged_dict.get('CgroupMountpoint') is not string else cgroup_merged_dict.get('CgroupMountpoint', '')) | default('', true)) }}" + slurm_cgroup_mountpoint_effective: >- + {{ ((cgroup_merged_dict.get('CgroupMountpoint', ['']) + | first if cgroup_merged_dict.get('CgroupMountpoint') is iterable + and cgroup_merged_dict.get('CgroupMountpoint') is not string + else cgroup_merged_dict.get('CgroupMountpoint', '')) + | default('', true)) }} when: cgroup_merged_dict is defined # ── Defaults when confs are not merged ──────────────────────────────── From cde00165a54d541fde3f9c545a1746b44f74ad24 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Tue, 10 Feb 2026 11:57:07 +0530 Subject: [PATCH 101/172] update kafka and victoria utility --- utils/external_kafka_connect_details.yml | 43 ++++++++++++++++--- utils/external_victoria_connect_details.yml | 43 ++++++++++++++++--- .../vars/main.yml | 8 ++++ .../vars/main.yml | 8 ++++ 4 files changed, 88 insertions(+), 14 deletions(-) diff --git a/utils/external_kafka_connect_details.yml b/utils/external_kafka_connect_details.yml index 1f2093e54e..a55c54ad3b 100644 --- a/utils/external_kafka_connect_details.yml +++ b/utils/external_kafka_connect_details.yml @@ -18,14 +18,43 @@ connection: local gather_facts: false tasks: - - name: Fail if service_kube_control_plane group is missing or empty + - name: Load Kafka utility role variables + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/roles/external_kafka_connect_details/vars/main.yml" + + - name: Include input directory + ansible.builtin.include_role: + name: include_input_dir + + - name: Set HA config path + ansible.builtin.set_fact: + k8s_ha_config_path: "{{ input_project_dir }}/high_availability_config.yml" + + - name: Load High Availability config + ansible.builtin.include_vars: + file: "{{ k8s_ha_config_path }}" + name: ha_config + failed_when: false + register: ha_config_load + + - name: Fail when High Availability config cannot be loaded ansible.builtin.fail: - msg: 
>- - Inventory must define a 'service_kube_control_plane' group with exactly one host. - Provide either the service kube control plane VIP or one of the service kube control plane node IPs. - Run with '-i ' and ensure exactly one host is in that group. - when: - - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) != 1 + msg: "{{ kafka_preflight_err_ha_config_missing }}" + when: ha_config_load.failed + + - name: Set service kube control plane VIP from HA config + ansible.builtin.set_fact: + kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" + + - name: Fail when service kube control plane VIP is not available + ansible.builtin.fail: + msg: "{{ kafka_preflight_err_ha_vip_missing }}" + when: (kube_vip | trim | length) == 0 + + - name: Create service_kube_control_plane group from VIP + ansible.builtin.add_host: + name: "{{ kube_vip }}" + groups: service_kube_control_plane - name: Fetch external Kafka connection details hosts: service_kube_control_plane diff --git a/utils/external_victoria_connect_details.yml b/utils/external_victoria_connect_details.yml index f955bbbc78..23e388baf6 100644 --- a/utils/external_victoria_connect_details.yml +++ b/utils/external_victoria_connect_details.yml @@ -18,14 +18,43 @@ connection: local gather_facts: false tasks: - - name: Fail if service_kube_control_plane group is missing or empty + - name: Load Victoria utility role variables + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/roles/external_victoria_connect_details/vars/main.yml" + + - name: Include input directory + ansible.builtin.include_role: + name: include_input_dir + + - name: Set HA config path + ansible.builtin.set_fact: + k8s_ha_config_path: "{{ input_project_dir }}/high_availability_config.yml" + + - name: Load High Availability config + ansible.builtin.include_vars: + file: "{{ k8s_ha_config_path }}" + name: ha_config + failed_when: false + register: ha_config_load + + - 
name: Fail when High Availability config cannot be loaded ansible.builtin.fail: - msg: >- - Inventory must define a 'service_kube_control_plane' group with exactly one host. - Provide either the service kube control plane VIP or one of the service kube control plane node IPs. - Run with '-i ' and ensure exactly one host is in that group. - when: - - groups['service_kube_control_plane'] is not defined or (groups['service_kube_control_plane'] | length) != 1 + msg: "{{ victoria_preflight_err_ha_config_missing }}" + when: ha_config_load.failed + + - name: Set service kube control plane VIP from HA config + ansible.builtin.set_fact: + kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" + + - name: Fail when service kube control plane VIP is not available + ansible.builtin.fail: + msg: "{{ victoria_preflight_err_ha_vip_missing }}" + when: (kube_vip | trim | length) == 0 + + - name: Create service_kube_control_plane group from VIP + ansible.builtin.add_host: + name: "{{ kube_vip }}" + groups: service_kube_control_plane - name: Fetch external Victoria connection details hosts: service_kube_control_plane diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml index ff257328a8..7a7d831275 100644 --- a/utils/roles/external_kafka_connect_details/vars/main.yml +++ b/utils/roles/external_kafka_connect_details/vars/main.yml @@ -29,6 +29,14 @@ kafka_err_external_ip_missing: >- Failed to fetch Kafka LoadBalancer external IP. Ensure service '{{ kafka_lb_service_name }}' exists in namespace '{{ kafka_namespace }}' and has an external IP assigned. +kafka_preflight_err_ha_config_missing: >- + Failed to load High Availability config file: {{ k8s_ha_config_path }}. + Provide a valid HA config so the service Kubernetes VIP can be used. + +kafka_preflight_err_ha_vip_missing: >- + Failed to determine the service Kubernetes control plane VIP from High Availability config. 
+ Ensure service_k8s_cluster_ha[0].virtual_ip_address is set in: {{ k8s_ha_config_path }}. + kafka_ome_ui_navigation_line1: "Configuration -> Remote Connectivity" kafka_ome_ui_enable_label: "Enable Kafka Connectivity" kafka_ome_auth_mode_value: "SSL" diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml index c033adaa1c..c2de781fbf 100644 --- a/utils/roles/external_victoria_connect_details/vars/main.yml +++ b/utils/roles/external_victoria_connect_details/vars/main.yml @@ -30,6 +30,14 @@ victoria_err_lb_missing: >- Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect' exist in namespace '{{ victoria_namespace }}' and have external IPs assigned. +victoria_preflight_err_ha_config_missing: >- + Failed to load High Availability config file: {{ k8s_ha_config_path }}. + Provide a valid HA config so the service Kubernetes VIP can be used. + +victoria_preflight_err_ha_vip_missing: >- + Failed to determine the service Kubernetes control plane VIP from High Availability config. + Ensure service_k8s_cluster_ha[0].virtual_ip_address is set in: {{ k8s_ha_config_path }}. 
+ victoria_sfm_ui_navigation: "Observability -> Settings -> Prometheus Remote Write" victoria_sfm_remote_write_target_name: "victoria" victoria_sfm_remote_write_message_version: "v1" From 160e9ec91564ec4f6e6ca2d1cffe583d7484e170 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 10 Feb 2026 11:58:08 +0530 Subject: [PATCH 102/172] Added upgrade logic for local_repo_config.yml, omnia_config.yml, provision_config.yml, storage_config.yml, security_config.yml and telemetry_config.yml --- .../import_input_parameters/tasks/main.yml | 15 ++ .../tasks/transform_local_repo_config.yml | 121 +++++++++ .../tasks/transform_omnia_config.yml | 103 ++++++++ .../tasks/transform_provision_config.yml | 100 ++++++++ .../tasks/transform_storage_config.yml | 130 ++++++++++ .../tasks/transform_telemetry_config.yml | 148 +++++++++++ .../templates/local_repo_config.j2 | 199 ++++++++++++++ .../templates/omnia_config.j2 | 160 ++++++++++++ .../templates/provision_config.j2 | 40 +++ .../templates/storage_config.j2 | 95 +++++++ .../templates/telemetry_config.j2 | 242 ++++++++++++++++++ .../import_input_parameters/vars/main.yml | 78 +++++- 12 files changed, 1430 insertions(+), 1 deletion(-) create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml create mode 100644 upgrade/roles/import_input_parameters/templates/local_repo_config.j2 create mode 100644 upgrade/roles/import_input_parameters/templates/omnia_config.j2 create mode 100644 upgrade/roles/import_input_parameters/templates/provision_config.j2 create mode 100644 upgrade/roles/import_input_parameters/templates/storage_config.j2 
create mode 100644 upgrade/roles/import_input_parameters/templates/telemetry_config.j2 diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml index 7687f852bb..ff77cf2c0e 100644 --- a/upgrade/roles/import_input_parameters/tasks/main.yml +++ b/upgrade/roles/import_input_parameters/tasks/main.yml @@ -22,5 +22,20 @@ - name: Transform high_availability_config.yml from Omnia 2.0 to 2.1 ansible.builtin.include_tasks: transform_high_availability_config.yml +- name: Transform local_repo_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_local_repo_config.yml + +- name: Transform provision_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_provision_config.yml + +- name: Transform storage_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_storage_config.yml + +- name: Transform omnia_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_omnia_config.yml + +- name: Transform telemetry_config.yml from Omnia 2.0 to 2.1 + ansible.builtin.include_tasks: transform_telemetry_config.yml + - name: Restore input files from backup ansible.builtin.include_tasks: restore_input_files.yml diff --git a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml new file mode 100644 index 0000000000..20c95798b1 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml @@ -0,0 +1,121 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if backup local_repo_config.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/local_repo_config.yml" + register: backup_local_repo_config_stat + +- name: Fail if backup local_repo_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_backup_local_repo_config_missing }}" + when: not backup_local_repo_config_stat.stat.exists + +- name: Check if local_repo_config.yml exists + ansible.builtin.stat: + path: "{{ input_project_dir }}/local_repo_config.yml" + register: local_repo_config_stat + +- name: Fail if local_repo_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_local_repo_config_missing }}" + when: not local_repo_config_stat.stat.exists + +- name: Read backup local_repo_config.yml (source of truth) + ansible.builtin.slurp: + src: "{{ backup_location }}/local_repo_config.yml" + register: backup_local_repo_config_slurp + +- name: Parse backup local_repo_config.yml + ansible.builtin.set_fact: + backup_local_repo_config: "{{ backup_local_repo_config_slurp.content | b64decode | from_yaml }}" + +- name: Normalize user_registry + ansible.builtin.set_fact: + local_repo_user_registry: >- + {{ + ( + backup_local_repo_config.user_registry + if (backup_local_repo_config.user_registry is defined) + else + ( + ( + (backup_local_repo_config.omnia_registry | default([])) + | select('string') + | map('regex_replace', '^(.*)$', '{"host": "\\1", "cert_path": "", "key_path": ""}') + | map('from_json') + | list + ) + ) + ) + }} + +- name: Normalize repo url keys to 2.1 schema + ansible.builtin.set_fact: + 
local_repo_user_repo_url_x86_64: "{{ backup_local_repo_config.user_repo_url_x86_64 | default(backup_local_repo_config.user_repo_url | default([])) }}" + local_repo_user_repo_url_aarch64: "{{ backup_local_repo_config.user_repo_url_aarch64 | default([]) }}" + local_repo_rhel_os_url_x86_64: "{{ backup_local_repo_config.rhel_os_url_x86_64 | default(backup_local_repo_config.rhel_os_url | default([])) }}" + local_repo_rhel_os_url_aarch64: "{{ backup_local_repo_config.rhel_os_url_aarch64 | default([]) }}" + local_repo_omnia_repo_url_rhel_x86_64: "{{ backup_local_repo_config.omnia_repo_url_rhel_x86_64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}" + local_repo_omnia_repo_url_rhel_aarch64: "{{ backup_local_repo_config.omnia_repo_url_rhel_aarch64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}" + local_repo_additional_repos_x86_64: "{{ backup_local_repo_config.additional_repos_x86_64 | default(backup_local_repo_config.additional_repos | default([])) }}" + local_repo_additional_repos_aarch64: "{{ backup_local_repo_config.additional_repos_aarch64 | default([]) }}" + +- name: Fail if omnia_repo_url_rhel_x86_64 is missing + ansible.builtin.fail: + msg: "{{ msg_omnia_repo_url_rhel_x86_64_missing }}" + when: (local_repo_omnia_repo_url_rhel_x86_64 | default([]) | length) == 0 + +- name: Fail if omnia_repo_url_rhel_aarch64 is missing + ansible.builtin.fail: + msg: "{{ msg_omnia_repo_url_rhel_aarch64_missing }}" + when: (local_repo_omnia_repo_url_rhel_aarch64 | default([]) | length) == 0 + +- name: Write local_repo_config.yml in Omnia 2.1 format + ansible.builtin.template: + src: local_repo_config.j2 + dest: "{{ input_project_dir }}/local_repo_config.yml" + mode: "{{ default_file_mode }}" + vars: + local_repo_user_registry: "{{ local_repo_user_registry }}" + local_repo_user_repo_url_x86_64: "{{ local_repo_user_repo_url_x86_64 }}" + local_repo_user_repo_url_aarch64: "{{ local_repo_user_repo_url_aarch64 }}" + 
local_repo_rhel_os_url_x86_64: "{{ local_repo_rhel_os_url_x86_64 }}" + local_repo_rhel_os_url_aarch64: "{{ local_repo_rhel_os_url_aarch64 }}" + local_repo_omnia_repo_url_rhel_x86_64: "{{ local_repo_omnia_repo_url_rhel_x86_64 }}" + local_repo_omnia_repo_url_rhel_aarch64: "{{ local_repo_omnia_repo_url_rhel_aarch64 }}" + local_repo_additional_repos_x86_64: "{{ local_repo_additional_repos_x86_64 }}" + local_repo_additional_repos_aarch64: "{{ local_repo_additional_repos_aarch64 }}" + +- name: Validate YAML syntax of transformed local_repo_config.yml + ansible.builtin.command: + cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/local_repo_config.yml','r'))" + register: local_repo_yaml_validation + changed_when: false + failed_when: false # rc is evaluated by the next task so its custom message is reported + +- name: Fail if YAML validation fails + ansible.builtin.fail: + msg: "{{ msg_yaml_validation_failed }}" + when: local_repo_yaml_validation.rc != 0 + +- name: Display backup path (no-op when skipped) + ansible.builtin.debug: + msg: "{{ msg_using_backup_local_repo_config }}" + when: true + +- name: Display transformation summary + ansible.builtin.debug: + msg: "{{ msg_local_repo_config_transform_summary }}" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml new file mode 100644 index 0000000000..ab62c3ff28 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml @@ -0,0 +1,103 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if backup omnia_config.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/omnia_config.yml" + register: backup_omnia_config_stat + +- name: Fail if backup omnia_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_backup_omnia_config_missing }}" + when: not backup_omnia_config_stat.stat.exists + +- name: Check if omnia_config.yml exists + ansible.builtin.stat: + path: "{{ input_project_dir }}/omnia_config.yml" + register: omnia_config_stat + +- name: Fail if omnia_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_omnia_config_missing }}" + when: not omnia_config_stat.stat.exists + +- name: Read backup omnia_config.yml (source of truth) + ansible.builtin.slurp: + src: "{{ backup_location }}/omnia_config.yml" + register: backup_omnia_config_slurp + +- name: Parse backup omnia_config.yml + ansible.builtin.set_fact: + backup_omnia_config: "{{ backup_omnia_config_slurp.content | b64decode | from_yaml }}" + +- name: Normalize omnia_config.yml values + ansible.builtin.set_fact: + omnia_slurm_cluster_raw: "{{ backup_omnia_config.slurm_cluster | default([]) }}" + omnia_service_k8s_cluster_raw: "{{ backup_omnia_config.service_k8s_cluster | default([]) }}" + +- name: Ensure slurm_cluster and service_k8s_cluster are lists + ansible.builtin.set_fact: + omnia_slurm_cluster: >- + {{ + [omnia_slurm_cluster_raw] + if (omnia_slurm_cluster_raw is mapping) + else omnia_slurm_cluster_raw + }} + omnia_service_k8s_cluster: >- + {{ + [omnia_service_k8s_cluster_raw] + if (omnia_service_k8s_cluster_raw is mapping) + else omnia_service_k8s_cluster_raw + }} + +- name: Fail if slurm_cluster is missing + ansible.builtin.fail: + msg: "{{ msg_slurm_cluster_missing }}" + when: (omnia_slurm_cluster | default([]) | length) == 0 + +- name: Fail if service_k8s_cluster is missing + ansible.builtin.fail: + msg: "{{ 
msg_service_k8s_cluster_missing }}" + when: (omnia_service_k8s_cluster | default([]) | length) == 0 + +- name: Write omnia_config.yml in Omnia 2.1 format + ansible.builtin.template: + src: omnia_config.j2 + dest: "{{ input_project_dir }}/omnia_config.yml" + mode: "{{ default_file_mode }}" + vars: + omnia_slurm_cluster: "{{ omnia_slurm_cluster }}" + omnia_service_k8s_cluster: "{{ omnia_service_k8s_cluster }}" + +- name: Validate YAML syntax of transformed omnia_config.yml + ansible.builtin.command: + cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/omnia_config.yml','r'))" + register: omnia_yaml_validation + changed_when: false + failed_when: false # rc is evaluated by the next task so its custom message is reported + +- name: Fail if YAML validation fails + ansible.builtin.fail: + msg: "{{ msg_yaml_validation_failed }}" + when: omnia_yaml_validation.rc != 0 + +- name: Display backup path (no-op when skipped) + ansible.builtin.debug: + msg: "{{ msg_using_backup_omnia_config }}" + when: true + +- name: Display transformation summary + ansible.builtin.debug: + msg: "{{ msg_omnia_config_transform_summary }}" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml new file mode 100644 index 0000000000..71e9ee0dc2 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml @@ -0,0 +1,100 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if backup provision_config.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/provision_config.yml" + register: backup_provision_config_stat + +- name: Fail if backup provision_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_backup_provision_config_missing }}" + when: not backup_provision_config_stat.stat.exists + +- name: Check if provision_config.yml exists + ansible.builtin.stat: + path: "{{ input_project_dir }}/provision_config.yml" + register: provision_config_stat + +- name: Fail if provision_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_provision_config_missing }}" + when: not provision_config_stat.stat.exists + +- name: Read backup provision_config.yml (source of truth) + ansible.builtin.slurp: + src: "{{ backup_location }}/provision_config.yml" + register: backup_provision_config_slurp + +- name: Parse backup provision_config.yml + ansible.builtin.set_fact: + backup_provision_config: "{{ backup_provision_config_slurp.content | b64decode | from_yaml }}" + +- name: Normalize provision_config.yml values + ansible.builtin.set_fact: + provision_pxe_mapping_file_path_raw: >- + {{ + backup_provision_config.pxe_mapping_file_path + | default('/opt/omnia/input/project_default/pxe_mapping_file.csv') + }} + provision_language: "{{ backup_provision_config.language | default('en_US.UTF-8') }}" + provision_default_lease_time: "{{ backup_provision_config.default_lease_time | default('86400') }}" + +- name: Rewrite legacy pxe_mapping_file_path to current project input directory + ansible.builtin.set_fact: + provision_pxe_mapping_file_path: >- + {{ + ( + provision_pxe_mapping_file_path_raw + | string + | regex_replace('^/opt/omnia/input/project_default/', input_project_dir ~ '/') + ) + }} + +- name: Fail if pxe_mapping_file_path is missing + ansible.builtin.fail: + msg: "{{ 
msg_pxe_mapping_file_path_missing }}" + when: (provision_pxe_mapping_file_path | string | trim) == '' + +- name: Write provision_config.yml in Omnia 2.1 format + ansible.builtin.template: + src: provision_config.j2 + dest: "{{ input_project_dir }}/provision_config.yml" + mode: "{{ default_file_mode }}" + vars: + provision_pxe_mapping_file_path: "{{ provision_pxe_mapping_file_path }}" + provision_language: "{{ provision_language }}" + provision_default_lease_time: "{{ provision_default_lease_time }}" + +- name: Validate YAML syntax of transformed provision_config.yml + ansible.builtin.command: + cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/provision_config.yml','r'))" + register: provision_yaml_validation + changed_when: false + failed_when: false # rc is evaluated by the next task so its custom message is reported + +- name: Fail if YAML validation fails + ansible.builtin.fail: + msg: "{{ msg_yaml_validation_failed }}" + when: provision_yaml_validation.rc != 0 + +- name: Display backup path (no-op when skipped) + ansible.builtin.debug: + msg: "{{ msg_using_backup_provision_config }}" + when: true + +- name: Display transformation summary + ansible.builtin.debug: + msg: "{{ msg_provision_config_transform_summary }}" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml new file mode 100644 index 0000000000..72b82aa7f8 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml @@ -0,0 +1,130 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if backup storage_config.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/storage_config.yml" + register: backup_storage_config_stat + +- name: Fail if backup storage_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_backup_storage_config_missing }}" + when: not backup_storage_config_stat.stat.exists + +- name: Check if storage_config.yml exists + ansible.builtin.stat: + path: "{{ input_project_dir }}/storage_config.yml" + register: storage_config_stat + +- name: Fail if storage_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_storage_config_missing }}" + when: not storage_config_stat.stat.exists + +- name: Read backup storage_config.yml (source of truth) + ansible.builtin.slurp: + src: "{{ backup_location }}/storage_config.yml" + register: backup_storage_config_slurp + +- name: Parse backup storage_config.yml + ansible.builtin.set_fact: + backup_storage_config: "{{ backup_storage_config_slurp.content | b64decode | from_yaml }}" + +- name: Normalize storage_config.yml values + ansible.builtin.set_fact: + storage_nfs_client_params: "{{ backup_storage_config.nfs_client_params | default([]) }}" + storage_powervault_config: "{{ backup_storage_config.powervault_config | default(none) }}" + storage_has_powervault: "{{ backup_storage_config.powervault_config is defined }}" + +- name: Fail if powervault_config is present but missing mandatory keys + ansible.builtin.fail: + msg: "{{ msg_powervault_missing_keys }}" + when: + - storage_has_powervault + - storage_powervault_config.ip is not defined 
or (storage_powervault_config.ip | default([]) | length) == 0 + or storage_powervault_config.isci_initiators is not defined + or (storage_powervault_config.isci_initiators | string | trim) == '' + or storage_powervault_config.volume_id is not defined + or (storage_powervault_config.volume_id | string | trim) == '' + +- name: Fail if nfs_client_params is missing + ansible.builtin.fail: + msg: "{{ msg_nfs_client_params_missing }}" + when: (storage_nfs_client_params | default([]) | length) == 0 + +- name: Fail if any NFS client entry is missing required keys + ansible.builtin.fail: + msg: "{{ msg_nfs_client_param_entry_missing_keys }}" + when: >- + ( + ( + storage_nfs_client_params + | selectattr('server_ip', 'undefined') + | list + | length + ) > 0 + or + ( + storage_nfs_client_params + | selectattr('server_share_path', 'undefined') + | list + | length + ) > 0 + or + ( + storage_nfs_client_params + | selectattr('client_share_path', 'undefined') + | list + | length + ) > 0 + or + ( + storage_nfs_client_params + | selectattr('client_mount_options', 'undefined') + | list + | length + ) > 0 + ) + +- name: Write storage_config.yml in Omnia 2.1 format + ansible.builtin.template: + src: storage_config.j2 + dest: "{{ input_project_dir }}/storage_config.yml" + mode: "{{ default_file_mode }}" + vars: + storage_nfs_client_params: "{{ storage_nfs_client_params }}" + storage_powervault_config: "{{ storage_powervault_config }}" + storage_has_powervault: "{{ storage_has_powervault }}" + +- name: Validate YAML syntax of transformed storage_config.yml + ansible.builtin.command: + cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/storage_config.yml','r'))" + register: storage_yaml_validation + changed_when: false + failed_when: false # rc is evaluated by the next task so its custom message is reported + +- name: Fail if YAML validation fails + ansible.builtin.fail: + msg: "{{ msg_yaml_validation_failed }}" + when: 
storage_yaml_validation.rc != 0 + +- name: Display backup path (no-op when skipped) + ansible.builtin.debug: + msg: "{{
msg_using_backup_storage_config }}" + when: true + +- name: Display transformation summary + ansible.builtin.debug: + msg: "{{ msg_storage_config_transform_summary }}" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml new file mode 100644 index 0000000000..1aa095e66b --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml @@ -0,0 +1,148 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check if backup telemetry_config.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/telemetry_config.yml" + register: backup_telemetry_config_stat + +- name: Fail if backup telemetry_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_backup_telemetry_config_missing }}" + when: not backup_telemetry_config_stat.stat.exists + +- name: Check if telemetry_config.yml exists + ansible.builtin.stat: + path: "{{ input_project_dir }}/telemetry_config.yml" + register: telemetry_config_stat + +- name: Fail if telemetry_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_telemetry_config_missing }}" + when: not telemetry_config_stat.stat.exists + +- name: Read backup telemetry_config.yml (source of truth) + ansible.builtin.slurp: + src: "{{ backup_location }}/telemetry_config.yml" + register: backup_telemetry_config_slurp + +- name: Parse backup telemetry_config.yml + ansible.builtin.set_fact: + backup_telemetry_config: "{{ backup_telemetry_config_slurp.content | b64decode | from_yaml }}" + +- name: Normalize nested backup telemetry sections + ansible.builtin.set_fact: + backup_telemetry_victoria_config: "{{ backup_telemetry_config.victoria_configurations | default({}) }}" + backup_telemetry_kafka_config: "{{ backup_telemetry_config.kafka_configurations | default({}) }}" + +- name: Normalize telemetry_config.yml values + ansible.builtin.set_fact: + telemetry_idrac_telemetry_support: "{{ backup_telemetry_config.idrac_telemetry_support | default(true) }}" + telemetry_idrac_telemetry_collection_type: >- + {{ + backup_telemetry_config.idrac_telemetry_collection_type + | default('victoria,kafka') + }} + telemetry_victoria_deployment_mode: "{{ backup_telemetry_victoria_config.deployment_mode | default('cluster') }}" + telemetry_victoria_persistence_size: "{{ backup_telemetry_victoria_config.persistence_size | default('8Gi') }}" + telemetry_victoria_retention_period: "{{ backup_telemetry_victoria_config.retention_period | 
default(168) }}" + telemetry_kafka_persistence_size: "{{ backup_telemetry_kafka_config.persistence_size | default('8Gi') }}" + telemetry_kafka_log_retention_hours: "{{ backup_telemetry_kafka_config.log_retention_hours | default(168) }}" + telemetry_kafka_log_retention_bytes: "{{ backup_telemetry_kafka_config.log_retention_bytes | default(-1) }}" + telemetry_kafka_log_segment_bytes: "{{ backup_telemetry_kafka_config.log_segment_bytes | default(1073741824) }}" + telemetry_kafka_topic_partitions: >- + {{ + backup_telemetry_kafka_config.topic_partitions + | default([ + {'name': 'idrac', 'partitions': 1}, + {'name': 'ldms', 'partitions': 2} + ]) + }} + telemetry_ldms_agg_port: "{{ backup_telemetry_config.ldms_agg_port | default(6001) }}" + telemetry_ldms_store_port: "{{ backup_telemetry_config.ldms_store_port | default(6001) }}" + telemetry_ldms_sampler_port: "{{ backup_telemetry_config.ldms_sampler_port | default(10001) }}" + telemetry_ldms_sampler_configurations: >- + {{ + backup_telemetry_config.ldms_sampler_configurations + | default([ + { + 'plugin_name': 'meminfo', + 'config_parameters': '', + 'activation_parameters': 'interval=1000000' + }, + { + 'plugin_name': 'procstat2', + 'config_parameters': '', + 'activation_parameters': 'interval=1000000' + }, + { + 'plugin_name': 'vmstat', + 'config_parameters': '', + 'activation_parameters': 'interval=1000000' + }, + { + 'plugin_name': 'loadavg', + 'config_parameters': '', + 'activation_parameters': 'interval=1000000' + }, + { + 'plugin_name': 'procnetdev2', + 'config_parameters': '', + 'activation_parameters': 'interval=1000000 offset=0' + } + ]) + }} + +- name: Write telemetry_config.yml in Omnia 2.1 format + ansible.builtin.template: + src: telemetry_config.j2 + dest: "{{ input_project_dir }}/telemetry_config.yml" + mode: "{{ default_file_mode }}" + vars: + telemetry_idrac_telemetry_support: "{{ telemetry_idrac_telemetry_support }}" + telemetry_idrac_telemetry_collection_type: "{{ 
telemetry_idrac_telemetry_collection_type }}" + telemetry_victoria_deployment_mode: "{{ telemetry_victoria_deployment_mode }}" + telemetry_victoria_persistence_size: "{{ telemetry_victoria_persistence_size }}" + telemetry_victoria_retention_period: "{{ telemetry_victoria_retention_period }}" + telemetry_kafka_persistence_size: "{{ telemetry_kafka_persistence_size }}" + telemetry_kafka_log_retention_hours: "{{ telemetry_kafka_log_retention_hours }}" + telemetry_kafka_log_retention_bytes: "{{ telemetry_kafka_log_retention_bytes }}" + telemetry_kafka_log_segment_bytes: "{{ telemetry_kafka_log_segment_bytes }}" + telemetry_kafka_topic_partitions: "{{ telemetry_kafka_topic_partitions }}" + telemetry_ldms_agg_port: "{{ telemetry_ldms_agg_port }}" + telemetry_ldms_store_port: "{{ telemetry_ldms_store_port }}" + telemetry_ldms_sampler_port: "{{ telemetry_ldms_sampler_port }}" + telemetry_ldms_sampler_configurations: "{{ telemetry_ldms_sampler_configurations }}" + +- name: Validate YAML syntax of transformed telemetry_config.yml + ansible.builtin.command: + cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/telemetry_config.yml','r'))" + register: telemetry_yaml_validation + changed_when: false + failed_when: false # rc is evaluated by the next task so its custom message is reported + +- name: Fail if YAML validation fails + ansible.builtin.fail: + msg: "{{ msg_yaml_validation_failed }}" + when: telemetry_yaml_validation.rc != 0 + +- name: Display backup path (no-op when skipped) + ansible.builtin.debug: + msg: "{{ msg_using_backup_telemetry_config }}" + when: true + +- name: Display transformation summary + ansible.builtin.debug: + msg: "{{ msg_telemetry_config_transform_summary }}" diff --git a/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 b/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 new file mode 100644 index 0000000000..dbe38d70ad --- /dev/null +++ 
b/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 @@ -0,0 +1,199 @@ +# Copyright 2026 Dell Inc.
or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + +# ================================ +# VARIABLE DETAILS +# ================================ +# 1. user_registry +#-------------------------- +# Configuration for user registry to configure additional images in Pulp +# Fields: +# host : Registry IP and port in format "IP:port" +# cert_path : Path to SSL certificate file (.crt) - Required only if host is using HTTPS +# key_path : Path to SSL private key file (.key) - Required only if host is using HTTPS +# Notes: +# - If host is HTTPS, cert_path and key_path are required +# - If host is HTTP, cert_path and key_path can be left empty +# - cert_path should point to .crt files only +# - key_path should point to .key files only +# - cert and key paths are accessed from within the omnia_core container +# 2. user_repo_url_x86_64 +#-------------------------- +# Optional list of user-defined repository URLs for x86_64 architecture. +# Each entry can include: url, gpgkey, sslcacert, sslclientkey, sslclientcert, name, policy. +# Used for custom cluster packages like _slurm_custom. 
+# Fields: +# url : Base URL of the repository +# gpgkey : GPG key URL (leave empty to disable gpgcheck; Omnia will trust this repo and user is responsible for its security) +# name : Name of the repository +# sslcacert : Path to SSL CA certificate (if using SSL) +# sslclientkey: Path to SSL client key (if using SSL) +# sslclientcert: Path to SSL client certificate (if using SSL) +# policy : Repository policy (always, partial) +# Notes: +# - Do not use Jinja variables in this configuration. +# - Omit SSL fields entirely if SSL is not in use. +# - It is a mandatory field in case of slurm_custom with name as '_slurm_custom' +# +# 3. user_repo_url_aarch64 +#--------------------------- +# Same as above but for aarch64 architecture. +# +# 4. rhel_os_url_x86_64 +#----------------------------- +# Mandatory when RHEL subscription is not registered. +# Contains repository URLs for codeready-builder, baseos, and appstream for x86_64. +# Fields: +# url : Base URL of the repository +# gpgkey : GPG key URL (leave empty to disable gpgcheck; Omnia will trust this repo and user is responsible for its security) +# sslcacert : Path to SSL CA certificate (if using SSL) +# sslclientkey: Path to SSL client key (if using SSL) +# sslclientcert: Path to SSL client certificate (if using SSL) +# policy : Repository policy; allowed values are (always, partial). If not mentioned, the policy from software_config.json is used +# name : Name of the repository [ Allowed repo names _codeready-builder, _appstream, _baseos ] +# Notes: +# - Do not use Jinja variables in this configuration. +# - Omit SSL fields entirely if SSL is not in use. +# - If RHEL subscription is not registered, all 3 repository entries [ _codeready-builder, _appstream, _baseos ] +# are mandatory. +# +# 5. rhel_os_url_aarch64 +#---------------------------- +# Same as above but for aarch64 architecture. +# +#### ADVANCED CONFIGURATIONS FOR LOCAL REPO ### +# 6.
omnia_repo_url_rhel_x86_64 +#------------------------------- +# Mandatory repository URLs for downloading RPMS for Omnia features on RHEL x86_64. +# Each entry includes url, gpgkey, and name. +# +# This variable defines all the repo urls from where rpms will be downloaded for omnia features when cluster_os_type is rhel and arch x86_64 +# Making incorrect changes to this variable can cause omnia failure. Please edit cautiously. +# Fields: +# url : Base URL of the repository. +# gpgkey : URL of the GPG key for the repository. +# If left empty, gpgcheck=0 for that repository. +# name : A unique identifier for the repository or registry. +# +# 7. omnia_repo_url_rhel_aarch64 +#-------------------------------- +# Same as above but for RHEL aarch64. +# +# 8. additional_repos_x86_64 +#---------------------------- +# Optional list of additional repository URLs for x86_64 architecture. +# These repos are aggregated into a single Pulp repository, allowing dynamic +# addition/removal without changing compute node configurations. +# Fields: +# url : Base URL of the repository (required) +# gpgkey : GPG key URL (required, can be empty - disables gpgcheck) +# name : Unique name for the repository (required) +# sslcacert : Path to SSL CA certificate (optional) +# sslclientkey : Path to SSL client key (optional) +# sslclientcert : Path to SSL client certificate (optional) +# Notes: +# - All repos are synced into a single aggregated Pulp repository +# - Compute nodes are configured once with a fixed URL that never changes +# - Policy is controlled globally via repo_config in software_config.json (per-entry policy not supported) +# - Name must be unique within this list and must not conflict with names in other repo keys +# - Packages from these repos can only be used via additional_packages.json +# +# 9. additional_repos_aarch64 +#----------------------------- +# Same as above but for aarch64 architecture. 
+ +# ================================ +# VARIABLES +# ================================ +# user_registry: +# - { host: "172.16.107.254:4000", cert_path: "/opt/omnia/domain.crt", key_path: "/opt/omnia/domain.key" } +user_registry: +{% set _user_registry = local_repo_user_registry | default([]) %} +{% if (_user_registry | length) > 0 %} +{% for _reg in _user_registry %} + - { host: {{ (_reg.host | default('')) | to_json }}, cert_path: {{ (_reg.cert_path | default('')) | to_json }}, key_path: {{ (_reg.key_path | default('')) | to_json }} } +{% endfor %} +{% endif %} +# user_repo_url_x86_64: +# - { url: "", gpgkey: "", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_slurm_custom" } +user_repo_url_x86_64: +{% set _user_repo_url_x86_64 = local_repo_user_repo_url_x86_64 | default([]) %} +{% if (_user_repo_url_x86_64 | length) > 0 %} +{% for _repo in _user_repo_url_x86_64 %} + - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }} } +{% endfor %} +{% endif %} +user_repo_url_aarch64: +{% set _user_repo_url_aarch64 = local_repo_user_repo_url_aarch64 | default([]) %} +{% if (_user_repo_url_aarch64 | length) > 0 %} +{% for _repo in _user_repo_url_aarch64 %} + - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }} } +{% endfor %} +{% endif %} +#Example: +# rhel_os_url_x86_64: +# - { url: "http://crb.com/CRB/x86_64/os/", gpgkey: "http://crb.com/CRB/x86_64/os/RPM-GPG-KEY", sslcacert: 
"", sslclientkey: "", sslclientcert: "", name: "x86_64_codeready-builder"} +# - { url: "http://BaseOS.com/BaseOS/x86_64/os/", gpgkey: "http://BaseOS.com/BaseOS/x86_64/os/RPM-GPG-KEY", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_baseos"} +# - { url: "http://AppStream.com/AppStream/x86_64/os/", gpgkey: "http://AppStream.com/AppStream/x86_64/os/RPM-GPG-KEY", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_appstream" } +rhel_os_url_x86_64: +{% set _rhel_os_url_x86_64 = local_repo_rhel_os_url_x86_64 | default([]) %} +{% if (_rhel_os_url_x86_64 | length) > 0 %} +{% for _repo in _rhel_os_url_x86_64 %} + - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }}, policy: {{ (_repo.policy | default('')) | to_json }} } +{% endfor %} +{% endif %} +rhel_os_url_aarch64: +{% set _rhel_os_url_aarch64 = local_repo_rhel_os_url_aarch64 | default([]) %} +{% if (_rhel_os_url_aarch64 | length) > 0 %} +{% for _repo in _rhel_os_url_aarch64 %} + - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }}, policy: {{ (_repo.policy | default('')) | to_json }} } +{% endfor %} +{% endif %} +# Making incorrect changes to this variable can cause omnia failure. Please edit cautiously. 
+omnia_repo_url_rhel_x86_64: +{% set _omnia_repo_url_rhel_x86_64 = local_repo_omnia_repo_url_rhel_x86_64 | default([]) %} +{% if (_omnia_repo_url_rhel_x86_64 | length) > 0 %} +{% for _repo in _omnia_repo_url_rhel_x86_64 %} + - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }} } +{% endfor %} +{% endif %} +omnia_repo_url_rhel_aarch64: +{% set _omnia_repo_url_rhel_aarch64 = local_repo_omnia_repo_url_rhel_aarch64 | default([]) %} +{% if (_omnia_repo_url_rhel_aarch64 | length) > 0 %} +{% for _repo in _omnia_repo_url_rhel_aarch64 %} + - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }} } +{% endfor %} +{% endif %} +# Example: +# additional_repos_x86_64: +# - { url: "https://rpm.grafana.com/", gpgkey: "", name: "grafana" } +# - { url: "https://repo.example.com/x86_64/", gpgkey: "", name: "custom-repo", sslcacert: "/path/ca.crt", sslclientkey: "/path/client.key", sslclientcert: "/path/client.crt" } +additional_repos_x86_64: +{% set _additional_repos_x86_64 = local_repo_additional_repos_x86_64 | default([]) %} +{% if (_additional_repos_x86_64 | length) > 0 %} +{% for _repo in _additional_repos_x86_64 %} + - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }} } +{% endfor %} +{% endif %} +additional_repos_aarch64: +{% set _additional_repos_aarch64 = local_repo_additional_repos_aarch64 | default([]) %} +{% if (_additional_repos_aarch64 | length) > 0 %} +{% for _repo in _additional_repos_aarch64 %} + - { url: {{ (_repo.url | default('')) | to_json }}, gpgkey: {{ (_repo.gpgkey | 
default('')) | to_json }}, name: {{ (_repo.name | default('')) | to_json }}, sslcacert: {{ (_repo.sslcacert | default('')) | to_json }}, sslclientkey: {{ (_repo.sslclientkey | default('')) | to_json }}, sslclientcert: {{ (_repo.sslclientcert | default('')) | to_json }} } +{% endfor %} +{% endif %} diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 new file mode 100644 index 0000000000..aec7a05ab7 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 @@ -0,0 +1,160 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + +# -----------------------------SLURM------------------------------------------------ +# slurm_cluster +# List of slurm clusters +# cluster_name is a required field + +# nfs_storage_name +# Storage name corresponding to the NFS share to be used by slurm cluster +# This should match exactly with an entry in storage_config.yml + +# config_sources +# defines how the Slurm configuration files are provided to the cluster.
+# <conf_name>: +# <mapping> or <file_path> +# Supply the configuration values directly as a key–value map +# Supply the absolute path to a custom configuration file +# The conf files supported by slurm are +# slurm +# cgroup +# slurmdbd +# gres +# These files will be written into the slurm_config directory with .conf suffix + +slurm_cluster: +{% set _slurm_cluster = omnia_slurm_cluster | default([]) %} +{% if (_slurm_cluster | length) > 0 %} +{% for _cluster in _slurm_cluster %} + - cluster_name: {{ _cluster.cluster_name | default('') }} + nfs_storage_name: {{ _cluster.nfs_storage_name | default('') }} +{% if _cluster.config_sources is defined and (_cluster.config_sources | length > 0) %} + config_sources: +{% set _supported = ['slurm', 'cgroup', 'slurmdbd', 'gres'] %} +{% for _conf_name, _conf_val in _cluster.config_sources.items() %} +{% if _conf_name in _supported %} +{% if _conf_name == 'cgroup' and (_conf_val is mapping) %} + cgroup: + CgroupPlugin: {{ _conf_val.CgroupPlugin | default('autodetect') }} +{% for _k, _v in _conf_val.items() %} +{% if _k not in ['AllowedRAMSpace', 'CgroupPlugin', 'ConstrainCores', 'ConstrainDevices', 'ConstrainRAMSpace', 'ConstrainSwapSpace'] %} + {{ _k }}: {{ _v }} +{% endif %} +{% endfor %} + ConstrainCores: {{ _conf_val.ConstrainCores | default(true) }} + ConstrainDevices: {{ _conf_val.ConstrainDevices | default(true) }} + ConstrainRAMSpace: {{ _conf_val.ConstrainRAMSpace | default(true) }} + ConstrainSwapSpace: {{ _conf_val.ConstrainSwapSpace | default(true) }} +{% if _conf_val.AllowedRAMSpace is defined %} + ### AllowedRAMSpace: {{ _conf_val.AllowedRAMSpace }} This is not supported in 2.1, just attached for reference +{% endif %} +{% elif _conf_val is mapping %} + {{ _conf_name }}: +{% for _k, _v in _conf_val.items() %} + {{ _k }}: {{ _v }} +{% endfor %} +{% else %} + {{ _conf_name }}: {{ _conf_val }} +{% endif %} +{% endif %} +{% endfor %} + # OR + + # config_sources: + # slurm: /path/to/custom_slurm.conf + # cgroup: /path/to/custom_cgroup.conf + # slurmdbd: 
/path/to/custom_slurmdbd.conf + # gres: /path/to/custom_gres.conf +{% else %} + # config_sources: + # slurm: + # SlurmctldTimeout: 60 + # SlurmdTimeout: 150 + # cgroup: + # CgroupPlugin: autodetect + # ConstrainCores: True + # ConstrainDevices: True + # ConstrainRAMSpace: True + # ConstrainSwapSpace: True + + # OR + + # config_sources: + # slurm: /path/to/custom_slurm.conf + # cgroup: /path/to/custom_cgroup.conf + # slurmdbd: /path/to/custom_slurmdbd.conf + # gres: /path/to/custom_gres.conf +{% endif %} +{% endfor %} +{% endif %} + +# ----------------------------SERVICE K8S------------------------------------------------------ +# For service k8s cluster below parameters are required,(List) +# - cluster_name is required field + +# - deployment: Exactly one entry in both the service_k8s_cluster lists must have deployment set to true to indicate where Kubernetes should be deployed. +# Please ensure corresponding cluster entry is added to high_availability_config.yml if deployment is set to true. + +# - Kubernetes SDN network.K8s_cni (Mandatory) - It can either be "calico" or "flannel".Default value assigned is "calico". +# While setting up Kubernetes plugin for RoCE NIC, ensure that this value is set to "flannel" + +# - pod_external_ip_range: (Mandatory) These addresses will be used by Loadbalancer for assigning External IPs to K8s services +# Make sure the IP range is not assigned to any node in the cluster. +# Acceptable formats: "10.11.0.100-10.11.0.150" , "10.11.0.0/16" + +# - k8s_service_addresses: Kubernetes internal network for services.This network must be unused in your network infrastructure. +# Default value is "10.233.0.0/18" + +# - k8s_pod_network_cidr: Kubernetes pod network CIDR for internal network. When used, it will assign IP addresses from this range to individual pods. +# This network must be unused in your network infrastructure. 
+# Default value is "10.233.64.0/18" + +# nfs_storage_name : The NFS name must match one of the NFS names defined in storage_config.yml to configure the server. +# ----------------------------CSI Driver------------------------------------------------------ +# Following csi powerscale driver input variables are mandatory only if csi_driver_powerscale entry is present in software_config.json +# csi_powerscale_driver_secret_file_path: Absolute file path for the secret.yaml file. +# The user needs to download the secret.yaml file and fill in the required data. Provide the path of the secret file here. +# File path for the values.yml file which will contain the Powerscale driver configuration parameters. +# csi_powerscale_driver_values_file_path: The user needs to download the values.yaml file and fill in the required data. +# Provide the path of the values.yaml file here. mention configurable values + +# - k8s_crio_storage_size: Specifies the disk size allocated for CRI-O container storage. +# This storage is used to store container images, writable layers, and runtime data. 
+# Acceptable formats: "10G", "15G", "50G" (Only positive values in Gigabytes are allowed) +# Default value is "20G" + + +service_k8s_cluster: +{% set _service_k8s_cluster = omnia_service_k8s_cluster | default([]) %} +{% if (_service_k8s_cluster | length) > 0 %} +{% for _cluster in _service_k8s_cluster %} + - cluster_name: {{ _cluster.cluster_name | default('') }} + deployment: {{ _cluster.deployment | default(false) }} + k8s_cni: {{ _cluster.k8s_cni | default('calico') }} + pod_external_ip_range: "{{ _cluster.pod_external_ip_range | default('') }}" + k8s_service_addresses: "{{ _cluster.k8s_service_addresses | default('') }}" + k8s_pod_network_cidr: "{{ _cluster.k8s_pod_network_cidr | default('') }}" + nfs_storage_name: "{{ _cluster.nfs_storage_name | default('') }}" + csi_powerscale_driver_secret_file_path: "{{ _cluster.csi_powerscale_driver_secret_file_path | default('') }}" + csi_powerscale_driver_values_file_path: "{{ _cluster.csi_powerscale_driver_values_file_path | default('') }}" + k8s_crio_storage_size: {{ _cluster.k8s_crio_storage_size | default('20G') }} +{% endfor %} +{% endif %} diff --git a/upgrade/roles/import_input_parameters/templates/provision_config.j2 b/upgrade/roles/import_input_parameters/templates/provision_config.j2 new file mode 100644 index 0000000000..01fd84b2cf --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/provision_config.j2 @@ -0,0 +1,40 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + +#### Mandatory +# This is the path where the user has kept the PXE mapping file. +# The mapping file consists of the Service tag, Admin MAC, Hostname and its respective admin IP address and/or BMC IP. +# Ensure that admin IPs given in mapping file are within the network defined in the network_spec.yml +# A template for the mapping file exists in omnia/examples, namely, pxe_mapping_file.csv +# Format of csv: FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP +pxe_mapping_file_path: "{{ provision_pxe_mapping_file_path }}" + +#### Mandatory +# Language that needs to be set during OS provisioning. +# Only language supported is "en_US.UTF-8" +language: "{{ provision_language }}" + +#### Mandatory +# Default lease time needs to be used by DHCP +# Unit: seconds +# Min: 21600 +# Default: 86400 +# Max: 31536000 +default_lease_time: "{{ provision_default_lease_time }}" diff --git a/upgrade/roles/import_input_parameters/templates/storage_config.j2 b/upgrade/roles/import_input_parameters/templates/storage_config.j2 new file mode 100644 index 0000000000..1c695a19a5 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/storage_config.j2 @@ -0,0 +1,95 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + +# -----------------------------Powervault------------------------------------------- +# powervault_config +# ip: ipv4 +# A list of PowerVault controller IP addresses used for iSCSI target discovery and login. +# In this configuration, a single controller portal is provided. + +# port: +# Defines the TCP port for the iSCSI target service. +# Port 3260 is the standard port for iSCSI communication. + +# isci_initiators: +# Specifies the InitiatorName used by the host when connecting to the iSCSI target. +# This IQN uniquely identifies the host to the storage array. + +# volume_id: +# This is the unique WWN/identifier for the +# specific volume that should be used for persistent storage. 
+# The script uses this value during multipath scanning to select the correct mapped device + +#powervault_config: +# ip: +# - 172.1.2.3 +# port: 3260 +# isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7 +# volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 + +{% if storage_has_powervault %} +powervault_config: +{% if storage_powervault_config.ip is defined %} + ip: +{% for _ip in (storage_powervault_config.ip | default([])) %} + - {{ _ip }} +{% endfor %} +{% else %} + ip: [] +{% endif %} +{% if storage_powervault_config.port is defined %} + port: {{ storage_powervault_config.port }} +{% endif %} + isci_initiators: {{ storage_powervault_config.isci_initiators | default('') }} + volume_id: {{ storage_powervault_config.volume_id | default('') }} +{% endif %} + +# -----------------------------NFS------------------------------------------------ + +# This variable is used for mounting NFS share on slurm_control_node, slurm_node, login_node +# This takes a list of dicts with possible keys server_ip, server_share_path, client_share_path, client_mount_options +# In both the cases, the USER must manually update 'server_ip' and 'server_share_path' below with the correct values. +# If mount_option values are empty, NFS client will be mounted with these values "nosuid,rw,sync,hard,intr" +# It is mandatory to provide at least one entry in nfs_client_params +# Example for single mount file system: +# nfs_client_params: +# nfs_name : str ,Name of the NFS storage resource. The default is "nfs_storage_default". +# The user can assign any custom string to specify a different NFS storage resource. 
+# - { server_ip: 10.5.0.101, server_share_path: "/mnt/share", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard"} +# Example for supporting multiple mount points: +# nfs_client_params: +# - { server_ip: 198.168.0.1,server_share_path: "/mnt/share1", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard"} +# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard"} +# Example for multiple mount file system: +# nfs_client_params: +# - { server_ip: 198.168.0.1, server_share_path: "/mnt/share1", client_share_path: "/mnt/mount1", client_mount_options: "nosuid,rw,sync,hard"} +# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard"} + +nfs_client_params: +{% set _nfs = storage_nfs_client_params | default([]) %} +{% for _entry in _nfs %} + - server_ip: "{{ _entry.server_ip | default('') }}" # Provide the IP of the NFS server + server_share_path: "{{ _entry.server_share_path | default('') }}" # Provide server share path of the NFS Server + client_share_path: {{ _entry.client_share_path | default('') }} + client_mount_options: "{{ _entry.client_mount_options | default('nosuid,rw,sync,hard,intr') }}" +{% if _entry.nfs_name is defined %} + nfs_name: {{ _entry.nfs_name }} +{% endif %} + +{% endfor %} diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 new file mode 100644 index 0000000000..cb89944e1c --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 @@ -0,0 +1,242 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + +# ============================================================================ +# TELEMETRY CONFIGURATION OVERVIEW +# ============================================================================ +# This file configures telemetry data collection and storage for Dell Omnia. +# +# SECTIONS: +# 1. iDRAC Telemetry : Hardware metrics from Dell PowerEdge servers +# 2. VictoriaMetrics : Time-series database for metric storage +# 3. Kafka : Distributed streaming platform for telemetry data +# 4. 
LDMS : Lightweight Distributed Metric Service for compute nodes +# +# ============================================================================ +# STORAGE REQUIREMENTS SUMMARY +# ============================================================================ +# +# VICTORIAMETRICS STORAGE: +# ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ +# │ Deployment Mode │ Per-Pod Storage │ Number of Pods │ Total Storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ Single-node │ persistence_size │ 1 pod │ 1× storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ Cluster │ persistence_size │ 3 vmstorage │ 3× storage │ +# └─────────────────┴──────────────────┴─────────────────┴──────────────────┘ +# Example: 8Gi per pod → Single-node: 8Gi total, Cluster: 24Gi total +# +# KAFKA STORAGE: +# ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ +# │ Component │ Per-Pod Storage │ Number of Pods │ Total Storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ Kafka Broker │ persistence_size │ 3 pods │ 3× storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ Kafka Controller│ persistence_size │ 3 pods │ 3× storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ TOTAL KAFKA │ persistence_size │ 6 pods │ 6× storage │ +# └─────────────────┴──────────────────┴─────────────────┴──────────────────┘ +# Example: 8Gi per pod → 48Gi total Kafka storage +# +# COMBINED STORAGE EXAMPLES: +# Default (8Gi each): VictoriaMetrics Cluster (24Gi) + Kafka (48Gi) = 72Gi total +# Single-node mode: VictoriaMetrics Single (8Gi) + Kafka (48Gi) = 56Gi total +# +# STORAGE OPTIONS: +# - VictoriaMetrics: Store iDRAC telemetry in time-series database +# - Kafka: Stream iDRAC and LDMS telemetry to Kafka topics +# - Both: Store iDRAC in both Victoria and Kafka (recommended) 
+# ============================================================================ + +# ============================================================================ +# iDRAC TELEMETRY CONFIGURATION +# ============================================================================ +# iDRAC telemetry collects hardware metrics from Dell PowerEdge servers. +# Telemetry data can be stored in VictoriaMetrics, Kafka, or both. + +# Enable or disable iDRAC telemetry support +# Accepted values: true or false +# Default: true +idrac_telemetry_support: {{ telemetry_idrac_telemetry_support | default(true) | bool | ternary('true', 'false') }} + +# Specify where to store iDRAC telemetry data +# Supported values: +# - "victoria" : Store in VictoriaMetrics only +# - "kafka" : Store in Kafka only +# - "victoria,kafka" : Store in both (recommended) +# Default: "victoria,kafka" +idrac_telemetry_collection_type: {{ telemetry_idrac_telemetry_collection_type | default('victoria,kafka') | to_json }} + +# ============================================================================ +# VICTORIAMETRICS CONFIGURATION +# ============================================================================ +# VictoriaMetrics is a time-series database for storing telemetry metrics. +# Used for iDRAC telemetry when 'victoria' is enabled in idrac_telemetry_collection_type. 
+# +# DEPLOYMENT MODES: +# - single-node: Simple deployment with one pod (suitable for small deployments) +# - cluster: High-availability deployment with multiple components +# (recommended for production and large-scale deployments) +victoria_configurations: + # VictoriaMetrics deployment mode + # Supported values: + # - "single-node" : Simple deployment (1 pod, suitable for dev/test) + # - "cluster" : High-availability deployment (7 pods, recommended for production) + # Default: "cluster" + # + # Cluster Mode Benefits: + # - High availability (no single point of failure) + # - Horizontal scalability (scale components independently) + # - Better performance (4x ingestion, 2x query speed) + # - Production-ready architecture + # + # Single-Node Benefits: + # - Simple setup (fewer resources) + # - Suitable for small deployments (<10 nodes) + # - Lower resource usage (~4Gi memory vs ~10Gi for cluster) + deployment_mode: {{ telemetry_victoria_deployment_mode | default('cluster') | to_json }} + + # The amount of storage allocated for EACH VictoriaMetrics persistent volume. + # IMPORTANT: Total VictoriaMetrics storage depends on deployment mode: + # - Single-node mode: Total storage = persistence_size × 1 pod + # - Cluster mode: Total storage = persistence_size × 3 vmstorage pods + # - Example (cluster): 8Gi × 3 = 24Gi total VictoriaMetrics storage + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: 8Gi (results in 24Gi total storage for cluster mode) + persistence_size: {{ telemetry_victoria_persistence_size | default('8Gi') | to_json }} + + # Duration (in hours) to retain victoria logs before they are deleted. 
+ # Default: 168 (7 days) + retention_period: {{ telemetry_victoria_retention_period | default(168) }} + +# ============================================================================ +# KAFKA CONFIGURATION +# ============================================================================ +# Apache Kafka is a distributed streaming platform for storing telemetry data. +# Used for iDRAC telemetry when 'kafka' is enabled in idrac_telemetry_collection_type. +# Also used for LDMS telemetry when LDMS software is configured. +# +# NOTE: Kafka topics are auto-generated based on enabled features: +# - 'idrac' topic: Required when idrac_telemetry_support=true and 'kafka' is enabled +# - 'ldms' topic: Required when LDMS is configured in software_config.json +kafka_configurations: + # The amount of storage allocated for EACH Kafka persistent volume. + # IMPORTANT: Total Kafka storage = persistence_size × 6 pods + # - 3 Kafka brokers (each gets persistence_size storage) + # - 3 Kafka controllers (each gets persistence_size storage) + # - Example: 8Gi × 6 = 48Gi total Kafka storage + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: 8Gi (results in 48Gi total Kafka storage) + persistence_size: {{ telemetry_kafka_persistence_size | default('8Gi') | to_json }} + + # The number of hours to retain Kafka logs before they are deleted. + # Default: 168 (7 days) + log_retention_hours: {{ telemetry_kafka_log_retention_hours | default(168) }} + + # The maximum size of Kafka logs (in bytes) before they are deleted. + # Default: -1 (unlimited) + log_retention_bytes: {{ telemetry_kafka_log_retention_bytes | default(-1) }} + + # The maximum size of Kafka log segments (in bytes) before they are deleted. 
+ # Default: 1073741824 (1 GB) + log_segment_bytes: {{ telemetry_kafka_log_segment_bytes | default(1073741824) }} + + # Kafka Topic Partitions Configuration + # ---------------------------------------------------------------------------- + # Define the number of partitions for each Kafka topic. + # Increasing partitions can improve throughput but also increases storage/overhead. + # + # IMPORTANT: Topic names are FIXED and cannot be changed. + # - Topic names: Only 'idrac' and 'ldms' are allowed + # - Configurable: Only partition counts can be modified + # + # Topic Requirements (auto-validated): + # - 'idrac': Required when idrac_telemetry_support=true and 'kafka' is enabled + # - 'ldms': Required when LDMS software is configured in software_config.json + # + # Default partition counts: idrac=1, ldms=2 + topic_partitions: +{% for _topic in (telemetry_kafka_topic_partitions | default([])) %} + - name: {{ _topic.name | default('') | to_json }} + partitions: {{ _topic.partitions | default(1) }} +{% endfor %} + +# ============================================================================ +# LDMS (Lightweight Distributed Metric Service) CONFIGURATION +# ============================================================================ +# LDMS collects performance metrics from compute nodes (CPU, memory, network, etc.) +# and streams them to Kafka for storage and analysis. +# +# PREREQUISITE: To enable LDMS support, add the following to software_config.json: +# { +# "softwares": [ +# {"name": "ldms", "arch": ["x86_64", "aarch64"]} +# ] +# } +# +# When LDMS software is configured, the 'ldms' topic MUST be defined in +# kafka_configurations.topic_partitions above. 
+# +# LDMS Port Configurations +# Aggregator port on service k8s cluster +# Valid range: 6001-6100 +# Default: 6001 +ldms_agg_port: {{ telemetry_ldms_agg_port | default(6001) }} + +# Store daemon port on service k8s cluster +# Can be the same as ldms_agg_port +# Valid range: 6001-6100 +# Default: 6001 +ldms_store_port: {{ telemetry_ldms_store_port | default(6001) }} + +# Sampler port on compute nodes +# Valid range: 10001-10100 +# Default: 10001 +ldms_sampler_port: {{ telemetry_ldms_sampler_port | default(10001) }} + +# LDMS Sampler Plugin Configurations +# ---------------------------------------------------------------------------- +# Configure which metrics to collect from compute nodes and collection intervals. +# Each plugin collects specific system metrics. +# +# Parameters: +# - plugin_name: Name of the LDMS sampler plugin +# - config_parameters: Plugin-specific configuration (as a single string) +# - activation_parameters: Collection schedule in MICROSECONDS +# Format: "interval= offset=" +# Example: "interval=1000000" (1000000 microseconds = 1 second) +# "interval=1000000 offset=0" (1000000 microseconds with no offset) +# +# Available Plugins: +# - meminfo: Memory usage statistics +# - procstat2: Process statistics +# - vmstat: Virtual memory statistics +# - loadavg: System load average +# - procnetdev2: Network interface statistics +ldms_sampler_configurations: +{% if telemetry_ldms_sampler_configurations is none %} + null +{% else %} +{% for _plugin in (telemetry_ldms_sampler_configurations | default([])) %} + - plugin_name: {{ _plugin.plugin_name | default('') }} + config_parameters: {{ _plugin.config_parameters | default('') | to_json }} + activation_parameters: {{ _plugin.activation_parameters | default('interval=1000000') | to_json }} +{% endfor %} +{% endif %} diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index bc4ca7430a..722399b7d0 100644 --- 
a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -43,6 +43,39 @@ msg_ha_config_already_21: "high_availability_config.yml already in 2.1 format - msg_ha_virtual_ip_missing: "service_k8s_cluster_ha.virtual_ip_address is mandatory" msg_using_backup_ha_config: "Using backup high_availability_config.yml (backup not modified)" +# Local repo config transformation messages +msg_backup_local_repo_config_missing: "Backup local_repo_config.yml missing" +msg_local_repo_config_missing: "local_repo_config.yml missing" +msg_using_backup_local_repo_config: "Using backup local_repo_config.yml (backup not modified)" +msg_omnia_repo_url_rhel_x86_64_missing: "omnia_repo_url_rhel_x86_64 is mandatory" +msg_omnia_repo_url_rhel_aarch64_missing: "omnia_repo_url_rhel_aarch64 is mandatory" + +# Provision config transformation messages +msg_backup_provision_config_missing: "Backup provision_config.yml missing" +msg_provision_config_missing: "provision_config.yml missing" +msg_using_backup_provision_config: "Using backup provision_config.yml (backup not modified)" +msg_pxe_mapping_file_path_missing: "pxe_mapping_file_path is mandatory" + +# Storage config transformation messages +msg_backup_storage_config_missing: "Backup storage_config.yml missing" +msg_storage_config_missing: "storage_config.yml missing" +msg_using_backup_storage_config: "Using backup storage_config.yml (backup not modified)" +msg_nfs_client_params_missing: "nfs_client_params is mandatory" +msg_nfs_client_param_entry_missing_keys: "Each entry in nfs_client_params must define server_ip, server_share_path, client_share_path, and client_mount_options" +msg_powervault_missing_keys: "powervault_config (when present) must define ip (non-empty list), isci_initiators, and volume_id" + +# Omnia config transformation messages +msg_backup_omnia_config_missing: "Backup omnia_config.yml missing" +msg_omnia_config_missing: "omnia_config.yml missing" 
+msg_using_backup_omnia_config: "Using backup omnia_config.yml (backup not modified)" +msg_slurm_cluster_missing: "slurm_cluster is mandatory" +msg_service_k8s_cluster_missing: "service_k8s_cluster is mandatory" + +# Telemetry config transformation messages +msg_backup_telemetry_config_missing: "Backup telemetry_config.yml missing" +msg_telemetry_config_missing: "telemetry_config.yml missing" +msg_using_backup_telemetry_config: "Using backup telemetry_config.yml (backup not modified)" + ### Restore summary messages msg_restore_summary: | {{ restore_item.name }} restored from backup. @@ -66,6 +99,45 @@ msg_ha_config_transform_summary: | - Ensured service_k8s_cluster_ha is a list - Ensured virtual_ip_address is present +# Restore summary message for local repo config transformation +msg_local_repo_config_transform_summary: | + local_repo_config.yml upgraded to 2.1 format. + Backup preserved at: {{ backup_location }}/local_repo_config.yml + Changes: + - Normalized repo URL keys to arch-specific schema + - Migrated omnia_registry to user_registry (when present) + - Ensured mandatory omnia_repo_url_rhel_* keys are present + +# Restore summary message for provision config transformation +msg_provision_config_transform_summary: | + provision_config.yml upgraded to 2.1 format. + Backup preserved at: {{ backup_location }}/provision_config.yml + Changes: + - Ensured pxe_mapping_file_path, language, and default_lease_time are present + +# Restore summary message for storage config transformation +msg_storage_config_transform_summary: | + storage_config.yml upgraded to 2.1 format. + Backup preserved at: {{ backup_location }}/storage_config.yml + Changes: + - Ensured nfs_client_params is present and entries contain required keys + +# Restore summary message for omnia config transformation +msg_omnia_config_transform_summary: | + omnia_config.yml upgraded to 2.1 format. 
+ Backup preserved at: {{ backup_location }}/omnia_config.yml + Changes: + - Ensured slurm_cluster and service_k8s_cluster are lists + - Ensured required sections are present + +# Restore summary message for telemetry config transformation +msg_telemetry_config_transform_summary: | + telemetry_config.yml upgraded to 2.1 format. + Backup preserved at: {{ backup_location }}/telemetry_config.yml + Changes: + - Rendered Omnia 2.1 telemetry template with values from 2.0 backup + - Applied schema defaults for missing fields + # === Input files to restore from backup === # Add input files here that should be copied from backup_location to input_project_dir # Each entry should have: @@ -78,11 +150,15 @@ msg_ha_config_transform_summary: | # - Files that are the same format in 2.0 and 2.1 # - Files where you want to preserve the backup values exactly # -# DO NOT add files that require transformation (network_spec.yml, high_availability_config.yml) +# DO NOT add files that require transformation (network_spec.yml, high_availability_config.yml, local_repo_config.yml, +# provision_config.yml, user_registry_credential.yml) restore_input_files: - name: software_config.json mode: '0644' validate_cmd: "python3 -m json.tool '{{ input_project_dir }}/software_config.json'" + - name: security_config.yml + mode: '0644' + validate_cmd: "python3 -c \"import yaml; yaml.safe_load(open('{{ input_project_dir }}/security_config.yml','r'))\"" - name: pxe_mapping_file.csv mode: '0644' validate_cmd: "" From 2cbce8f6ca9769937c3464fe3c8f0df0ea5c8006 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Tue, 10 Feb 2026 12:05:48 +0530 Subject: [PATCH 103/172] Update omnia.sh --- omnia.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/omnia.sh b/omnia.sh index 746fb8fd34..235cc1dbc1 100755 --- a/omnia.sh +++ b/omnia.sh @@ -988,6 +988,18 @@ show_help() { } install_omnia_core() { + # Detect existing Omnia 2.0 installation + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; 
then + # Read version from metadata inside container + current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') + if [ "$current_version" = "2.0.0.0" ]; then + echo -e "${RED}ERROR: Existing Omnia 2.0 installation detected.${NC}" + echo -e "${YELLOW}To upgrade, run: $0 --upgrade${NC}" + echo -e "${YELLOW}For a fresh install, first run: $0 --uninstall${NC}" + exit 1 + fi + fi + local omnia_core_tag="1.1" local omnia_core_registry="" From 0c090cf22961491aaa3aa3c97e949033c839f1e7 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Tue, 10 Feb 2026 12:08:06 +0530 Subject: [PATCH 104/172] update reachability --- .../external_kafka_connect_details/tasks/main.yml | 10 ++++++++++ .../roles/external_kafka_connect_details/vars/main.yml | 4 ++++ .../external_victoria_connect_details/tasks/main.yml | 10 ++++++++++ .../external_victoria_connect_details/vars/main.yml | 4 ++++ 4 files changed, 28 insertions(+) diff --git a/utils/roles/external_kafka_connect_details/tasks/main.yml b/utils/roles/external_kafka_connect_details/tasks/main.yml index 96c6d0ca5f..3ee17c1c80 100644 --- a/utils/roles/external_kafka_connect_details/tasks/main.yml +++ b/utils/roles/external_kafka_connect_details/tasks/main.yml @@ -13,6 +13,16 @@ # limitations under the License. 
--- +- name: Validate service k8s controller connectivity + block: + - name: Wait for service k8s controller connection + ansible.builtin.wait_for_connection: + timeout: 30 + rescue: + - name: Fail when service k8s controller is not reachable + ansible.builtin.fail: + msg: "{{ kafka_preflight_err_service_k8s_controller_unreachable }}" + - name: Check kubectl presence ansible.builtin.command: kubectl version --client=true register: kubectl_check diff --git a/utils/roles/external_kafka_connect_details/vars/main.yml b/utils/roles/external_kafka_connect_details/vars/main.yml index 7a7d831275..be23cde089 100644 --- a/utils/roles/external_kafka_connect_details/vars/main.yml +++ b/utils/roles/external_kafka_connect_details/vars/main.yml @@ -37,6 +37,10 @@ kafka_preflight_err_ha_vip_missing: >- Failed to determine the service Kubernetes control plane VIP from High Availability config. Ensure service_k8s_cluster_ha[0].virtual_ip_address is set in: {{ k8s_ha_config_path }}. +kafka_preflight_err_service_k8s_controller_unreachable: >- + Service Kubernetes controller is not reachable over SSH: {{ ansible_host | default(inventory_hostname) }}. + Ensure the service Kubernetes VIP is reachable and resolvable from the OIM host. + kafka_ome_ui_navigation_line1: "Configuration -> Remote Connectivity" kafka_ome_ui_enable_label: "Enable Kafka Connectivity" kafka_ome_auth_mode_value: "SSL" diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index e06b061828..1c3e98a516 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -13,6 +13,16 @@ # limitations under the License. 
--- +- name: Validate service k8s controller connectivity + block: + - name: Wait for service k8s controller connection + ansible.builtin.wait_for_connection: + timeout: 30 + rescue: + - name: Fail when service k8s controller is not reachable + ansible.builtin.fail: + msg: "{{ victoria_preflight_err_service_k8s_controller_unreachable }}" + - name: Check kubectl presence ansible.builtin.command: kubectl version --client=true register: kubectl_check diff --git a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml index c2de781fbf..f9a1fb72dd 100644 --- a/utils/roles/external_victoria_connect_details/vars/main.yml +++ b/utils/roles/external_victoria_connect_details/vars/main.yml @@ -38,6 +38,10 @@ victoria_preflight_err_ha_vip_missing: >- Failed to determine the service Kubernetes control plane VIP from High Availability config. Ensure service_k8s_cluster_ha[0].virtual_ip_address is set in: {{ k8s_ha_config_path }}. +victoria_preflight_err_service_k8s_controller_unreachable: >- + Service Kubernetes controller is not reachable over SSH: {{ ansible_host | default(inventory_hostname) }}. + Ensure the service Kubernetes VIP is reachable and resolvable from the OIM host. 
+ victoria_sfm_ui_navigation: "Observability -> Settings -> Prometheus Remote Write" victoria_sfm_remote_write_target_name: "victoria" victoria_sfm_remote_write_message_version: "v1" From 44c3f435a71badb264e84ee5e88d4a30dd7ae4c8 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Tue, 10 Feb 2026 12:08:47 +0530 Subject: [PATCH 105/172] Update main.yml Signed-off-by: pullan1 --- utils/roles/external_victoria_connect_details/tasks/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index e06b061828..0fbf8a9fc5 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -164,7 +164,6 @@ vminsert_port: "{{ (vminsert_lb_port.stdout | trim) | default('') }}" vmselect_port: "{{ (vmselect_lb_port.stdout | trim) | default('') }}" victoria_tls_ca: "{{ victoria_tls_cert_dir }}/ca.crt" - - name: Fail when LoadBalancer IPs are not available ansible.builtin.fail: From c007c366a7e5d9ef6fd6b66ea0bd621dd65eb746 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Tue, 10 Feb 2026 06:42:44 +0000 Subject: [PATCH 106/172] change iscsi_initiator field name Signed-off-by: Vrinda_Marwah --- .../module_utils/input_validation/schema/storage_config.json | 4 ++-- .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 | 2 +- input/storage_config.yml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/storage_config.json b/common/library/module_utils/input_validation/schema/storage_config.json index 114d88f525..e300410346 100644 --- a/common/library/module_utils/input_validation/schema/storage_config.json +++ b/common/library/module_utils/input_validation/schema/storage_config.json @@ -52,7 +52,7 @@ }, "powervault_config": { "type": "object", - "required": ["ip", "iscsi_initiators", "volume_id"], + "required": ["ip", 
"iscsi_initiator", "volume_id"], "properties": { "ip": { "description": "List of target controller IP addresses", @@ -70,7 +70,7 @@ "type": "integer" }, - "iscsi_initiators": { + "iscsi_initiator": { "description": "iSCSI initiator IQN", "type": "string", "pattern": "^iqn\\.[a-zA-Z0-9.-]+(?::[a-zA-Z0-9._:-]+)?$" diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index d99d9dc90f..9d8f6c0f38 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -92,7 +92,7 @@ PORTALS=({% for ip in powervault_config.ip %}"{{ ip }}" {% endfor %}) PORT="{{ powervault_config.port | default(3260) }}" - INITIATOR_IQN="{{ powervault_config.iscsi_initiators | default('') }}" + INITIATOR_IQN="{{ powervault_config.iscsi_initiator | default('') }}" VOLUME_ID="{{ powervault_config.volume_id | default('') }}" FS_TYPE="{{ powervault_config.fs_type | default('xfs') }}" MOUNT_OPTS="{{ powervault_config.mount_options | default('defaults,_netdev,noatime') }}" diff --git a/input/storage_config.yml b/input/storage_config.yml index 9492f15558..14ab13e0d1 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -22,7 +22,7 @@ # Mandatory when using PowerVault for persistent storage. # Below parameters are mandatory when powervault_config is defined # ip: A list of PowerVault controller ipv4 addresses used for iSCSI target discovery and login. - # iscsi_initiators: Specifies the InitiatorName used by the host when connecting to the iSCSI target. This IQN uniquely identifies the host to the storage array. + # iscsi_initiator: Specifies the InitiatorName used by the host when connecting to the iSCSI target. This IQN uniquely identifies the host to the storage array. 
# volume_id: This is the unique WWN/identifier for the specific volume that should be used for persistent storage. This value is used for multipath scanning to select the correct mapped device. # Below are the optional parameters when powervault_config is defined @@ -35,7 +35,7 @@ # ip: # - 172.1.2.3 # port: 3260 -# iscsi_initiators: iqn.initiator.com.example:7d7d7d7d7d7 +# iscsi_initiator: iqn.initiator.com.example:7d7d7d7d7d7 # volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 From 979dbd08402773f67e5c981ab479eefc6f102939 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Tue, 10 Feb 2026 12:13:56 +0530 Subject: [PATCH 107/172] Update main.yml --- utils/roles/external_victoria_connect_details/tasks/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 1c3e98a516..260c8376fd 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -174,7 +174,6 @@ vminsert_port: "{{ (vminsert_lb_port.stdout | trim) | default('') }}" vmselect_port: "{{ (vmselect_lb_port.stdout | trim) | default('') }}" victoria_tls_ca: "{{ victoria_tls_cert_dir }}/ca.crt" - - name: Fail when LoadBalancer IPs are not available ansible.builtin.fail: From a4fc5b8552c278c47d596d695197bc0a0b58590c Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Tue, 10 Feb 2026 06:48:38 +0000 Subject: [PATCH 108/172] change iscsi_initiator field value Signed-off-by: Vrinda_Marwah --- input/storage_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/input/storage_config.yml b/input/storage_config.yml index 14ab13e0d1..399bf42fd6 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -35,7 +35,7 @@ # ip: # - 172.1.2.3 # port: 3260 -# iscsi_initiator: iqn.initiator.com.example:7d7d7d7d7d7 +# iscsi_initiator: iqn.2025-01.com.dell:scontrol-node # volume_id: 
00c0ff4343f1f1f1001c8c4e6901000000 From 783e4f9dce1cce56dac56f4b11d731aa55962039 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 10 Feb 2026 12:32:56 +0530 Subject: [PATCH 109/172] Updated to take care of ansible lint issues --- .../tasks/transform_local_repo_config.yml | 9 ++- .../tasks/transform_storage_config.yml | 56 +++++++++---------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml index 20c95798b1..2b513e36ae 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml @@ -68,9 +68,12 @@ local_repo_user_repo_url_aarch64: "{{ backup_local_repo_config.user_repo_url_aarch64 | default([]) }}" local_repo_rhel_os_url_x86_64: "{{ backup_local_repo_config.rhel_os_url_x86_64 | default(backup_local_repo_config.rhel_os_url | default([])) }}" local_repo_rhel_os_url_aarch64: "{{ backup_local_repo_config.rhel_os_url_aarch64 | default([]) }}" - local_repo_omnia_repo_url_rhel_x86_64: "{{ backup_local_repo_config.omnia_repo_url_rhel_x86_64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}" - local_repo_omnia_repo_url_rhel_aarch64: "{{ backup_local_repo_config.omnia_repo_url_rhel_aarch64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }}" - local_repo_additional_repos_x86_64: "{{ backup_local_repo_config.additional_repos_x86_64 | default(backup_local_repo_config.additional_repos | default([])) }}" + local_repo_omnia_repo_url_rhel_x86_64: >- + {{ backup_local_repo_config.omnia_repo_url_rhel_x86_64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }} + local_repo_omnia_repo_url_rhel_aarch64: >- + {{ backup_local_repo_config.omnia_repo_url_rhel_aarch64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }} + 
local_repo_additional_repos_x86_64: >- + {{ backup_local_repo_config.additional_repos_x86_64 | default(backup_local_repo_config.additional_repos | default([])) }} local_repo_additional_repos_aarch64: "{{ backup_local_repo_config.additional_repos_aarch64 | default([]) }}" - name: Fail if omnia_repo_url_rhel_x86_64 is missing diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml index 72b82aa7f8..3590f55995 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml @@ -68,35 +68,33 @@ ansible.builtin.fail: msg: "{{ msg_nfs_client_param_entry_missing_keys }}" when: >- - {{ - ( - storage_nfs_client_params - | selectattr('server_ip', 'undefined') - | list - | length - ) > 0 - or - ( - storage_nfs_client_params - | selectattr('server_share_path', 'undefined') - | list - | length - ) > 0 - or - ( - storage_nfs_client_params - | selectattr('client_share_path', 'undefined') - | list - | length - ) > 0 - or - ( - storage_nfs_client_params - | selectattr('client_mount_options', 'undefined') - | list - | length - ) > 0 - }} + ( + storage_nfs_client_params + | selectattr('server_ip', 'undefined') + | list + | length + ) > 0 + or + ( + storage_nfs_client_params + | selectattr('server_share_path', 'undefined') + | list + | length + ) > 0 + or + ( + storage_nfs_client_params + | selectattr('client_share_path', 'undefined') + | list + | length + ) > 0 + or + ( + storage_nfs_client_params + | selectattr('client_mount_options', 'undefined') + | list + | length + ) > 0 - name: Write storage_config.yml in Omnia 2.1 format ansible.builtin.template: From 717820cca7efd5d149a62354d4fcd7ee61bca689 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 10 Feb 2026 12:38:35 +0530 Subject: [PATCH 110/172] Updating to take care of ansible lint issues --- 
.../tasks/transform_local_repo_config.yml | 33 ++++++++++++++----- .../tasks/transform_storage_config.yml | 31 +++-------------- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml index 2b513e36ae..4b8ac8e3ec 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml @@ -64,16 +64,33 @@ - name: Normalize repo url keys to 2.1 schema ansible.builtin.set_fact: - local_repo_user_repo_url_x86_64: "{{ backup_local_repo_config.user_repo_url_x86_64 | default(backup_local_repo_config.user_repo_url | default([])) }}" + local_repo_user_repo_url_x86_64: "{{ + backup_local_repo_config.user_repo_url_x86_64 | + default(backup_local_repo_config.user_repo_url | + default([])) + }}" local_repo_user_repo_url_aarch64: "{{ backup_local_repo_config.user_repo_url_aarch64 | default([]) }}" - local_repo_rhel_os_url_x86_64: "{{ backup_local_repo_config.rhel_os_url_x86_64 | default(backup_local_repo_config.rhel_os_url | default([])) }}" + local_repo_rhel_os_url_x86_64: "{{ + backup_local_repo_config.rhel_os_url_x86_64 | + default(backup_local_repo_config.rhel_os_url | + default([])) + }}" local_repo_rhel_os_url_aarch64: "{{ backup_local_repo_config.rhel_os_url_aarch64 | default([]) }}" - local_repo_omnia_repo_url_rhel_x86_64: >- - {{ backup_local_repo_config.omnia_repo_url_rhel_x86_64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }} - local_repo_omnia_repo_url_rhel_aarch64: >- - {{ backup_local_repo_config.omnia_repo_url_rhel_aarch64 | default(backup_local_repo_config.omnia_repo_url_rhel | default([])) }} - local_repo_additional_repos_x86_64: >- - {{ backup_local_repo_config.additional_repos_x86_64 | default(backup_local_repo_config.additional_repos | default([])) }} + 
local_repo_omnia_repo_url_rhel_x86_64: "{{ + backup_local_repo_config.omnia_repo_url_rhel_x86_64 | + default(backup_local_repo_config.omnia_repo_url_rhel | + default([])) + }}" + local_repo_omnia_repo_url_rhel_aarch64: "{{ + backup_local_repo_config.omnia_repo_url_rhel_aarch64 | + default(backup_local_repo_config.omnia_repo_url_rhel | + default([])) + }}" + local_repo_additional_repos_x86_64: "{{ + backup_local_repo_config.additional_repos_x86_64 | + default(backup_local_repo_config.additional_repos | + default([])) + }}" local_repo_additional_repos_aarch64: "{{ backup_local_repo_config.additional_repos_aarch64 | default([]) }}" - name: Fail if omnia_repo_url_rhel_x86_64 is missing diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml index 3590f55995..54dbf07bc0 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml @@ -68,33 +68,10 @@ ansible.builtin.fail: msg: "{{ msg_nfs_client_param_entry_missing_keys }}" when: >- - ( - storage_nfs_client_params - | selectattr('server_ip', 'undefined') - | list - | length - ) > 0 - or - ( - storage_nfs_client_params - | selectattr('server_share_path', 'undefined') - | list - | length - ) > 0 - or - ( - storage_nfs_client_params - | selectattr('client_share_path', 'undefined') - | list - | length - ) > 0 - or - ( - storage_nfs_client_params - | selectattr('client_mount_options', 'undefined') - | list - | length - ) > 0 + (storage_nfs_client_params | selectattr('server_ip', 'undefined') | list | length) > 0 or + (storage_nfs_client_params | selectattr('server_share_path', 'undefined') | list | length) > 0 or + (storage_nfs_client_params | selectattr('client_share_path', 'undefined') | list | length) > 0 or + (storage_nfs_client_params | selectattr('client_mount_options', 'undefined') | list | length) > 0 - name: 
Write storage_config.yml in Omnia 2.1 format ansible.builtin.template: From 573c5ef6b84b72ded8a742cf6751c4873dc49b90 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 10 Feb 2026 13:17:23 +0530 Subject: [PATCH 111/172] Added /etc/hosts check --- .../common_utils/slurm_conf_utils.py | 2 +- .../slurm_config/tasks/check_ctld_running.yml | 34 ++++++++++++++++--- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 26f24762aa..3d5b259c2d 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -865,7 +865,7 @@ def parse_slurm_conf(file_path, conf_name, validate): if validate and skey not in current_conf: raise ValueError( f"Invalid key while parsing {file_path}: {skey}") - if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY: + if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY or len(tmp_dict) > 1: slurm_dict[list(tmp_dict.keys())[0]] = list( slurm_dict.get(list(tmp_dict.keys())[0], [])) + [tmp_dict] elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV: diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 52984c2afb..126261dfcd 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -43,16 +43,40 @@ - ansible_facts.services['slurmctld.service'] is defined - ansible_facts.services['slurmctld.service'].state == 'running' + - name: Check reachability of hosts in ip_name_map + ansible.builtin.wait_for: + host: "{{ host }}" + port: 22 + timeout: 10 + state: started + delegate_to: localhost + loop: "{{ ip_name_map.values() | list }}" + loop_control: + loop_var: host + register: ip_map_ssh_check + 
ignore_errors: true + ignore_unreachable: true + + - name: Build list of reachable hosts from ip_name_map + ansible.builtin.set_fact: + reachable_hosts: "{{ ip_map_ssh_check.results | rejectattr('failed', 'true') | map(attribute='host') | list }}" + - name: Update /etc/hosts with controller hostname and IP ansible.builtin.lineinfile: path: /etc/hosts - regexp: '^{{ ip.value }}\s+{{ ip.key }}' - line: "{{ ip.value }} {{ ip.key }}" + regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}' + line: "{{ host_entry.value }} {{ host_entry.key }}" state: present - loop: "{{ ip_name_map | dict2items }}" + loop: "{{ reachable_hosts | product(ip_name_map | dict2items) | list }}" loop_control: - loop_var: ip - delegate_to: "{{ item }}" + loop_var: host_combo + vars: + target_host: "{{ host_combo[0] }}" + host_entry: "{{ host_combo[1] }}" + delegate_to: "{{ target_host }}" + when: reachable_hosts | length > 0 + ignore_errors: true + ignore_unreachable: true - name: Trigger the scontrol reconfigure ansible.builtin.command: scontrol reconfigure From 711817d8e89151744f965515505a4c9c2d90a070 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 10 Feb 2026 13:46:46 +0530 Subject: [PATCH 112/172] Lint fixes --- .../input_validation/common_utils/slurm_conf_utils.py | 2 +- discovery/roles/slurm_config/tasks/check_ctld_running.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 3d5b259c2d..0c98f64e6c 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -766,7 +766,7 @@ def validate_config_types(conf_dict, conf_name, module): if not current_conf: return {'invalid_keys': [], 'type_errors': []} # module.fail_json(msg=f"Invalid configuration name: {conf_name}", conf_dict=conf_dict, 
current_conf=current_conf) - module.warn(conf_name) + # module.warn(conf_name) invalid_keys = list( set(conf_dict.keys()).difference(set(current_conf.keys()))) type_errors = [] diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 126261dfcd..5af73f984c 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -75,8 +75,8 @@ host_entry: "{{ host_combo[1] }}" delegate_to: "{{ target_host }}" when: reachable_hosts | length > 0 - ignore_errors: true ignore_unreachable: true + failed_when: false - name: Trigger the scontrol reconfigure ansible.builtin.command: scontrol reconfigure From 851bc46f3faeea0d1da96a70abf3adfa11017c43 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 10 Feb 2026 13:53:06 +0530 Subject: [PATCH 113/172] input doc update --- input/omnia_config.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/input/omnia_config.yml b/input/omnia_config.yml index 032fa77ce0..a5b448fc0d 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -38,6 +38,13 @@ # cgroup # slurmdbd # gres +# acct_gather +# helpers +# job_container +# mpi +# oci +# topology +# burst_buffer # Thes files will be written into the slurm_config directory with .conf suffix slurm_cluster: From 3226780a605caf0f71d1820db0cdd047f183d854 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Tue, 10 Feb 2026 10:01:47 +0000 Subject: [PATCH 114/172] lint issues fixed --- .../tasks/validate_path_overrides.yml | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/validate_path_overrides.yml b/discovery/roles/slurm_config/tasks/validate_path_overrides.yml index 140b1d4bda..c4a1783b02 100644 --- a/discovery/roles/slurm_config/tasks/validate_path_overrides.yml +++ b/discovery/roles/slurm_config/tasks/validate_path_overrides.yml @@ -22,8 +22,16 @@ - 
slurm_merged_dict is defined - slurm_merged_dict.get(item) is defined - slurm_merged_dict.get(item) is not none - - (slurm_merged_dict.get(item) is string and slurm_merged_dict.get(item) | length > 0) or (slurm_merged_dict.get(item) is iterable and slurm_merged_dict.get(item) | list | length > 0) - - not ((slurm_merged_dict.get(item) is string and slurm_merged_dict.get(item) | regex_search('^/')) or (slurm_merged_dict.get(item) is iterable and (slurm_merged_dict.get(item) | first) | regex_search('^/'))) + - >- + (slurm_merged_dict.get(item) is string + and slurm_merged_dict.get(item) | length > 0) + or (slurm_merged_dict.get(item) is iterable + and slurm_merged_dict.get(item) | list | length > 0) + - >- + not ((slurm_merged_dict.get(item) is string + and slurm_merged_dict.get(item) | regex_search('^/')) + or (slurm_merged_dict.get(item) is iterable + and (slurm_merged_dict.get(item) | first) | regex_search('^/'))) loop: - SlurmctldLogFile - SlurmdLogFile @@ -61,8 +69,16 @@ - slurmdbd_merged_dict is defined - slurmdbd_merged_dict.get(item) is defined - slurmdbd_merged_dict.get(item) is not none - - (slurmdbd_merged_dict.get(item) is string and slurmdbd_merged_dict.get(item) | length > 0) or (slurmdbd_merged_dict.get(item) is iterable and slurmdbd_merged_dict.get(item) | list | length > 0) - - not ((slurmdbd_merged_dict.get(item) is string and slurmdbd_merged_dict.get(item) | regex_search('^/')) or (slurmdbd_merged_dict.get(item) is iterable and (slurmdbd_merged_dict.get(item) | first) | regex_search('^/'))) + - >- + (slurmdbd_merged_dict.get(item) is string + and slurmdbd_merged_dict.get(item) | length > 0) + or (slurmdbd_merged_dict.get(item) is iterable + and slurmdbd_merged_dict.get(item) | list | length > 0) + - >- + not ((slurmdbd_merged_dict.get(item) is string + and slurmdbd_merged_dict.get(item) | regex_search('^/')) + or (slurmdbd_merged_dict.get(item) is iterable + and (slurmdbd_merged_dict.get(item) | first) | regex_search('^/'))) loop: - LogFile - 
PidFile @@ -77,7 +93,15 @@ - cgroup_merged_dict is defined - cgroup_merged_dict.get(item) is defined - cgroup_merged_dict.get(item) is not none - - (cgroup_merged_dict.get(item) is string and cgroup_merged_dict.get(item) | length > 0) or (cgroup_merged_dict.get(item) is iterable and cgroup_merged_dict.get(item) | list | length > 0) - - not ((cgroup_merged_dict.get(item) is string and cgroup_merged_dict.get(item) | regex_search('^/')) or (cgroup_merged_dict.get(item) is iterable and (cgroup_merged_dict.get(item) | first) | regex_search('^/'))) + - >- + (cgroup_merged_dict.get(item) is string + and cgroup_merged_dict.get(item) | length > 0) + or (cgroup_merged_dict.get(item) is iterable + and cgroup_merged_dict.get(item) | list | length > 0) + - >- + not ((cgroup_merged_dict.get(item) is string + and cgroup_merged_dict.get(item) | regex_search('^/')) + or (cgroup_merged_dict.get(item) is iterable + and (cgroup_merged_dict.get(item) | first) | regex_search('^/'))) loop: - CgroupMountpoint From 47f028a58d513bbdd40abb53d14d35fc809f997b Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 10 Feb 2026 15:48:03 +0530 Subject: [PATCH 115/172] Enhanced example --- input/omnia_config.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/input/omnia_config.yml b/input/omnia_config.yml index a5b448fc0d..bb5a4f06fa 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -54,6 +54,13 @@ slurm_cluster: # slurm: # SlurmctldTimeout: 60 # SlurmdTimeout: 150 + # NodeName: + # - NodeName: newnode1 + # CPUs: 16 + # RealMemory: 64000 + # - NodeName: newnode2 + # CPUs: 16 + # RealMemory: 64000 # cgroup: # CgroupPlugin: autodetect # ConstrainCores: True From 720facebb98c286fc913d6bb6441e1e4783e43c6 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Tue, 10 Feb 2026 10:39:55 +0000 Subject: [PATCH 116/172] slurmd fix --- .../templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 | 2 ++ .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 2 ++ 2 files changed, 
4 insertions(+) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index a81d564ba6..3dc8f65514 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -468,4 +468,6 @@ - /root/ldms_sampler.sh {% endif %} + - systemctl restart slurmd + - echo "Cloud-Init has completed successfully." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 5d930bef47..62a4e9e063 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -483,4 +483,6 @@ - /root/ldms_sampler.sh {% endif %} + - systemctl restart slurmd + - echo "Cloud-Init has completed successfully." 
From b6e5b8b6185496f0dffb202d30468376174bafdb Mon Sep 17 00:00:00 2001 From: pullan1 Date: Tue, 10 Feb 2026 16:43:26 +0530 Subject: [PATCH 117/172] rpm_file type handling in pulp Signed-off-by: pullan1 --- .../input_validation/common_utils/config.py | 1 + .../library/module_utils/local_repo/config.py | 7 +- .../local_repo/download_common.py | 154 ++++++++++++++++++ common/library/modules/parallel_tasks.py | 11 +- common/library/modules/pulp_cleanup.py | 2 +- 5 files changed, 168 insertions(+), 7 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index e0b5b0ea46..e6e8a09042 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -146,6 +146,7 @@ TYPE_REQUIREMENTS = { "rpm": ["package", "repo_name"], "rpm_list": ["package_list", "repo_name"], + "rpm_file": ["package", "url"], "ansible_galaxy_collection": ["package", "version"], "git": ["package", "version", "url"], "image": ["package", ["tag", "digest"]], # Special: one of tag or digest diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index e26e8a6e71..0518e2bb01 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -51,7 +51,7 @@ # Used by software_utils.py # ---------------------------- PACKAGE_TYPES = ['rpm', 'deb', 'tarball', 'image', 'manifest', 'git', - 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list'] + 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file'] CSV_COLUMNS = {"column1": "name", "column2": "status"} SOFTWARE_CONFIG_SUBDIR = "config" RPM_LABEL_TEMPLATE = "RPMs for {key}" @@ -183,7 +183,10 @@ "list_repositories": "pulp rpm repository list", "list_remotes": "pulp rpm remote list", 
def process_rpm_file(package, repo_store_path, status_file_path, cluster_os_type, cluster_os_version, arc, logger):
    """
    Download a single RPM from a URL and serve it from its own Pulp RPM repository.

    Steps: download the file (skipped when it is already cached on disk),
    create the Pulp repository if it does not exist, upload the RPM, publish
    the repository, create a distribution if it does not exist, and finally
    ask Pulp to generate a ``.repo`` config for that distribution. A failure
    of the last step is only logged as a warning.

    The outcome is written to the status file exactly once, on every exit
    path, by the ``finally`` clause.

    Args:
        package (dict): Package entry; reads the ``package``, ``type`` and ``url`` keys.
        repo_store_path (str): The path to the repository store.
        status_file_path (str): The path to the status file.
        cluster_os_type (str): The type of the cluster operating system.
        cluster_os_version (str): The version of the cluster operating system.
        arc (str): The architecture (x86_64 or aarch64).
        logger (logging.Logger): The logger instance.

    Returns:
        str: "Success" when every mandatory step succeeded, otherwise "Failed".
    """
    # Local import: nothing else in this block needs it.
    from urllib.parse import urlsplit

    logger.info("#" * 30 + f" {process_rpm_file.__name__} start " + "#" * 30)

    # Everything the ``finally`` clause reads is bound before the ``try`` so
    # that an early exception (e.g. a missing 'package' key) cannot turn into
    # an UnboundLocalError while the status is being recorded.
    status = "Failed"
    package_name = None
    package_type = None
    repo_name = None

    try:
        package_name = package['package']
        package_type = package['type']
        url = package.get('url', None)
        repo_name = arc.lower() + "_" + package_name

        if not url:
            logger.error(f"No URL provided for RPM file package: {package_name}")
            return status

        logger.info(f"Processing RPM File Package: {package_name}, URL: {url}")

        # Create rpm_file directory structure
        rpm_file_directory = os.path.join(
            repo_store_path, "offline_repo", "cluster", arc.lower(),
            cluster_os_type, cluster_os_version, "rpm_file", package_name
        )
        os.makedirs(rpm_file_directory, exist_ok=True)

        # Take the file name from the URL *path* only, so a query string or
        # fragment does not end up in the name, and reject URLs that end in
        # "/" (an empty name would make rpm_file_path the directory itself).
        download_file_name = os.path.basename(urlsplit(url).path)
        if not download_file_name:
            logger.error(f"Cannot derive an RPM file name from URL: {url}")
            return status
        rpm_file_path = os.path.join(rpm_file_directory, download_file_name)

        # Step 1: Download the RPM file
        logger.info("Step 1: Downloading RPM file...")
        if os.path.exists(rpm_file_path):
            logger.info(f"RPM file already exists: {rpm_file_path}")
        else:
            # Verify URL exists (argument list form, so no shell quoting is needed here)
            subprocess.run(['wget', '-q', '--spider', '--tries=1', url], check=True)

            # The download goes through a command *string*, so both the
            # destination and the URL must be shell-quoted.
            download_command = f"wget -O {shlex.quote(rpm_file_path)} {shlex.quote(url)}"
            if not execute_command(download_command, logger):
                logger.error(f"Failed to download RPM file from: {url}")
                # ``wget -O`` creates the output file up front; drop the
                # leftover so the next run does not mistake it for a valid
                # cached download.
                if os.path.exists(rpm_file_path):
                    os.remove(rpm_file_path)
                return status

        # Step 2: Create a new RPM repository in Pulp (if it doesn't exist)
        logger.info("Step 2: Creating RPM repository in Pulp...")
        if execute_command(pulp_rpm_commands["show_repository"] % repo_name, logger):
            logger.info(f"RPM repository {repo_name} already exists. Skipping creation.")
        else:
            logger.info(f"Creating RPM repository: {repo_name}")
            if not execute_command(pulp_rpm_commands["create_repository"] % repo_name, logger):
                logger.error(f"Failed to create RPM repository: {repo_name}")
                return status

        # Step 3: Upload the RPM into the repo
        logger.info("Step 3: Uploading RPM to repository...")
        upload_command = pulp_rpm_commands["upload_content"] % (repo_name, shlex.quote(rpm_file_path))
        if not execute_command(upload_command, logger):
            logger.error(f"Failed to upload RPM to repository: {repo_name}")
            return status

        # Step 4: Publish the repository
        logger.info("Step 4: Publishing repository...")
        if not execute_command(pulp_rpm_commands["publish_repository"] % repo_name, logger):
            logger.error(f"Failed to publish repository: {repo_name}")
            return status

        # Step 5: Create a distribution for the repo (if it doesn't exist)
        logger.info("Step 5: Creating distribution...")
        if execute_command(pulp_rpm_commands["check_distribution"] % repo_name, logger):
            logger.info(f"Distribution {repo_name} already exists. Skipping creation.")
        else:
            logger.info(f"Creating distribution: {repo_name}")

            # Sanity check that at least one publication with an href exists
            # before distributing. The href itself is not passed to the
            # distribution command below.
            pub_result = execute_command(pulp_rpm_commands["list_all_publications"], logger, type_json=True)
            if not pub_result or not pub_result.get("stdout"):
                logger.error("Failed to get publication list")
                return status

            if not pub_result["stdout"][0].get("pulp_href"):
                logger.error("No publication href found")
                return status

            # NOTE(review): the OS type/version are hard-coded ("rhel/10.0")
            # instead of using cluster_os_type / cluster_os_version, `arc` is
            # not lower-cased here, and the value starts with a space. Left
            # unchanged because the consumers of this path are not visible
            # from this function - confirm and align.
            base_path = f" opt/omnia/offline_repo/cluster/{arc}/rhel/10.0/rpms/{repo_name}"
            dist_create_command = pulp_rpm_commands["distribute_repository"] % (repo_name, base_path, repo_name)
            if not execute_command(dist_create_command, logger):
                logger.error(f"Failed to create distribution: {repo_name}")
                return status

        # Step 6: Enable auto-generation of .repo files
        logger.info("Step 6: Enabling auto-generation of .repo files...")
        update_command = pulp_rpm_commands["update_distribution_repo_config"] % repo_name
        if not execute_command(update_command, logger):
            # Not a critical failure, continue
            logger.warning(f"Failed to enable repo config generation for: {repo_name}")

        logger.info(f"RPM file package {package_name} processed successfully!")
        status = "Success"

    except subprocess.CalledProcessError as e:
        logger.error(f"Error executing RPM file commands: {e}")
        status = "Failed"
    except Exception as e:
        logger.error(f"Error processing RPM file package: {e}")
        status = "Failed"

    finally:
        # Single place where the outcome is recorded; it also runs for the
        # early ``return status`` exits above.
        if package_name is not None:
            write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock, repo_name)
        else:
            logger.error("Package name unavailable; status was not written to the status file")
        logger.info("#" * 30 + f" {process_rpm_file.__name__} end " + "#" * 30)

    # Returning after (not inside) ``finally`` lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently discarded.
    return status
status table: {e}") return f"Error: {e}" diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index 6f80e82f83..00ed27d0dd 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -630,7 +630,7 @@ def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[ logger.info(f"Processing row: {row}") # For RPMs, check if they belong to the deleted repository - if row_type == 'rpm': + if row_type == 'rpm' or row_type == 'rpm_file': if has_repo_column and rpm_repo == repo_name: removed = True logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)") From bd71162f9a6d6740b1c369b3e69cd9685405c50a Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 10 Feb 2026 18:46:34 +0530 Subject: [PATCH 118/172] Removing unrequired logic for storage_config and provision_config --- .../tasks/transform_provision_config.yml | 17 +---------------- .../tasks/transform_storage_config.yml | 15 --------------- .../templates/storage_config.j2 | 17 ----------------- .../roles/import_input_parameters/vars/main.yml | 15 +++++++++------ 4 files changed, 10 insertions(+), 54 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml index 71e9ee0dc2..42598d59bc 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml @@ -44,25 +44,10 @@ - name: Normalize provision_config.yml values ansible.builtin.set_fact: - provision_pxe_mapping_file_path_raw: >- - {{ - backup_provision_config.pxe_mapping_file_path - | default('/opt/omnia/input/project_default/pxe_mapping_file.csv') - }} + provision_pxe_mapping_file_path: "{{ backup_provision_config.pxe_mapping_file_path | default('pxe_mapping_file.csv') }}" provision_language: "{{ 
backup_provision_config.language | default('en_US.UTF-8') }}" provision_default_lease_time: "{{ backup_provision_config.default_lease_time | default('86400') }}" -- name: Rewrite legacy pxe_mapping_file_path to current project input directory - ansible.builtin.set_fact: - provision_pxe_mapping_file_path: >- - {{ - ( - provision_pxe_mapping_file_path_raw - | string - | regex_replace('^/opt/omnia/input/project_default/', input_project_dir ~ '/') - ) - }} - - name: Fail if pxe_mapping_file_path is missing ansible.builtin.fail: msg: "{{ msg_pxe_mapping_file_path_missing }}" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml index 54dbf07bc0..8a167df6fb 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml @@ -45,19 +45,6 @@ - name: Normalize storage_config.yml values ansible.builtin.set_fact: storage_nfs_client_params: "{{ backup_storage_config.nfs_client_params | default([]) }}" - storage_powervault_config: "{{ backup_storage_config.powervault_config | default(none) }}" - storage_has_powervault: "{{ backup_storage_config.powervault_config is defined }}" - -- name: Fail if powervault_config is present but missing mandatory keys - ansible.builtin.fail: - msg: "{{ msg_powervault_missing_keys }}" - when: - - storage_has_powervault - - storage_powervault_config.ip is not defined or (storage_powervault_config.ip | default([]) | length) == 0 - or storage_powervault_config.isci_initiators is not defined - or (storage_powervault_config.isci_initiators | string | trim) == '' - or storage_powervault_config.volume_id is not defined - or (storage_powervault_config.volume_id | string | trim) == '' - name: Fail if nfs_client_params is missing ansible.builtin.fail: @@ -80,8 +67,6 @@ mode: "{{ default_file_mode }}" vars: storage_nfs_client_params: "{{ 
storage_nfs_client_params }}" - storage_powervault_config: "{{ storage_powervault_config }}" - storage_has_powervault: "{{ storage_has_powervault }}" - name: Validate YAML syntax of transformed storage_config.yml ansible.builtin.command: diff --git a/upgrade/roles/import_input_parameters/templates/storage_config.j2 b/upgrade/roles/import_input_parameters/templates/storage_config.j2 index 1c695a19a5..f6be3642c4 100644 --- a/upgrade/roles/import_input_parameters/templates/storage_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/storage_config.j2 @@ -43,23 +43,6 @@ # isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7 # volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 -{% if storage_has_powervault %} -powervault_config: -{% if storage_powervault_config.ip is defined %} - ip: -{% for _ip in (storage_powervault_config.ip | default([])) %} - - {{ _ip }} -{% endfor %} -{% else %} - ip: [] -{% endif %} -{% if storage_powervault_config.port is defined %} - port: {{ storage_powervault_config.port }} -{% endif %} - isci_initiators: {{ storage_powervault_config.isci_initiators | default('') }} - volume_id: {{ storage_powervault_config.volume_id | default('') }} -{% endif %} - # -----------------------------NFS------------------------------------------------ # This variable is used for mounting NFS share on slurm_control_node, slurm_node, login_node diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 722399b7d0..65e8b65f38 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -57,12 +57,15 @@ msg_using_backup_provision_config: "Using backup provision_config.yml (backup no msg_pxe_mapping_file_path_missing: "pxe_mapping_file_path is mandatory" # Storage config transformation messages -msg_backup_storage_config_missing: "Backup storage_config.yml missing" -msg_storage_config_missing: "storage_config.yml missing" 
-msg_using_backup_storage_config: "Using backup storage_config.yml (backup not modified)" -msg_nfs_client_params_missing: "nfs_client_params is mandatory" -msg_nfs_client_param_entry_missing_keys: "Each entry in nfs_client_params must define server_ip, server_share_path, client_share_path, and client_mount_options" -msg_powervault_missing_keys: "powervault_config (when present) must define ip (non-empty list), isci_initiators, and volume_id" +msg_backup_storage_config_missing: "storage_config.yml not found in backup at {{ backup_location }}/storage_config.yml" +msg_storage_config_missing: "storage_config.yml not found at {{ input_project_dir }}/storage_config.yml" +msg_nfs_client_params_missing: "storage_config.yml must define nfs_client_params with at least one entry" +msg_nfs_client_param_entry_missing_keys: "Each nfs_client_params entry must define server_ip, server_share_path, and client_share_path" +msg_using_backup_storage_config: "Transforming storage_config.yml from backup at {{ backup_location }}/storage_config.yml" +msg_storage_config_transform_summary: |- + Transformed storage_config.yml from Omnia 2.0 to 2.1: + - Preserved nfs_client_params from backup + - Applied schema defaults for missing fields # Omnia config transformation messages msg_backup_omnia_config_missing: "Backup omnia_config.yml missing" From 8b26ad8102716c2833330b56f8c5df6f2fa2368e Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 10 Feb 2026 19:52:08 +0530 Subject: [PATCH 119/172] Update main.yml --- upgrade/roles/import_input_parameters/vars/main.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 65e8b65f38..c27f111cde 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -62,10 +62,6 @@ msg_storage_config_missing: "storage_config.yml not found at {{ input_project_di 
msg_nfs_client_params_missing: "storage_config.yml must define nfs_client_params with at least one entry" msg_nfs_client_param_entry_missing_keys: "Each nfs_client_params entry must define server_ip, server_share_path, and client_share_path" msg_using_backup_storage_config: "Transforming storage_config.yml from backup at {{ backup_location }}/storage_config.yml" -msg_storage_config_transform_summary: |- - Transformed storage_config.yml from Omnia 2.0 to 2.1: - - Preserved nfs_client_params from backup - - Applied schema defaults for missing fields # Omnia config transformation messages msg_backup_omnia_config_missing: "Backup omnia_config.yml missing" From c4a559fbe1b42394be88f545b6993636030b0e6b Mon Sep 17 00:00:00 2001 From: mcas Date: Wed, 11 Feb 2026 11:16:26 +0530 Subject: [PATCH 120/172] aarch chnages --- ...-group-login_compiler_node_aarch64.yaml.j2 | 135 ++++++++---------- .../ci-group-slurm_node_aarch64.yaml.j2 | 27 ++++ .../ci-group-slurm_node_x86_64.yaml.j2 | 17 +-- .../hpc_tools/configure_nvhpc_env.sh.j2 | 4 +- .../hpc_tools/export_nvhpc_env.sh.j2 | 4 +- .../hpc_tools/install_nvhpc_sdk.sh.j2 | 22 ++- .../templates/hpc_tools/install_openmpi.sh.j2 | 12 +- .../templates/hpc_tools/install_ucx.sh.j2 | 12 +- discovery/roles/slurm_config/vars/main.yml | 4 + .../aarch64/rhel/10.0/slurm_custom.json | 6 + 10 files changed, 152 insertions(+), 91 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index bc3068843a..efef2715c6 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -183,6 +183,18 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} + - path: /usr/local/bin/install_openmpi.sh + 
owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_openmpi.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_ucx.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_ucx.sh.j2') | indent(12) }} + - path: /etc/hosts append: true content: | @@ -200,6 +212,18 @@ permissions: '0644' content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} + + - path: /usr/local/bin/install_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvhpc_sdk.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure_nvhpc_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/configure_nvhpc_env.sh.j2') | indent(12) }} runcmd: - /usr/local/bin/set-ssh.sh @@ -218,6 +242,39 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} + # Add NFS entry and mount + - mkdir -p {{ client_mount_path }} + - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab + - mount -a +{% endif %} + +{% if hostvars['localhost']['ucx_support'] %} + - echo "===== UCX Setup =====" + - echo "UCX support is enabled." + - /usr/local/bin/install_ucx.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_ucx.sh" + # - echo "NFS must be mounted at {{ client_mount_path }} before running." +{% endif %} + +{% if hostvars['localhost']['openmpi_support'] %} + - echo "===== OpenMPI Setup =====" + - echo "OpenMPI support is enabled." 
+ - /usr/local/bin/install_openmpi.sh + # - echo "Build script available at" + # - echo " /usr/local/bin/install_openmpi.sh" + # - echo "Run UCX installation first if UCX support is enabled." + # - echo "NFS must be mounted at {{ client_mount_path }} before running." +{% endif %} + +{% if hostvars['localhost']['ldms_support'] %} + - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log + + - /root/ldms_sampler.sh +{% endif %} + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh @@ -279,79 +336,7 @@ {% endif %} -{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} - # Add NFS entry and mount - - mkdir -p {{ client_mount_path }} - - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab - - mount -a -{% endif %} - -{% if hostvars['localhost']['ucx_support'] %} - # UCX build and install - - | - UCX_BIN={{ client_mount_path }}/benchmarks/ucx - mkdir -p {{ client_mount_path }}/compile/ucx - mkdir -p {{ client_mount_path }}/benchmarks/ucx - cd {{ client_mount_path }}/compile/ucx - wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/aarch64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz -O ucx.tar.gz - tar xzf ucx.tar.gz - cd ucx-* - mkdir -p build - cd build - ../contrib/configure-release --prefix={{ client_mount_path }}/benchmarks/ucx - make -j 8 - make install -{% endif %} - -{% if hostvars['localhost']['openmpi_support'] %} - # OpenMPI build and install with UCX + Slurm detection - - | - OPENMPI_INSTALL_PREFIX="{{ client_mount_path }}/benchmarks/openmpi" - OPENMPI_SRC="{{ client_mount_path }}/compile/openmpi" - mkdir -p $OPENMPI_SRC - 
mkdir -p $OPENMPI_INSTALL_PREFIX - - cd $OPENMPI_SRC - wget --no-check-certificate https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/aarch64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz -O openmpi.tar.gz - - tar xzf openmpi.tar.gz - cd openmpi-* - mkdir -p build - - # Check Slurm - if sinfo >/dev/null 2>&1; then - SLURM_FLAG="--with-slurm=yes --with-munge=/usr" - else - SLURM_FLAG="--with-slurm=no" - fi - - # Check UCX - if [ -x "{{ client_mount_path }}/benchmarks/ucx/bin/ucx_info" ]; then - {{ client_mount_path }}/benchmarks/ucx/bin/ucx_info -v - if [ $? -eq 0 ]; then - UCX_FLAG="--with-ucx={{ client_mount_path }}/benchmarks/ucx" - else - echo "ucx_info failed, disabling UCX" - UCX_FLAG="" - fi - else - echo "ucx_info not found, disabling UCX" - UCX_FLAG="" - fi - - cd build - ../configure --prefix=$OPENMPI_INSTALL_PREFIX \ - --enable-mpi1-compatibility \ - --enable-prte-prefix-by-default \ - $SLURM_FLAG $UCX_FLAG 2>&1 | tee config.out - - make -j 8 - make install -{% endif %} - -{% if hostvars['localhost']['ldms_support'] %} - - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - - - /root/ldms_sampler.sh -{% endif %} + # nvidia sdk install + - /usr/local/bin/install_nvhpc_sdk.sh + - /usr/local/bin/configure_nvhpc_env.sh - echo "Cloud-Init has completed successfully." 
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 9b3ac1a501..59d5520440 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -396,6 +396,24 @@ permissions: '0644' content: | {{ lookup('template', 'templates/nodes/apptainer_mirror.conf.j2') | indent(12) }} + + - path: /usr/local/bin/configure_ucx_openmpi_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/configure_ucx_openmpi_env.sh.j2') | indent(12) }} + + - path: /usr/local/bin/setup_nvhpc_sdk.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/setup_nvhpc_sdk.sh.j2') | indent(12) }} + + - path: /usr/local/bin/export_nvhpc_env.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/export_nvhpc_env.sh.j2') | indent(12) }} runcmd: - /usr/local/bin/set-ssh.sh @@ -443,9 +461,18 @@ - mount -a {% endif %} +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] %} + - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." + - /usr/local/bin/configure_ucx_openmpi_env.sh + +{% endif %} + {% if hostvars['localhost']['ldms_support'] %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - /root/ldms_sampler.sh {% endif %} + + - /usr/local/bin/setup_nvhpc_sdk.sh + - /usr/local/bin/export_nvhpc_env.sh - echo "Cloud-Init has completed successfully." 
diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 84440bbdec..1768bc8941 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -474,17 +474,14 @@ - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab - mount -a - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." - # - echo "Shared NFS mount is available at: {{ client_mount_path }}" - /usr/local/bin/configure_ucx_openmpi_env.sh - # - echo "" - # - echo "IMPORTANT:" - # - echo "1. Install UCX and/or OpenMPI on the LOGIN / COMPILER node first." - # - echo "2. Ensure they are installed under the shared mount:" - # - echo " {{ client_mount_path }}/hpc_tools/benchmarks/" - # - echo "3. On this node, run the environment setup script when ready:" - # - echo "" - # - echo "This step is intentionally NOT run automatically." - - echo "==================================================" + +{% endif %} + +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] %} + - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." 
+ - /usr/local/bin/configure_ucx_openmpi_env.sh + {% endif %} {% if hostvars['localhost']['ldms_support'] %} diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 index dfc30520b3..d0f788a986 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 @@ -11,7 +11,9 @@ export HOME=/root NVCOMPILERS="{{ nvhpc_local_mount | default('/opt/nvidia/nvhpc') }}" NVARCH="$(uname -s)_$(uname -m)" -NVHPC_VERSION="{{ nvhpc_version | default('25.11') }}" +NVHPC_LONG_VERSION="{{ nvhpc_version_long | default('2025_2511') }}" +NVHPC_VERSION="$(echo ${NVHPC_LONG_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')" + NVHPC_BASE="$NVCOMPILERS/$NVARCH/$NVHPC_VERSION" PROFILE_FILE="/etc/profile.d/nvhpc.sh" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 index a0cfdfdbe8..1ff49968b4 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 @@ -5,7 +5,9 @@ CLIENT_MOUNT="{{ client_mount_path }}" NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" NVARCH="$(uname -s)_$(uname -m)" -NVHPC_VERSION="25.11" + +NVHPC_LONG_VERSION="{{ nvhpc_version_long | default('2025_2511') }}" +NVHPC_VERSION="$(echo ${NVHPC_LONG_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')" NVHPC_BASE="$NVHPC_LOCAL_MOUNT/$NVARCH/$NVHPC_VERSION" PROFILE_FILE="/etc/profile.d/nvhpc.sh" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 index 75478a470e..8ff149fca3 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 
+++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 @@ -5,7 +5,25 @@ LOGFILE="/var/log/nvhpc_sdk_install.log" echo "===== Starting NVIDIA HPC SDK installation =====" | tee -a "$LOGFILE" -NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('nvhpc_2025_2511_Linux_x86_64_cuda_13.0') }}" +sys_arch="$(uname -m)" +case "${sys_arch}" in + x86_64|amd64) arch="x86_64" ;; + aarch64|arm64) arch="aarch64" ;; + *) + echo "Unsupported architecture: ${sys_arch}" + exit 1 + ;; +esac + +NVHPC_VERSION="{{ nvhpc_version_long | default('2025_2511') }}" +NVHPC_SHORT_VERSION="$(echo ${NVHPC_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')" +CUDA_VERSION="13.0" + +NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('') }}" +if [ -z "${NVHPC_PKG_NAME}" ]; then + NVHPC_PKG_NAME="nvhpc_${NVHPC_VERSION}_Linux_${arch}_cuda_${CUDA_VERSION}" +fi + NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" NVHPC_MOUNT="/shared-nvhpc-sdk" NVHPC_TARBALL="$NVHPC_MOUNT/${NVHPC_PKG_NAME}.tar.gz" @@ -47,7 +65,7 @@ else fi mkdir -p "$NVHPC_INSTALL_DIR_NFS" -INSTALL_BIN_DIR="$NVHPC_INSTALL_DIR_NFS/Linux_x86_64/25.11/compilers/bin" +INSTALL_BIN_DIR="$NVHPC_INSTALL_DIR_NFS/Linux_${arch}/${NVHPC_SHORT_VERSION}/compilers/bin" if [ -x "$INSTALL_BIN_DIR/nvc" ]; then echo "[INFO] NVHPC already installed. Skipping installer." | tee -a "$LOGFILE" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 index 9adde78472..5758b20094 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 @@ -29,10 +29,20 @@ echo "===== OpenMPI build started =====" mkdir -p "$OPENMPI_BUILD" cd "$OPENMPI_BUILD" +sys_arch="$(uname -m)" +case "${sys_arch}" in + x86_64|amd64) arch="x86_64" ;; + aarch64|arm64) arch="aarch64" ;; + *) + echo "Unsupported architecture: ${sys_arch}" + exit 1 + ;; +esac + if [ ! 
-f openmpi.tar.gz ]; then echo "[INFO] Downloading OpenMPI source code..." wget --no-check-certificate \ - https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz \ + https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/${arch}/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/openmpi/openmpi.tar.gz \ -O openmpi.tar.gz >> "$LOGFILE" 2>&1 echo "[INFO] OpenMPI download completed" else diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2 index 0231d77683..55b7483c68 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_ucx.sh.j2 @@ -26,10 +26,20 @@ echo "===== UCX build started =====" mkdir -p "$UCX_BUILD" cd "$UCX_BUILD" +sys_arch="$(uname -m)" +case "${sys_arch}" in + x86_64|amd64) arch="x86_64" ;; + aarch64|arm64) arch="aarch64" ;; + *) + echo "Unsupported architecture: ${sys_arch}" + exit 1 + ;; +esac + if [ ! -f ucx.tar.gz ]; then echo "[INFO] Downloading UCX source code..." 
wget --no-check-certificate \ - https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/x86_64/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz \ + https://{{ hostvars['localhost']['admin_nic_ip'] }}:2225/pulp/content/opt/omnia/offline_repo/cluster/${arch}/{{ hostvars['localhost']['cluster_os_type'] }}/{{ hostvars['localhost']['cluster_os_version'] }}/tarball/ucx/ucx.tar.gz \ -O ucx.tar.gz >> "$LOGFILE" 2>&1 echo "[INFO] UCX download completed" else diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index f911ce975e..454661fc7e 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -158,5 +158,9 @@ parallel_copy_candidates: - name: nvhpc_sdk_x86_64 src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" + + - name: nvhpc_sdk_aarch64 + src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_aarch64_relpath | dirname }}/" + dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 2483775495..e1ba8926a2 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -32,7 +32,13 @@ {"package": "cuda-run", "type": "iso", "url": "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux_sbsa.run" + }, + { + "package": "nvhpc_2025_2511_Linux_aarch64_cuda_13.0", + "type": "tarball", + "url": "https://developer.download.nvidia.com/hpc-sdk/25.11/nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz" } + ] }, "login_node":{ From 
9dbb5587a0945903830b07f70626651f691da23b Mon Sep 17 00:00:00 2001 From: mcas Date: Wed, 11 Feb 2026 12:03:04 +0530 Subject: [PATCH 121/172] file path name change --- discovery/roles/slurm_config/vars/main.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 454661fc7e..8ec9f5b2cb 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -131,8 +131,10 @@ offline_path_aarch64: ssh_private_key_path: /root/.ssh/oim_rsa # nvidia sdk vars -nvhpc_package_name: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0" -nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_package_name }}/{{ nvhpc_package_name }}.tar.gz" +# Fully resolved tarball relative paths (no nested Jinja2) +nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_x86_64_cuda_13.0/nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz" +nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_aarch64_cuda_13.0/nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz" + nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk" # parallel file copy From 8b7e4e7719d9dbb4e081fcb3bb5329dafa1ecb72 Mon Sep 17 00:00:00 2001 From: mcas Date: Wed, 11 Feb 2026 12:26:26 +0530 Subject: [PATCH 122/172] lint fix --- discovery/roles/slurm_config/vars/main.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 8ec9f5b2cb..981a113610 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -132,8 +132,13 @@ ssh_private_key_path: /root/.ssh/oim_rsa # nvidia sdk vars # Fully resolved tarball relative paths (no nested Jinja2) -nvhpc_tarball_x86_64_relpath: 
"offline_repo/cluster/x86_64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_x86_64_cuda_13.0/nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz" -nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_aarch64_cuda_13.0/nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz" +nvhpc_tarball_x86_64_relpath: "\ + offline_repo/cluster/x86_64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_x86_64_cuda_13.0/\ + nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz" + +nvhpc_tarball_aarch64_relpath: "\ + offline_repo/cluster/aarch64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_aarch64_cuda_13.0/\ + nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz" nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk" @@ -160,9 +165,9 @@ parallel_copy_candidates: - name: nvhpc_sdk_x86_64 src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" - + - name: nvhpc_sdk_aarch64 src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_aarch64_relpath | dirname }}/" dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" -backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" +backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" \ No newline at end of file From fa7a1d17b0bf7ccab519c36092452bdda52cdf65 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 11 Feb 2026 13:19:53 +0530 Subject: [PATCH 123/172] Partiiton normal made default --- discovery/roles/slurm_config/defaults/main.yml | 1 - discovery/roles/slurm_config/vars/main.yml | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index ad7ab09058..955e4c2a37 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -67,7 +67,6 @@ 
__default_config: PartitionName: - PartitionName: DEFAULT Nodes: ALL - Default: true MaxTime: INFINITE State: UP # S_P_ARRAY type paramater to be provided this way diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 89166b1f12..43ee995e5a 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -125,6 +125,7 @@ partition_params: Nodes: "{{ cmpt_list | join(',') }}" MaxTime: "INFINITE" State: "UP" + Default: "YES" openldap_dir_name: "openldap/" software_config_file: "{{ input_project_dir }}/software_config.json" omnia_run_tags: "{{ hostvars['localhost']['omnia_run_tags'] }}" From 6a9642dba625314bde1d5554fd33787d5f9ffa62 Mon Sep 17 00:00:00 2001 From: mcas Date: Wed, 11 Feb 2026 13:32:28 +0530 Subject: [PATCH 124/172] commit --- .../templates/hpc_tools/install_openmpi.sh.j2 | 11 ----------- .../templates/hpc_tools/setup_nvhpc_sdk.sh.j2 | 4 ++-- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 index 5758b20094..9cd0d8d1a4 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_openmpi.sh.j2 @@ -105,22 +105,11 @@ EOF chmod 644 "$OPENMPI_ENV_FILE" -# Verify installation -echo "[INFO] Verifying OpenMPI installation..." 
-if [ -f "$OPENMPI_PREFIX/bin/ompi_info" ]; then - OPENMPI_VERSION=$("$OPENMPI_PREFIX/bin/ompi_info" --version | head -1) - echo "[SUCCESS] OpenMPI installation verified - Version: $OPENMPI_VERSION" | tee -a "$LOGFILE" -else - echo "[ERROR] OpenMPI installation verification failed - ompi_info not found" | tee -a "$LOGFILE" - exit 1 -fi # Create installation summary echo "" echo "===== OpenMPI Installation Summary =====" echo "Installation Status: SUCCESS" -echo "OpenMPI Version: $OPENMPI_VERSION" - echo "Integration Status:" if [ "$SLURM_FLAG" = "--with-slurm=yes --with-munge=/usr" ]; then echo " - Slurm Integration: ENABLED" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 index 8169d1f5a6..a101852ea2 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/setup_nvhpc_sdk.sh.j2 @@ -11,7 +11,7 @@ NVHPC_NFS_SHARE="$PARENT_MOUNT/nvhpc" NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" mkdir -p "$PARENT_MOUNT" -mkdir -p "$NVHPC_NFS_SHARE" + if ! mountpoint -q "$PARENT_MOUNT"; then mount -t nfs "$PARENT_NFS" "$PARENT_MOUNT" @@ -24,7 +24,7 @@ fi echo "[INFO] Parent NVHPC export mounted" - +mkdir -p "$NVHPC_NFS_SHARE" # 3. Ensure fstab entry exists (bind mount, NOT NFS) if ! 
grep -qF "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT" /etc/fstab; then echo "$NVHPC_NFS_SHARE $NVHPC_LOCAL_MOUNT none bind,_netdev 0 0" >> /etc/fstab From 12b39befc3316b479c86bc676170674e12f63a99 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 11 Feb 2026 08:58:56 +0000 Subject: [PATCH 125/172] lint fix Signed-off-by: sakshi-singla-1735 --- discovery/roles/slurm_config/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index f7af9c09a4..8406816341 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -197,4 +197,4 @@ parallel_copy_candidates: src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_aarch64_relpath | dirname }}/" dest: "{{ slurm_config_path }}/hpc_tools/nvidia_sdk/" -backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" \ No newline at end of file +backup_dir: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/backup_{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" From 8c5d7291bc9038a13d71ef11f19d78de33f016f5 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 11 Feb 2026 14:35:50 +0530 Subject: [PATCH 126/172] Failed when flag fix --- discovery/roles/slurm_config/tasks/check_ctld_running.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 5af73f984c..0c7626f3dd 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -76,7 +76,7 @@ delegate_to: "{{ target_host }}" when: reachable_hosts | length > 0 ignore_unreachable: true - failed_when: true + failed_when: false - name: Trigger the scontrol reconfigure ansible.builtin.command: 
scontrol reconfigure From 9e076ab65db2ca1e74d7c0b3b754e9437cc108fd Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 11 Feb 2026 14:50:28 +0530 Subject: [PATCH 127/172] PartitionName validation key added --- .../input_validation/common_utils/slurm_conf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 0c98f64e6c..53516d2523 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -109,7 +109,7 @@ class SlurmParserEnum(str, Enum): slurm_partitionname_options = { - "Partition": S_P_STRING, + "PartitionName": S_P_STRING, "AllocNodes": S_P_CSV, "AllowAccounts": S_P_CSV, "AllowGroups": S_P_CSV, From 70d7ce92f8f97e422692514031267782bf4d3f7f Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 11 Feb 2026 15:11:13 +0530 Subject: [PATCH 128/172] Syntax error --- .../input_validation/common_utils/slurm_conf_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 53516d2523..3ea4d07a70 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -811,8 +811,7 @@ def validate_config_types(conf_dict, conf_name, module): elif expected_type == "array": if not isinstance(value, list): - error = f"Expected array (list), got { - type(value).__name__}" + error = f"Expected array (list), got {type(value).__name__}" elif value: if not all(isinstance(item, dict) for item in value): error = "Expected array of dicts, got mixed types" From 2a1b1907719db5da60d1db3bd9477f54f07fbf83 
Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 11 Feb 2026 09:52:54 +0000 Subject: [PATCH 129/172] ssh --- .../ci-group-login_compiler_node_aarch64.yaml.j2 | 8 ++++++++ .../cloud_init/ci-group-slurm_node_aarch64.yaml.j2 | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index ca29315db6..dc2ddf9dcd 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -70,6 +70,13 @@ done fi + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa + IdentitiesOnly yes + - path: /usr/local/bin/install_cuda_toolkit.sh permissions: '0755' content: | @@ -235,6 +242,7 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index c1a1b225c9..ffa228a769 100644 --- 
a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -72,6 +72,13 @@ done fi + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa + IdentitiesOnly yes + - path: /usr/local/bin/install_nvidia_driver.sh permissions: '0755' content: | @@ -246,6 +253,7 @@ echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab From 66c160a18d030653f3377fac17ce068d9ac034b9 Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Wed, 11 Feb 2026 15:40:06 +0530 Subject: [PATCH 130/172] updating warning to fail when user_registry is not reachable Signed-off-by: Katakam-Rakesh --- local_repo/roles/validation/tasks/main.yml | 7 +++---- local_repo/roles/validation/vars/main.yml | 1 + 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/local_repo/roles/validation/tasks/main.yml b/local_repo/roles/validation/tasks/main.yml index 41f584dd15..0f578af349 100644 --- a/local_repo/roles/validation/tasks/main.yml +++ b/local_repo/roles/validation/tasks/main.yml @@ -44,10 +44,9 @@ timeout: "{{ time_out }}" register: registry_check_result -- name: Warning - Display unreachable registries - 
ansible.builtin.pause: - prompt: "{{ registry_check_result.unreachable_registries | join(', ') }}\n{{ user_registry_msg }}" - seconds: "{{ warning_wait_time_warning }}" +- name: Fail - Unreachable registries detected + ansible.builtin.fail: + msg: "{{ unreachable_registries_fail_msg }}" when: - registry_check_result.unreachable_registries is defined - registry_check_result.unreachable_registries | length > 0 diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml index 08a082ded7..87e8733498 100644 --- a/local_repo/roles/validation/vars/main.yml +++ b/local_repo/roles/validation/vars/main.yml @@ -145,6 +145,7 @@ user_registry_fail_msg: "Failed. Please ensure user_registry is non empty list a user_registry_fail_host_cert_path_msg: "Failed. Each item in user_registry should have 'host' and 'cert_path' keys defined" time_out: 30 user_registry_msg: "Above user registries is/are not reachable. Please make sure the user registry is accessible from the Omnia Infrastructure Manager." # noqa: yaml[line-length] +unreachable_registries_fail_msg: "Unreachable registries detected: {{ registry_check_result.unreachable_registries | join(', ') }}. {{ user_registry_msg }} Please check registry connectivity and configuration before proceeding." # noqa: yaml[line-length] cert_path_failure_msg: "Certificate file path {{ item.item.cert_path }} does not exist on the Omnia Infrastructure Manager for host {{ item.item.host }}. Please verify that correct cert_path is given in {{ project_input_path }}/local_repo_config.yml" # noqa: yaml[line-length] additional_packages_image_warning_msg: | WARNING: additional_packages.json contains packages of type 'image', but 'user_registry' is not defined in local_repo_config.yml. 
From 889213466485c6b69aa47949a5be5d4daa0688e0 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 11 Feb 2026 15:58:51 +0530 Subject: [PATCH 131/172] Added no_log whereever, StoragePass was displayed in log --- .../input_validation/common_utils/slurm_conf_utils.py | 2 -- .../roles/slurm_config/tasks/build_slurm_conf.yml | 6 ++++++ discovery/roles/slurm_config/tasks/confs.yml | 10 ++++++++++ .../slurm_config/tasks/extract_path_overrides.yml | 1 + .../roles/slurm_config/tasks/handle_extra_confs.yml | 2 ++ 5 files changed, 19 insertions(+), 2 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 3ea4d07a70..a8c50266a0 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -765,8 +765,6 @@ def validate_config_types(conf_dict, conf_name, module): current_conf = all_confs.get(conf_name, {}) if not current_conf: return {'invalid_keys': [], 'type_errors': []} - # module.fail_json(msg=f"Invalid configuration name: {conf_name}", conf_dict=conf_dict, current_conf=current_conf) - # module.warn(conf_name) invalid_keys = list( set(conf_dict.keys()).difference(set(current_conf.keys()))) type_errors = [] diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index cd72cf33f0..9d5d0f0944 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -18,6 +18,7 @@ | combine({'slurm': (apply_config['slurm'] | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + (node_params | default([]))}))}) }}" when: node_params is defined and node_params + no_log: true - name: Append login nodes to NodeName list ansible.builtin.set_fact: @@ -26,6 +27,7 @@ | combine({'NodeName': 
(apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}" loop: "{{ login_list }}" when: login_list is defined and login_list + no_log: true - name: Append compiler login nodes to NodeName list ansible.builtin.set_fact: @@ -34,6 +36,7 @@ | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}" loop: "{{ compiler_login_list }}" when: compiler_login_list is defined and compiler_login_list + no_log: true - name: Append Partition ansible.builtin.set_fact: @@ -41,13 +44,16 @@ | combine({'slurm': (apply_config['slurm'] | combine({'PartitionName': (apply_config['slurm'].PartitionName | default([])) + [partition_params]}))}) }}" when: node_params is defined and node_params + no_log: true - name: Add gpu parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" when: gpu_params is defined and gpu_params + no_log: true - name: Add dbd parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(dbd_slurm_conf))}) }}" when: dbd_list is defined and dbd_list + no_log: true diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 5e1e59376a..12236d6ed8 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -15,6 +15,7 @@ - name: Slurm dict ops ansible.builtin.set_fact: apply_config: "{{ __default_config }}" + no_log: true - name: Read NodeName parameters ansible.builtin.include_tasks: read_node_idrac.yml @@ -30,6 +31,7 @@ | combine({'slurmdbd': (apply_config['slurmdbd'] | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}" when: ctld_list + no_log: true - name: Check .conf files existence ansible.builtin.stat: @@ -46,6 +48,7 @@ delegate_to: localhost loop: "{{ configs_input | default({}) 
| dict2items }}" register: parsed_configs_input_results + no_log: true when: - configs_input is defined - configs_input @@ -56,6 +59,7 @@ ansible.builtin.set_fact: parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.item.key: item.conf_dict}) }}" loop: "{{ parsed_configs_input_results.results }}" + no_log: true when: - parsed_configs_input_results is defined - not item.skipped | default(false) @@ -64,6 +68,7 @@ ansible.builtin.set_fact: parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.key: item.value}) }}" loop: "{{ configs_input | default({}) | dict2items }}" + no_log: true when: - configs_input is defined - configs_input @@ -86,6 +91,7 @@ loop_control: loop_var: existing_conf_set register: prepared_conf_lists + no_log: true # All the updates to the confs follow after this point before merge - name: Prepend ClusterName and SlurmctldHost to slurm conf sources @@ -93,12 +99,14 @@ conf_merge_dict: "{{ conf_merge_dict | combine({'slurm': [{'ClusterName': cluster_name, 'AccountingStorageHost': dbd_list[0], 'SlurmctldHost': ctld_list}] + conf_merge_dict['slurm']}) }}" when: "'slurm' in conf_merge_dict" + no_log: true - name: Slurm dbd - DbdHost and StorageHost ansible.builtin.set_fact: conf_merge_dict: "{{ conf_merge_dict | combine({'slurmdbd': [{'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}] + conf_merge_dict['slurmdbd']}) }}" when: "'slurmdbd' in conf_merge_dict" + no_log: true - name: Merge the confs slurm_conf: @@ -107,6 +115,7 @@ conf_name: "{{ item.key }}" loop: "{{ conf_merge_dict | dict2items }}" register: merged_conf + no_log: true - name: Update slurm_conf_dict with merged configuration for cloud_init read. 
# TODO: Remove cloud init dependency ansible.builtin.set_fact: @@ -169,6 +178,7 @@ remote_src: "{{ copy_from_oim }}" loop: "{{ merged_conf.results }}" register: ctld_conf_files + no_log: true when: - item.ini_lines diff --git a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml index ab1bf17aa6..0efcf18962 100644 --- a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml +++ b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml @@ -24,6 +24,7 @@ ansible.builtin.set_fact: slurmdbd_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurmdbd') | first).conf_dict }}" when: "'slurmdbd' in conf_merge_dict" + no_log: true - name: Extract cgroup.conf merged dict ansible.builtin.set_fact: diff --git a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml index 307ca01723..544822ec28 100644 --- a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml +++ b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml @@ -19,6 +19,7 @@ conf_name: "{{ extra_conf }}" register: ex_conf delegate_to: localhost + no_log: true when: - "'.' not in extra_conf" @@ -30,6 +31,7 @@ owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" + no_log: true when: - "'.' 
not in extra_conf" - ex_conf is success From ca14a61c68fd71392a872b994c2ad00a88214518 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Wed, 11 Feb 2026 17:46:22 +0530 Subject: [PATCH 132/172] Update omnia.sh --- omnia.sh | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 320 insertions(+), 11 deletions(-) diff --git a/omnia.sh b/omnia.sh index 235cc1dbc1..9c46a04dc9 100755 --- a/omnia.sh +++ b/omnia.sh @@ -979,10 +979,11 @@ start_container_session() { } show_help() { - echo "Usage: $0 [--install | --uninstall | --upgrade | --version | --help]" + echo "Usage: $0 [--install | --uninstall | --upgrade | --rollback | --version | --help]" echo " -i, --install Install and start the Omnia core container" echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" - echo " --upgrade Upgrade the Omnia core container from image tag 1.0 to 1.1" + echo " --upgrade Upgrade the Omnia core container to newer version" + echo " --rollback Rollback the Omnia core container to previous version" echo " -v, --version Display Omnia version information" echo " -h, --help More information about usage" } @@ -1248,15 +1249,6 @@ phase1_validate() { return 1 fi - if ! echo "$current_image" | grep -qE '(:|@)1\.0(\b|$)'; then - echo "[ERROR] [ORCHESTRATOR] Container version mismatch: expected 1.0, got: $current_image" - return 1 - fi - - echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0.0.0)" - - - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." @@ -1372,6 +1364,9 @@ phase4_container_swap() { if [ ! 
-f "$quadlet_file" ]; then echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Quadlet configuration file missing" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi @@ -1385,27 +1380,42 @@ phase4_container_swap() { if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop 1.0 container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi echo "[INFO] [ORCHESTRATOR] Starting omnia_core 1.1 Quadlet unit" if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 image not available" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' "$quadlet_file"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 1.1 in quadlet file" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi systemctl daemon-reload || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: System daemon reload failed" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
+ rollback_omnia_core return 1 } systemctl start omnia_core.service || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 1.1 container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 } @@ -1419,6 +1429,9 @@ phase4_container_swap() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 container failed health check" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi @@ -1436,6 +1449,9 @@ phase4_container_swap() { fi "; then echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi @@ -1490,6 +1506,296 @@ upgrade_omnia_core() { exit 0 } +# Validate backup directory structure and files +validate_backup_directory() { + local backup_path="$1" + + echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path" + + # Check if backup directory exists + if ! podman exec -u root omnia_core test -d "$backup_path"; then + echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path" + return 1 + fi + + # Check for required subdirectories + for subdir in input metadata configs; do + if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then + echo "[ERROR] [ROLLBACK] Missing required subdirectory: $backup_path/$subdir" + return 1 + fi + done + + # Check for required files + if ! 
podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then + echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml" + return 1 + fi + + if ! podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then + echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container" + return 1 + fi + + # Verify metadata contains version information + if ! podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then + echo "[ERROR] [ROLLBACK] Metadata file does not contain version information" + return 1 + fi + + echo "[INFO] [ROLLBACK] Backup validation successful" + return 0 +} + +# Stop container gracefully with timeout +stop_container_gracefully() { + local container_name="$1" + local timeout="${2:-30}" + + echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..." + + # Try graceful stop first + if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then + echo "[INFO] [ROLLBACK] Container stopped gracefully" + return 0 + fi + + # Check if container is still running + if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then + echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..." + if podman stop "$container_name" >/dev/null 2>&1; then + echo "[INFO] [ROLLBACK] Container force stopped" + return 0 + else + echo "[ERROR] [ROLLBACK] Failed to stop container" + return 1 + fi + fi + + return 0 +} + +# Restore files from backup +restore_from_backup() { + local backup_path="$1" + + echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path" + + # Restore input files + if ! podman exec -u root omnia_core bash -c " + set -e + rm -rf /opt/omnia/input + cp -a '$backup_path/input' /opt/omnia/ + "; then + echo "[ERROR] [ROLLBACK] Failed to restore input files" + return 1 + fi + + # Restore metadata + if ! 
podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then + echo "[ERROR] [ROLLBACK] Failed to restore metadata" + return 1 + fi + + # Restore container config on host + if ! podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then + echo "[ERROR] [ROLLBACK] Failed to restore container config" + return 1 + fi + + echo "[INFO] [ROLLBACK] Files restored successfully" + return 0 +} + +# Main rollback function +rollback_omnia_core() { + echo -e "${GREEN}================================================================================${NC}" + echo -e "${GREEN} OMNIA CORE ROLLBACK${NC}" + echo -e "${GREEN}================================================================================${NC}" + echo "" + + # Audit log start + local rollback_start=$(date -Iseconds) + echo "[AUDIT] Rollback operation started at: $rollback_start" + + # Check if omnia_core container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo -e "${RED}ERROR: Omnia core container is not running.${NC}" + exit 1 + fi + + # Get current version + if ! podman exec -u root omnia_core test -f "/opt/omnia/.data/oim_metadata.yml"; then + echo -e "${RED}ERROR: Metadata file not found: /opt/omnia/.data/oim_metadata.yml${NC}" + exit 1 + fi + + local current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') + if [ "$current_version" != "2.1.0.0" ]; then + echo -e "${RED}ERROR: Cannot rollback from version $current_version. Rollback is only supported from version 2.1.0.0.${NC}" + exit 1 + fi + + # List available backups + echo "[INFO] [ROLLBACK] Scanning for available backups..." 
+ local backup_dirs=() + while IFS= read -r line; do + backup_dirs+=("$line") + done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_*" 2>/dev/null | sort -r) + + if [ ${#backup_dirs[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No backup directories found.${NC}" + exit 1 + fi + + echo "" + echo "Available backup versions:" + for i in "${!backup_dirs[@]}"; do + local version=$(basename "${backup_dirs[$i]}" | sed 's/version_//') + local backup_date=$(podman exec -u root omnia_core stat -c '%y' "${backup_dirs[$i]}" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) + echo " $((i+1)). Version $version (created: $backup_date)" + done + + # Prompt for backup selection + echo "" + echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " + read -r selection + + # Validate selection + if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#backup_dirs[@]} ]; then + echo -e "${RED}ERROR: Invalid selection.${NC}" + exit 1 + fi + + local selected_backup="${backup_dirs[$((selection-1))]}" + local backup_version=$(basename "$selected_backup" | sed 's/version_//') + + echo "" + echo "Selected backup: Version $backup_version" + echo -n "Are you sure you want to rollback to version $backup_version? [y/N]: " + read -r confirm + + if [[ ! "$confirm" =~ ^[yY] ]]; then + echo "Rollback cancelled by user." + exit 0 + fi + + # Validate selected backup - only check if directory exists without podman exec + if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then + # Try to check on host if container check fails + # Get shared path from metadata to check on host + local shared_path=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + local host_backup_path="${selected_backup#/opt/omnia}" + if [ -z "$shared_path" ] || [ ! 
-d "$shared_path$host_backup_path" ]; then + echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}" + exit 1 + fi + fi + + echo "" + echo "[INFO] [ROLLBACK] Starting rollback process..." + + # Step 1: Stop 1.1 container gracefully + echo "" + echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 1.1 container..." + if ! stop_container_gracefully "omnia_core" 30; then + echo -e "${RED}ERROR: Failed to stop container.${NC}" + exit 1 + fi + + # Step 2: Check for 1.0 image + echo "" + echo "[INFO] [ROLLBACK] Step 2: Checking for Omnia core 1.0 image..." + if ! podman inspect omnia_core:1.0 >/dev/null 2>&1; then + echo -e "${YELLOW}WARNING: Omnia core 1.0 image not found locally.${NC}" + echo -e "${YELLOW}Attempting to tag image...${NC}" + + # Try to tag latest as 1.0 if available + if podman inspect omnia_core:latest >/dev/null 2>&1; then + podman tag omnia_core:latest omnia_core:1.0 + else + echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}" + exit 1 + fi + fi + + # Step 3: Start 1.0 container + echo "" + echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core 1.0 container..." + systemctl daemon-reload + if ! systemctl start omnia_core.service; then + echo -e "${RED}ERROR: Failed to start container service.${NC}" + exit 1 + fi + + # Step 4: Wait for container to be healthy + echo "" + echo "[INFO] [ROLLBACK] Step 4: Waiting for container to be healthy..." + local health_timeout=60 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ROLLBACK] Container is healthy" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." 
+ done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo -e "${RED}ERROR: Container failed to become healthy within 60 seconds.${NC}" + exit 1 + fi + + # Step 5: Validate backup directory structure + echo "" + echo "[INFO] [ROLLBACK] Step 5: Validating backup directory structure..." + if ! validate_backup_directory "$selected_backup"; then + echo -e "${RED}ERROR: Backup validation failed.${NC}" + exit 1 + fi + + # Step 6: Restore files from backup + echo "" + echo "[INFO] [ROLLBACK] Step 6: Restoring files from backup..." + if ! restore_from_backup "$selected_backup"; then + echo -e "${RED}ERROR: Failed to restore from backup.${NC}" + exit 1 + fi + + # Step 7: Verify container version + echo "" + echo "[INFO] [ROLLBACK] Step 7: Verifying container version..." + local verify_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') + + if [ "$verify_version" != "$backup_version" ]; then + echo -e "${RED}ERROR: Version verification failed. 
Expected: $backup_version, Found: $verify_version${NC}" + exit 1 + fi + + # Audit log end + local rollback_end=$(date -Iseconds) + echo "[AUDIT] Rollback operation completed at: $rollback_end" + echo "[AUDIT] Rolled back from version $current_version to $backup_version" + + echo "" + echo -e "${GREEN}================================================================================${NC}" + echo -e "${GREEN} ROLLBACK COMPLETED SUCCESSFULLY${NC}" + echo -e "${GREEN}================================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Omnia core has been rolled back to version $backup_version${NC}" + echo -e "${GREEN}✓ Container is running and healthy${NC}" + echo -e "${GREEN}✓ Configuration restored from backup${NC}" + echo "" + + # Initialize SSH config and start container session + init_ssh_config + start_container_session +} + # Main function to check if omnia_core container is already running. # If yes, ask the user if they want to enter the container or reinstall. # If no, set it up. 
@@ -1504,6 +1810,9 @@ main() { --upgrade) upgrade_omnia_core ;; + --rollback) + rollback_omnia_core + ;; --version|-v) display_version ;; From 46c63c095c51a3f2df5097a3b9739e61e7b8b6ad Mon Sep 17 00:00:00 2001 From: pullan1 Date: Wed, 11 Feb 2026 18:06:48 +0530 Subject: [PATCH 133/172] cleanup of files under offline_repo dir during pulp cleanup Signed-off-by: pullan1 --- common/library/modules/pulp_cleanup.py | 104 ++++++++++++++++++++++--- local_repo/pulp_cleanup.yml | 2 + 2 files changed, 95 insertions(+), 11 deletions(-) diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index 00ed27d0dd..f3da3e2004 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -27,6 +27,7 @@ import csv import glob import json +import shutil import subprocess from typing import Dict, List, Any, Tuple @@ -399,7 +400,7 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) return False, f"Pulp deletion error: {str(e)}" -def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: +def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: """Cleanup a pip module from Pulp Python repository. 
Pip modules are stored as: pip_module== @@ -408,6 +409,7 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: result = {"name": name, "type": "pip_module", "status": "Failed", "message": ""} messages = [] pulp_deleted = False + content_removed = False try: # Pulp Python repo name format: pip_module @@ -467,11 +469,17 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: messages.append("Status files updated") mark_software_partial(affected, base_path, logger, 'pip_module') - if pulp_deleted: + # Clean up uploaded content from filesystem + fs_result = cleanup_content_directory(name, 'pip_module', repo_store_path, logger) + if fs_result["status"] == "Success": + content_removed = True + messages.append(fs_result["message"]) + + if pulp_deleted or content_removed: result["status"] = "Success" result["message"] = "; ".join(messages) if messages else "Cleaned up" else: - result["message"] = f"pip_module '{name}' not found in Pulp" + result["message"] = f"pip_module '{name}' not found in Pulp or filesystem" except Exception as e: result["message"] = f"Error: {str(e)}" @@ -493,7 +501,7 @@ def get_pulp_file_repo_name(name: str, file_type: str) -> str: return name -def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -> Dict[str, Any]: +def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: """Cleanup artifact from Pulp File repository. 
Handles: tarball, git, manifest, ansible_galaxy_collection @@ -503,6 +511,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - messages = [] pulp_deleted = False status_removed = False + content_removed = False try: # Get the expected Pulp repository name @@ -559,12 +568,18 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - messages.append("Status files updated") mark_software_partial(affected, base_path, logger, file_type) + # Clean up uploaded content from filesystem + fs_result = cleanup_content_directory(name, file_type, repo_store_path, logger) + if fs_result["status"] == "Success": + content_removed = True + messages.append(fs_result["message"]) + # Determine overall result - if pulp_deleted or status_removed: + if pulp_deleted or status_removed or content_removed: result["status"] = "Success" result["message"] = "; ".join(messages) if messages else "Cleaned up" else: - result["message"] = f"{file_type} '{name}' not found in Pulp or status files" + result["message"] = f"{file_type} '{name}' not found in Pulp, status files, or filesystem" except Exception as e: result["message"] = f"Error: {str(e)}" @@ -572,7 +587,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - return result -def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]: +def cleanup_file(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: """Cleanup a file artifact. 
Routes to appropriate handler: @@ -583,10 +598,75 @@ def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]: # Handle pip modules separately - they use Python repositories if file_type == "pip_module": - return cleanup_pip_module(name, base_path, logger) + return cleanup_pip_module(name, base_path, repo_store_path, logger) # All other file types use Pulp File repository - return cleanup_file_repository(name, file_type, base_path, logger) + return cleanup_file_repository(name, file_type, base_path, repo_store_path, logger) + + +# ============================================================================= +# FILESYSTEM CONTENT CLEANUP +# ============================================================================= + +def cleanup_content_directory(content_name: str, content_type: str, repo_store_path: str, logger) -> Dict[str, Any]: + """Remove uploaded content directory from the filesystem. + + Builds the content path the same way as download_common.py: + <repo_store_path>/offline_repo/cluster/<arch>/rhel/<version>/<content_type>/<content_name> + + This mirrors how remove_from_status_files iterates over ARCH_SUFFIXES to + clean status.csv entries.
+ + Args: + content_name: Name of the content item (e.g., 'helm-v3.19.0-amd64') + content_type: Directory category (tarball, git, pip_module, manifest, + ansible_galaxy_collection, rpm_file) + repo_store_path: Root store path (e.g., '/opt/omnia') + logger: Logger instance + + Returns: + Dict with name, type, status, and message keys + """ + result = {"name": content_name, "type": f"filesystem_{content_type}", + "status": "Failed", "message": ""} + removed_dirs = [] + + cluster_path = os.path.join(repo_store_path, "offline_repo", "cluster") + if not os.path.exists(cluster_path): + result["message"] = f"Content store path not found: {cluster_path}" + logger.warning(result["message"]) + return result + + try: + for arch in ARCH_SUFFIXES: + # Walk version directories (e.g., rhel/10.0) + arch_path = os.path.join(cluster_path, arch) + if not os.path.isdir(arch_path): + continue + + for version_dir in glob.glob(f"{arch_path}/rhel/*/"): + content_dir = os.path.join(version_dir, content_type, content_name) + if os.path.exists(content_dir): + logger.info(f"Removing content directory: {content_dir}") + if os.path.isdir(content_dir): + shutil.rmtree(content_dir) + else: + os.remove(content_dir) + removed_dirs.append(content_dir) + + if removed_dirs: + result["status"] = "Success" + result["message"] = f"Removed content: {', '.join(removed_dirs)}" + else: + result["message"] = (f"No filesystem content found for " + f"'{content_name}' under {content_type}") + logger.info(result["message"]) + + except Exception as e: + result["message"] = f"Filesystem cleanup error: {str(e)}" + logger.error(f"Failed to cleanup content {content_name}: {e}") + + return result # ============================================================================= @@ -868,7 +948,8 @@ def run_module(): cleanup_repos=dict(type='list', elements='str', default=[]), cleanup_containers=dict(type='list', elements='str', default=[]), cleanup_files=dict(type='list', elements='str', default=[]), - 
base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT) + base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT), + repo_store_path=dict(type='str', default='/opt/omnia') ), supports_check_mode=True ) @@ -877,6 +958,7 @@ def run_module(): cleanup_containers = module.params['cleanup_containers'] cleanup_files = module.params['cleanup_files'] base_path = module.params['base_path'] + repo_store_path = module.params['repo_store_path'] # Setup logger - setup_standard_logger expects a directory, creates standard.log inside log_dir = os.path.join(base_path, "cleanup") @@ -915,7 +997,7 @@ def run_module(): # Process files for file in cleanup_files: - result = cleanup_file(file, base_path, logger) + result = cleanup_file(file, base_path, repo_store_path, logger) all_results.append(result) logger.info(f"File {file}: {result['status']} - {result['message']}") diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml index 5d409bbc1f..93e379833b 100644 --- a/local_repo/pulp_cleanup.yml +++ b/local_repo/pulp_cleanup.yml @@ -77,6 +77,8 @@ cleanup_repos: "{{ repo_list | default([]) }}" cleanup_containers: "{{ container_list | default([]) }}" cleanup_files: "{{ file_list | default([]) }}" + base_path: "{{ base_path | default('/opt/omnia/log/local_repo') }}" + repo_store_path: "{{ repo_store_path | default('/opt/omnia') }}" register: cleanup_result post_tasks: From e4441cf2cf085afd1a866a10779205cc39ca7795 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 11 Feb 2026 12:37:33 +0000 Subject: [PATCH 134/172] making version and name dynamic --- .../hpc_tools/configure_nvhpc_env.sh.j2 | 18 +++++++++++++++--- .../templates/hpc_tools/export_nvhpc_env.sh.j2 | 17 ++++++++++++++--- .../hpc_tools/install_nvhpc_sdk.sh.j2 | 15 ++++++++------- discovery/roles/slurm_config/vars/main.yml | 11 +++++------ 4 files changed, 42 insertions(+), 19 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 
b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 index d0f788a986..958ac6e27c 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/configure_nvhpc_env.sh.j2 @@ -9,10 +9,22 @@ echo "===== Configuring NVIDIA HPC SDK environment =====" # Cloud-init safe defaults export HOME=/root -NVCOMPILERS="{{ nvhpc_local_mount | default('/opt/nvidia/nvhpc') }}" +NVCOMPILERS="/opt/nvidia/nvhpc" NVARCH="$(uname -s)_$(uname -m)" -NVHPC_LONG_VERSION="{{ nvhpc_version_long | default('2025_2511') }}" -NVHPC_VERSION="$(echo ${NVHPC_LONG_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')" +sys_arch="$(uname -m)" +case "${sys_arch}" in + x86_64|amd64) arch="x86_64" ;; + aarch64|arm64) arch="aarch64" ;; +esac + +# Select package name based on detected architecture (rendered from slurm_config vars) +case "${arch}" in + x86_64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_x86_64 }}" ;; + aarch64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_aarch64 }}" ;; +esac + +# Derive version from package name +NVHPC_VERSION=$(echo "$NVHPC_PKG_NAME" | sed 's/nvhpc_\([0-9]*_[0-9]*\)_Linux_.*/\1/' | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/') NVHPC_BASE="$NVCOMPILERS/$NVARCH/$NVHPC_VERSION" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 index 1ff49968b4..db2a35df60 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/export_nvhpc_env.sh.j2 @@ -5,9 +5,20 @@ CLIENT_MOUNT="{{ client_mount_path }}" NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc" NVARCH="$(uname -s)_$(uname -m)" - -NVHPC_LONG_VERSION="{{ nvhpc_version_long | default('2025_2511') }}" -NVHPC_VERSION="$(echo ${NVHPC_LONG_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')" +sys_arch="$(uname -m)" +case "${sys_arch}" in + 
x86_64|amd64) arch="x86_64" ;; + aarch64|arm64) arch="aarch64" ;; +esac + +# Select package name based on detected architecture (rendered from slurm_config vars) +case "${arch}" in + x86_64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_x86_64 }}" ;; + aarch64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_aarch64 }}" ;; +esac + +# Derive version from package name +NVHPC_VERSION=$(echo "$NVHPC_PKG_NAME" | sed 's/nvhpc_\([0-9]*_[0-9]*\)_Linux_.*/\1/' | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/') NVHPC_BASE="$NVHPC_LOCAL_MOUNT/$NVARCH/$NVHPC_VERSION" PROFILE_FILE="/etc/profile.d/nvhpc.sh" diff --git a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 index 8ff149fca3..dd6a55f3ea 100644 --- a/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 +++ b/discovery/roles/configure_ochami/templates/hpc_tools/install_nvhpc_sdk.sh.j2 @@ -15,14 +15,15 @@ case "${sys_arch}" in ;; esac -NVHPC_VERSION="2025_2511" -NVHPC_SHORT_VERSION="$(echo ${NVHPC_VERSION} | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/')" -CUDA_VERSION="13.0" +# Select package name based on detected architecture (rendered from slurm_config vars) +case "${arch}" in + x86_64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_x86_64 }}" ;; + aarch64) NVHPC_PKG_NAME="{{ nvhpc_pkg_name_aarch64 }}" ;; +esac -NVHPC_PKG_NAME="{{ nvhpc_pkg_name | default('') }}" -if [ -z "${NVHPC_PKG_NAME}" ]; then - NVHPC_PKG_NAME="nvhpc_${NVHPC_VERSION}_Linux_${arch}_cuda_${CUDA_VERSION}" -fi +# Derive version from package name: nvhpc_YYYY_YYMM_Linux_<arch>_cuda_X.Y +NVHPC_VERSION=$(echo "$NVHPC_PKG_NAME" | sed 's/nvhpc_\([0-9]*_[0-9]*\)_Linux_.*/\1/') +NVHPC_SHORT_VERSION=$(echo "$NVHPC_VERSION" | cut -d'_' -f2 | sed 's/\(..\)\(..\)/\1.\2/') NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk" NVHPC_MOUNT="/shared-nvhpc-sdk" diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index
8406816341..22c0d7b11b 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -159,13 +159,12 @@ ssh_private_key_path: /root/.ssh/oim_rsa # nvidia sdk vars # Fully resolved tarball relative paths (no nested Jinja2) -nvhpc_tarball_x86_64_relpath: > - offline_repo/cluster/x86_64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_x86_64_cuda_13.0/ - nvhpc_2025_2511_Linux_x86_64_cuda_13.0.tar.gz +# nvidia sdk vars +nvhpc_pkg_name_x86_64: "nvhpc_2025_2511_Linux_x86_64_cuda_13.0" +nvhpc_pkg_name_aarch64: "nvhpc_2025_2511_Linux_aarch64_cuda_13.0" -nvhpc_tarball_aarch64_relpath: > - offline_repo/cluster/aarch64/rhel/10.0/tarball/nvhpc_2025_2511_Linux_aarch64_cuda_13.0/ - nvhpc_2025_2511_Linux_aarch64_cuda_13.0.tar.gz +nvhpc_tarball_x86_64_relpath: "offline_repo/cluster/x86_64/rhel/10.0/tarball/{{ nvhpc_pkg_name_x86_64 }}/{{ nvhpc_pkg_name_x86_64 }}.tar.gz" +nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/10.0/tarball/{{ nvhpc_pkg_name_aarch64 }}/{{ nvhpc_pkg_name_aarch64 }}.tar.gz" nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk" From e005855e702316bc1942848e4dd7dc8a600b1437 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 11 Feb 2026 12:39:51 +0000 Subject: [PATCH 135/172] adding the min and sec --- common/library/module_utils/local_repo/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 6196e8c7e6..6997233fe6 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -102,10 +102,10 @@ } CLI_FILE_PATH = "/root/.config/pulp/cli.toml" -POST_TIMEOUT = 3600 -TAR_POLL_VAL = 25 -FILE_POLL_VAL = 1 -ISO_POLL_VAL = 15 +POST_TIMEOUT = 3600 # seconds +TAR_POLL_VAL = 25 # minutes +FILE_POLL_VAL = 1 # minutes +ISO_POLL_VAL = 15 # minutes FILE_URI = "/pulp/api/v3/content/file/files/" PULP_SSL_CA_CERT = 
"/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" # ---------------------------- From 1af974feb43bc62f3427ebabdfd0ae2bda959180 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 11 Feb 2026 18:16:15 +0530 Subject: [PATCH 136/172] Update ci-group-slurm_node_x86_64.yaml.j2 Signed-off-by: sakshi-singla-1735 --- .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 88eeb45e16..7ee17aa10d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -494,8 +494,8 @@ - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab - mount -a - - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." - - /usr/local/bin/configure_ucx_openmpi_env.sh + # - echo "One or more shared components (UCX / OpenMPI / LDMS) are enabled." 
+ # - /usr/local/bin/configure_ucx_openmpi_env.sh {% endif %} From 5be8766e5f45816a78add59a81db98a8e0704746 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Thu, 12 Feb 2026 05:08:14 +0000 Subject: [PATCH 137/172] slurmdbd innodb fix --- .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 | 4 +++- .../templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 | 1 + .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 1 + discovery/roles/slurm_config/vars/main.yml | 4 ++-- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 49d2f635e9..4d8aa716cd 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -340,7 +340,9 @@ chown -R {{ mysql_user }}:{{ mysql_group }} /var/lib/mysql chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/mariadb chown -R {{ slurm_user }}:{{ slurm_user }} /etc/my.cnf.d # Required? why slurm user for my.cnf?? 
- chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb + chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb /var/log/slurm + #firewall systemctl enable firewalld systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 3dc8f65514..7931fc70c4 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -108,6 +108,7 @@ bash /gpu-runfile/{{ cuda_runfile_aarch64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then echo "[SUCCESS] NVIDIA driver installed successfully." + nvidia-smi -pm 1 else echo "[ERROR] NVIDIA driver installation failed." fi diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 62a4e9e063..04cec708e1 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -116,6 +116,7 @@ bash /gpu-runfile/{{ cuda_runfile_x86_64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then echo "[SUCCESS] NVIDIA driver installed successfully." + nvidia-smi -pm 1 else echo "[ERROR] NVIDIA driver installation failed." 
fi diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 43ee995e5a..3afa329923 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -66,8 +66,8 @@ gpu_slurm_conf: SelectType: select/cons_tres SelectTypeParameters: CR_Core_Memory SlurmdParameters: l3cache_as_socket -innodb_buffer_pool_size: 1G -innodb_lock_wait_timeout: 120 +innodb_buffer_pool_size: 4G +innodb_lock_wait_timeout: 900 # TODO tmp nodes_yaml: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" bmc_username: "{{ hostvars['localhost']['bmc_username'] }}" From 4a31bc3d2e2217d049643f9cef1b1286b4caada7 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Thu, 12 Feb 2026 05:32:29 +0000 Subject: [PATCH 138/172] variablize the dir --- .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 4d8aa716cd..b523eeb297 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -340,8 +340,8 @@ chown -R {{ mysql_user }}:{{ mysql_group }} /var/lib/mysql chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/mariadb chown -R {{ slurm_user }}:{{ slurm_user }} /etc/my.cnf.d # Required? why slurm user for my.cnf?? 
- chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb /var/log/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }} + chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }} #firewall systemctl enable firewalld From 7ef0c3153135cfdd1d82b59f09ceb9bcc30da584 Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Thu, 12 Feb 2026 11:44:15 +0530 Subject: [PATCH 139/172] removing doca-ofed from nfs share Signed-off-by: Katakam-Rakesh --- .../templates/doca-ofed/doca-install.sh.j2 | 3 --- discovery/roles/k8s_config/vars/main.yml | 13 ++----------- discovery/roles/slurm_config/vars/main.yml | 12 ++---------- 3 files changed, 4 insertions(+), 24 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 index 111abcb3a1..db8a7cb9cc 100644 --- a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 +++ b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 @@ -44,9 +44,6 @@ else dnf install -y kernel-headers-$(uname -r) fi -echo "Bootstrap doca-ofed package..." -rpm -i "/var/lib/packages/${arch}/doca-ofed/doca-host-3.2.1-044000_25.10_rhel10.${arch}.rpm" - echo "Installing doca-ofed..." if rpm -q doca-ofed >/dev/null 2>&1; then echo "doca-ofed package is already installed." 
diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml index 433b8e9f76..a80fb9b257 100644 --- a/discovery/roles/k8s_config/vars/main.yml +++ b/discovery/roles/k8s_config/vars/main.yml @@ -78,19 +78,10 @@ packages_base_dir_aarch64: "{{ k8s_client_mount_path }}/packages/aarch64" offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" packages_layout_x86_64: - - doca-ofed - cuda packages_layout_aarch64: - - doca-ofed - cuda print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" -offline_path_x86_64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" -offline_path_aarch64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" - +offline_path_x86_64: [] +offline_path_aarch64: [] ssh_private_key_path: /root/.ssh/oim_rsa diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 43ee995e5a..3616b55068 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -141,19 +141,11 @@ packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64" offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" packages_layout_x86_64: - - doca-ofed - cuda packages_layout_aarch64: - - doca-ofed - cuda print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" -offline_path_x86_64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_x86_64 
}}/doca-ofed" -offline_path_aarch64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" +offline_path_x86_64: [] +offline_path_aarch64: [] ssh_private_key_path: /root/.ssh/oim_rsa From 346667afa1e87aafb623bc66fcf0d41b3959d67a Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Thu, 12 Feb 2026 06:26:30 +0000 Subject: [PATCH 140/172] slurmdbd restart in controller --- .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index b523eeb297..d5f9ef9ba6 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -558,4 +558,6 @@ - /root/ldms_sampler.sh {% endif %} + - systemctl restart slurmdbd + - systemctl restart slurmctld - echo "Cloud-Init has completed successfully." 
From b4f064ee0d7feed5bf0b3bd6233e992a5bd133e1 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 15:52:26 +0530 Subject: [PATCH 141/172] Upgrade of input credential files to 2.1 --- .../tasks/display_warnings.yml | 53 ++++++ .../import_input_parameters/tasks/main.yml | 12 ++ .../restore_omnia_config_credentials.yml | 171 ++++++++++++++++++ .../restore_user_registry_credential.yml | 130 +++++++++++++ .../tasks/set_backup_location.yml | 33 ++++ .../templates/omnia_config_credentials.yml.j2 | 48 +++++ .../import_input_parameters/vars/main.yml | 66 ++++++- 7 files changed, 512 insertions(+), 1 deletion(-) create mode 100644 upgrade/roles/import_input_parameters/tasks/display_warnings.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/set_backup_location.yml create mode 100644 upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml new file mode 100644 index 0000000000..ac1eb69998 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -0,0 +1,53 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Display collected warnings + ansible.builtin.debug: + msg: | + ================================= + UPGRADE WARNINGS SUMMARY + ================================= + + {% if upgrade_warnings | length > 0 %} + {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected. + You will now be shown the detailed list. + {% else %} + No warnings detected. Upgrade completed successfully! + {% endif %} + when: upgrade_warnings is defined + + +- name: Pause for user to review warnings + ansible.builtin.pause: + prompt: | + ╔════════════════════════════════════════════╗ + ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ + ╚════════════════════════════════════════════╝ + + {% if upgrade_warnings | length > 0 %} + {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected: + + {% for warning in upgrade_warnings %} + {{ loop.index }}. {{ warning }} + {% endfor %} + + Please review these warnings carefully. + Press ENTER to continue or CTRL+C to abort. + {% else %} + No warnings detected. Upgrade completed successfully! + + Press ENTER to continue... + {% endif %} + when: upgrade_warnings is defined diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml index ff77cf2c0e..2aacba7451 100644 --- a/upgrade/roles/import_input_parameters/tasks/main.yml +++ b/upgrade/roles/import_input_parameters/tasks/main.yml @@ -13,6 +13,9 @@ # limitations under the License. 
--- +- name: Set backup location based on oim_metadata.yml + ansible.builtin.include_tasks: set_backup_location.yml + - name: Validate backup location for upgrade input processing ansible.builtin.include_tasks: precheck_backup_location.yml @@ -39,3 +42,12 @@ - name: Restore input files from backup ansible.builtin.include_tasks: restore_input_files.yml + +- name: Restore user_registry_credential.yml from backup + ansible.builtin.include_tasks: restore_user_registry_credential.yml + +- name: Restore omnia_config_credentials.yml from backup + ansible.builtin.include_tasks: restore_omnia_config_credentials.yml + +- name: Display upgrade warnings summary + ansible.builtin.include_tasks: display_warnings.yml diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml new file mode 100644 index 0000000000..0abafee26b --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -0,0 +1,171 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check if backup omnia_config_credentials.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/omnia_config_credentials.yml" + register: backup_omnia_config_credentials_stat + +- name: Check if backup omnia_config_credentials_key exists + ansible.builtin.stat: + path: "{{ backup_location }}/.omnia_config_credentials_key" + register: backup_omnia_config_credentials_key_stat + +- name: Add warning for missing omnia_config_credentials.yml to list + ansible.builtin.set_fact: + upgrade_warnings: >- + {{ upgrade_warnings + [msg_omnia_config_credentials_missing] }} + when: + - not backup_omnia_config_credentials_stat.stat.exists + - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))" + +- name: Process omnia_config_credentials.yml when present in backup + block: + - name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/omnia_config_credentials.yml" + register: backup_omnia_config_credentials_content + changed_when: false + failed_when: false + no_log: true + + - name: "Case 1: Key present and file encrypted - Process and update" + block: + - name: Copy encrypted omnia_config_credentials.yml from backup to temp location + ansible.builtin.copy: + src: "{{ backup_location }}/omnia_config_credentials.yml" + dest: "{{ input_project_dir }}/omnia_config_credentials.yml.tmp" + mode: '0600' + remote_src: true + + - name: Copy omnia_config_credentials_key from backup + ansible.builtin.copy: + src: "{{ backup_location }}/.omnia_config_credentials_key" + dest: "{{ input_project_dir }}/.omnia_config_credentials_key" + mode: '0600' + remote_src: true + + - name: Decrypt omnia_config_credentials.yml using the key + ansible.builtin.shell: + cmd: | + ansible-vault decrypt "{{ input_project_dir }}/omnia_config_credentials.yml.tmp" \ + --vault-password-file "{{ input_project_dir }}/.omnia_config_credentials_key" \ + --output "{{ input_project_dir 
}}/omnia_config_credentials.yml.decrypted" + args: + executable: /bin/bash + no_log: true + register: vault_decrypt_result + failed_when: vault_decrypt_result.rc != 0 + + - name: Read decrypted content + ansible.builtin.slurp: + src: "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + register: decrypted_content + no_log: true + + - name: Parse YAML content and extract credentials + ansible.builtin.set_fact: + credentials_dict: >- + {{ decrypted_content.content | b64decode | from_yaml }} + no_log: true + + rescue: + - name: Fail with decryption error message + ansible.builtin.fail: + msg: "{{ msg_omnia_config_decrypt_error }}" + + - name: "Case 1.1: Apply template and encrypt" + block: + - name: Set template variables from credentials + ansible.builtin.set_fact: + provision_password: "{{ credentials_dict.provision_password | default('') }}" + bmc_username: "{{ credentials_dict.bmc_username | default('') }}" + bmc_password: "{{ credentials_dict.bmc_password | default('') }}" + minio_s3_password: "{{ credentials_dict.minio_s3_password | default('') }}" + pulp_password: "{{ credentials_dict.pulp_password | default('') }}" + docker_username: "{{ credentials_dict.docker_username | default('') }}" + docker_password: "{{ credentials_dict.docker_password | default('') }}" + slurm_db_password: "{{ credentials_dict.slurm_db_password | default('') }}" + openldap_db_username: "{{ credentials_dict.openldap_db_username | default('') }}" + openldap_db_password: "{{ credentials_dict.openldap_db_password | default('') }}" + mysqldb_user: "{{ credentials_dict.mysqldb_user | default('') }}" + mysqldb_password: "{{ credentials_dict.mysqldb_password | default('') }}" + mysqldb_root_password: "{{ credentials_dict.mysqldb_root_password | default('') }}" + csi_username: "{{ credentials_dict.csi_username | default('') }}" + csi_password: "{{ credentials_dict.csi_password | default('') }}" + ldms_sampler_password: "{{ credentials_dict.ldms_sampler_password | default('') }}" + 
no_log: true + + - name: Write updated content using template + ansible.builtin.template: + src: omnia_config_credentials.yml.j2 + dest: "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + mode: '0600' + no_log: true + + - name: Encrypt updated file using the same key + ansible.builtin.shell: + cmd: | + ansible-vault encrypt "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" \ + --vault-password-file "{{ input_project_dir }}/.omnia_config_credentials_key" \ + --output "{{ input_project_dir }}/omnia_config_credentials.yml" + args: + executable: /bin/bash + no_log: true + register: vault_encrypt_result + failed_when: vault_encrypt_result.rc != 0 + + - name: Clean up temporary files + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - "{{ input_project_dir }}/omnia_config_credentials.yml.tmp" + - "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + + - name: Display success message + ansible.builtin.debug: + msg: "{{ msg_omnia_config_credentials_success }}" + + rescue: + - name: Fail with template/encryption error message + ansible.builtin.fail: + msg: "{{ msg_omnia_config_template_error }}\n{{ msg_omnia_config_encrypt_error }}" + when: >- + backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout + + - name: "Case 2: Both key and file missing - Add info warning" + ansible.builtin.set_fact: + upgrade_warnings: >- + {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }} + when: >- + not backup_omnia_config_credentials_key_stat.stat.exists and + (backup_omnia_config_credentials_content.stdout is not defined or + '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) and + "'INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key' not in (upgrade_warnings | join(' '))" + + - name: "Case 3: Error - Mismatched state" + 
ansible.builtin.fail: + msg: "{{ msg_omnia_config_credentials_error }}" + when: >- + (not backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout) or + (backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) + when: backup_omnia_config_credentials_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml new file mode 100644 index 0000000000..de337310b8 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -0,0 +1,130 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+---
+
+- name: Check if backup user_registry_credential.yml exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/user_registry_credential.yml"
+  register: backup_user_registry_credential_stat
+
+- name: Check if user_registry_credential.yml exists in current directory
+  ansible.builtin.stat:
+    path: "{{ input_project_dir }}/user_registry_credential.yml"
+  register: user_registry_credential_stat
+
+- name: Check if backup local_repo_credentials_key exists
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/.local_repo_credentials_key"
+  register: backup_local_repo_credentials_key_stat
+
+- name: Add warning for missing user_registry_credential.yml to list
+  ansible.builtin.set_fact:
+    upgrade_warnings: >-
+      {{ upgrade_warnings + [
+        "WARNING: user_registry_credential.yml not found in backup at " +
+        backup_location + "/user_registry_credential.yml. " +
+        "This might be due to complete Omnia execution not being completed. " +
+        "Skipping restoration of this file."
+      ] }}
+  when:
+    - not backup_user_registry_credential_stat.stat.exists
+    - "'WARNING: user_registry_credential.yml not found in backup at' not in (upgrade_warnings | join(' '))"
+
+- name: Process user_registry_credential.yml when present in backup
+  block:
+    - name: Check if backup file is encrypted
+      ansible.builtin.command:
+        cmd: cat "{{ backup_location }}/user_registry_credential.yml"
+      register: backup_user_registry_content
+      changed_when: false
+      failed_when: false
+      no_log: true
+
+    - name: "Case 1: Key present and file encrypted - Copy both"
+      block:  # verify the backup pair first so a mismatched key never reaches the target directory
+        - name: Verify backup user_registry_credential.yml decrypts with the backup key
+          ansible.builtin.shell:
+            cmd: |
+              ansible-vault decrypt "{{ backup_location }}/user_registry_credential.yml" \
+                --vault-password-file "{{ backup_location }}/.local_repo_credentials_key" \
+                --output - > /dev/null
+          args:
+            executable: /bin/bash
+          no_log: true  # decrypted credentials must never be logged or written to disk
+          register: vault_decrypt_result
+          failed_when: vault_decrypt_result.rc != 0
+
+        - name: Copy encrypted user_registry_credential.yml from backup
+          ansible.builtin.copy:
+            src: "{{ backup_location }}/user_registry_credential.yml"
+            dest: "{{ input_project_dir }}/user_registry_credential.yml"
+            mode: '0600'
+            remote_src: true
+
+        - name: Copy local_repo_credentials_key from backup
+          ansible.builtin.copy:
+            src: "{{ backup_location }}/.local_repo_credentials_key"
+            dest: "{{ input_project_dir }}/.local_repo_credentials_key"
+            mode: '0600'
+            remote_src: true
+
+        - name: Display success message for encrypted file restoration
+          ansible.builtin.debug:
+            msg: |
+              user_registry_credential.yml restored from backup.
+              Backup: {{ backup_location }}/user_registry_credential.yml
+              Target: {{ input_project_dir }}/user_registry_credential.yml
+              Status: Encrypted (key file also restored)
+      rescue:
+        - name: Fail with decryption error message
+          ansible.builtin.fail:
+            msg: "{{ msg_user_registry_decrypt_error }}"
+      when: >-
+        backup_local_repo_credentials_key_stat.stat.exists and
+        backup_user_registry_content.stdout is defined and
+        '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout
+
+    - name: "Case 2: Both key and file missing - Add info warning"
+      ansible.builtin.set_fact:
+        upgrade_warnings: >-
+          {{ upgrade_warnings + [
+            "INFO: Both user_registry_credential.yml and .local_repo_credentials_key " +
+            "are not present in backup. This is expected if registry credentials " +
+            "were not configured in the source installation."
+          ] }}
+      when: >-
+        not backup_local_repo_credentials_key_stat.stat.exists and
+        (backup_user_registry_content.stdout is not defined or
+        '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and
+        'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))
+
+    - name: "Case 3: Error - Mismatched state"
+      ansible.builtin.fail:
+        msg: |
+          ERROR: Inconsistent state detected for user_registry_credential.yml:
+          {% if not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %}
+          - File is encrypted but key file (.local_repo_credentials_key) is missing
+          {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %}
+          - Key file exists but file is not encrypted
+          {% endif %}
+          Please check the backup integrity and ensure both files are present
+          in consistent states.
+      when: >-
+        (not backup_local_repo_credentials_key_stat.stat.exists and
+        backup_user_registry_content.stdout is defined and
+        '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or
+        (backup_local_repo_credentials_key_stat.stat.exists and
+        backup_user_registry_content.stdout is defined and
+        '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout)
+  when: backup_user_registry_credential_stat.stat.exists
diff --git a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml
new file mode 100644
index 0000000000..4f6a96e83f
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml
@@ -0,0 +1,33 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Read oim_metadata.yml to get upgrade_backup_dir + ansible.builtin.slurp: + src: /opt/omnia/.data/oim_metadata.yml + register: oim_metadata_slurp + +- name: Parse oim_metadata.yml + ansible.builtin.set_fact: + oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}" + +- name: Set backup_location from metadata + ansible.builtin.set_fact: + backup_location: "{{ oim_metadata.upgrade_backup_dir }}/input/project_default" + when: oim_metadata.upgrade_backup_dir is defined + +- name: Fail if upgrade_backup_dir is not defined in metadata + ansible.builtin.fail: + msg: "{{ msg_upgrade_backup_dir_missing }}" + when: oim_metadata.upgrade_backup_dir is not defined diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 new file mode 100644 index 0000000000..4b3b63d8c7 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 @@ -0,0 +1,48 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Provision credentials +provision_password: "{{ provision_password | default('') }}" +bmc_username: "{{ bmc_username | default('') }}" +bmc_password: "{{ bmc_password | default('') }}" + +# Prepare_oim credentials +minio_s3_password: "{{ minio_s3_password | default('') }}" +pulp_password: "{{ pulp_password | default('') }}" +docker_username: "{{ docker_username | default('') }}" +docker_password: "{{ docker_password | default('') }}" + +# Omnia credentials +slurm_db_password: "{{ slurm_db_password | default('') }}" + +# Security credentials +openldap_db_username: "{{ openldap_db_username | default('') }}" +openldap_db_password: "{{ openldap_db_password | default('') }}" + +# iDrac Telemetry credentials +mysqldb_user: "{{ mysqldb_user | default('') }}" +mysqldb_password: "{{ mysqldb_password | default('') }}" +mysqldb_root_password: "{{ mysqldb_root_password | default('') }}" + +# csi powerscale credentials +csi_username: "{{ csi_username | default('') }}" +csi_password: "{{ csi_password | default('') }}" + +# LDMS sampler +ldms_sampler_password: "{{ ldms_sampler_password | default('') }}" + +# postgres credentials +postgres_user: "{{ postgres_user | default('') }}" +postgres_password: "{{ postgres_password | default('') }}" diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index c27f111cde..5eee4a2f50 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -13,18 +13,82 @@ # limitations under the License. 
--- -backup_location: /opt/omnia/backups/upgrade/input/project_default +# backup_location will be set from oim_metadata.yml upgrade_backup_dir +# Format: /opt/omnia/backups/upgrade/version_2.0.0.0/input/project_default +backup_location: "" backup_dir_mode: '0755' default_file_mode: '0644' +# List to collect warnings during execution +upgrade_warnings: [] + # Precheck backup location messages msg_backup_location_missing: "backup_location must be provided" +msg_upgrade_backup_dir_missing: "upgrade_backup_dir not found in /opt/omnia/.data/oim_metadata.yml" # Restore input files messages msg_restore_item_name_missing: "restore_item must define 'name'" msg_validation_failed: "Validation failed for {{ restore_item.name }}" msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" +msg_user_registry_credential_missing: |- + \033[93mWARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml\033[0m + This might be due to complete Omnia execution not being completed. + Skipping restoration of this file. + +# Omnia config credentials messages +msg_omnia_config_credentials_missing: |- + WARNING: omnia_config_credentials.yml not found in backup at {{ backup_location }}/omnia_config_credentials.yml. + This might be due to complete Omnia execution not being completed. + Skipping restoration of this file. + +msg_omnia_config_credentials_info_missing: |- + INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key + are not present in backup. This is expected if credentials + were not configured in the source installation. + +msg_omnia_config_credentials_success: |- + omnia_config_credentials.yml restored and updated from backup. 
+ Backup: {{ backup_location }}/omnia_config_credentials.yml + Target: {{ input_project_dir }}/omnia_config_credentials.yml + Status: Updated with postgres credentials and re-encrypted (key file also restored) + +msg_omnia_config_credentials_error: |- + ERROR: Inconsistent state detected for omnia_config_credentials.yml: + {% if not backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %} + - File is encrypted but key file (.omnia_config_credentials_key) is missing + {% elif backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %} + - Key file exists but file is not encrypted + {% endif %} + Please check the backup integrity and ensure both files are present + in consistent states. + +# Rescue warning messages +msg_user_registry_decrypt_error: |- + ERROR: Failed to decrypt user_registry_credential.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file + matches the encrypted file. + +msg_omnia_config_decrypt_error: |- + ERROR: Failed to decrypt omnia_config_credentials.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file + matches the encrypted file. + +msg_omnia_config_template_error: |- + ERROR: Failed to generate updated omnia_config_credentials.yml. + Template processing may have failed due to invalid data format. + Please check the backup file format and ensure it contains valid YAML. + +msg_omnia_config_encrypt_error: |- + ERROR: Failed to encrypt updated omnia_config_credentials.yml. + The key file may be corrupted or there may be permission issues. + Please check the key file integrity and file permissions. + +msg_decryption_failed: "Decryption failed. 
Check warnings for details." +msg_template_failed: "Template processing failed. Check warnings for details." +msg_encryption_failed: "Encryption failed. Check warnings for details." # Network spec transformation messages msg_backup_network_spec_missing: "Backup network_spec.yml missing" From d3b9c749b5096eaa4ca708def872e51ad38e1ed4 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Thu, 12 Feb 2026 16:16:44 +0530 Subject: [PATCH 142/172] Added new package type rpm_repo Signed-off-by: pullan1 --- .../input_validation/common_utils/config.py | 1 + .../library/module_utils/local_repo/config.py | 2 +- .../local_repo/parse_and_download.py | 183 ++++++++++++------ .../module_utils/local_repo/software_utils.py | 6 +- common/library/modules/parallel_tasks.py | 163 ++++++++++------ common/library/modules/pulp_cleanup.py | 177 +++++++++++------ local_repo/pulp_cleanup.yml | 13 +- 7 files changed, 354 insertions(+), 191 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index e6e8a09042..0f369f3950 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -147,6 +147,7 @@ "rpm": ["package", "repo_name"], "rpm_list": ["package_list", "repo_name"], "rpm_file": ["package", "url"], + "rpm_repo": ["package", "repo_name"], "ansible_galaxy_collection": ["package", "version"], "git": ["package", "version", "url"], "image": ["package", ["tag", "digest"]], # Special: one of tag or digest diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 0518e2bb01..cfc3b20c9d 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -51,7 +51,7 @@ # Used by software_utils.py # ---------------------------- PACKAGE_TYPES = ['rpm', 'deb', 'tarball', 'image', 'manifest', 
'git', - 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file'] + 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file', 'rpm_repo'] CSV_COLUMNS = {"column1": "name", "column2": "status"} SOFTWARE_CONFIG_SUBDIR = "config" RPM_LABEL_TEMPLATE = "RPMs for {key}" diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 367f9561f5..72efd4566b 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. # pylint: disable=import-error,no-name-in-module +""" +Utility functions for parsing and downloading artifacts. + +This module provides common functions for command execution, status file management, +and repository operations used across the local repo management system. 
+""" + import os import subprocess import json import re from multiprocessing import Lock -from ansible.module_utils.local_repo.standard_logger import setup_standard_logger +from ansible.module_utils.local_repo.config import ARCH_SUFFIXES, STATUS_CSV_HEADER def mask_sensitive_data(cmd_string): @@ -57,35 +64,87 @@ def execute_command(cmd_string, logger, type_json=False): stderr=subprocess.PIPE, shell=True, ) - - status["returncode"] = cmd.returncode - status["stdout"] = cmd.stdout.strip() if cmd.stdout else None - status["stderr"] = cmd.stderr.strip() if cmd.stderr else None - - if cmd.returncode != 0: - logger.error(f"Command failed with return code {cmd.returncode}") - logger.error(f"Error: {status['stderr']}") - return False - - if type_json and status["stdout"]: - try: - status["stdout"] = json.loads(status["stdout"]) - except json.JSONDecodeError as error: - logger.error(f"Failed to parse JSON output: {error}") - return False - - return status - - except Exception as error: - logger.error(f"Error executing command: {error}") + logger.info(f"Command succeeded: {cmd_string}") + return True + except subprocess.CalledProcessError as e: + logger.error(f"Command failed: {cmd_string} - {e}") + return False + except subprocess.TimeoutExpired as e: + logger.error(f"Command timed out: {cmd_string} - {e}") + return False + except OSError as e: + logger.error(f"OS error during command: {cmd_string} - {e}") return False finally: logger.info("#" * 30 + f" {execute_command.__name__} end " + "#" * 30) +def get_arch_from_status_path(status_file_path): + """Extract architecture from status file path. 
+ + Args: + status_file_path: Path like '/opt/omnia/log/local_repo/x86_64/software_name/status.csv' + + Returns: + str: Architecture ('x86_64' or 'aarch64') or None if not found + """ + for arch in ARCH_SUFFIXES: + if f"/{arch}/" in status_file_path: + return arch + return None + +def _prefix_repo_name_with_arch(repo_name: str, status_file_path: str, logger) -> str: + """Add architecture prefix to repo_name if not already present. + + Args: + repo_name: Repository name to prefix + status_file_path: Path to extract architecture from + logger: Logger instance + + Returns: + str: Repository name with architecture prefix + """ + if not repo_name: + return repo_name + + arch = get_arch_from_status_path(status_file_path) + if arch and not any(repo_name.startswith(f"{prefix}_") for prefix in ARCH_SUFFIXES): + prefixed_name = f"{arch}_{repo_name}" + logger.info(f"Auto-prefixed repo_name with architecture: {prefixed_name}") + return prefixed_name + return repo_name + + +def _update_existing_line(line: str, package_name: str, package_type: str, status: str, repo_name: str, status_file_path: str) -> str: + """Update an existing line in status file. 
+ + Args: + line: Existing line content + package_name: Package name to match + package_type: Package type + status: New status + repo_name: Repository name + status_file_path: Path for architecture extraction + + Returns: + str: Updated line content + """ + parts = line.strip().split(',') + if len(parts) >= 4: + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + parts[2] = final_repo_name if final_repo_name else '' + parts[3] = status + return ','.join(parts) + '\n' + + # Handle short lines + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + return f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n" + + def write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock: Lock, repo_name=None): """ - Writes or updates the status of a package in the status file, using a lock to ensure safe access across processes. + Writes or updates the status of a package in the status file. 
+ Args: status_file_path: Path to the status file package_name: Name of the package @@ -97,44 +156,56 @@ def write_status_to_file(status_file_path, package_name, package_type, status, l """ logger.info("#" * 30 + f" {write_status_to_file.__name__} start " + "#" * 30) + # Auto-prefix repo_name with architecture if needed + repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, logger) + try: with file_lock: # Ensure only one process can write at a time if os.path.exists(status_file_path): - with open(status_file_path, "r") as f: - lines = f.readlines() - - updated = False - with open(status_file_path, "w") as f: - # Write header (new files always have repo_name column) - if lines: - f.write(lines[0]) # Keep existing header - - # Write data lines - for line in lines[1:]: # Skip header - if line.startswith(f"{package_name},"): - # f.write(f"{package_name},{package_type},{status}\n") - # Update existing line with repo_name (order: name,type,repo_name,status) - parts = line.strip().split(',') - if len(parts) >= 4: - parts[2] = repo_name if repo_name else '' - parts[3] = status - f.write(','.join(parts) + '\n') - else: - f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") - updated = True - else: - f.write(line) - - if not updated: - f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") + _update_existing_file(status_file_path, package_name, package_type, status, repo_name) else: - with open(status_file_path, "w") as f: - f.write(STATUS_CSV_HEADER) - f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") + _create_new_file(status_file_path, package_name, package_type, status, repo_name) logger.info(f"Status written to {status_file_path} for {package_name}.") - except Exception as e: + except OSError as e: logger.error(f"Failed to write to status file: {status_file_path}. Error: {str(e)}") - raise RuntimeError(f"Failed to write to status file: {status_file_path}. 
Error: {str(e)}") + raise RuntimeError( + f"Failed to write to status file: {status_file_path}. Error: {str(e)}" + ) from e finally: logger.info("#" * 30 + f" {write_status_to_file.__name__} end " + "#" * 30) + + +def _update_existing_file(status_file_path, package_name, package_type, status, repo_name): + """Update existing status file with new package status.""" + with open(status_file_path, "r", encoding='utf-8') as f: + lines = f.readlines() + + updated = False + with open(status_file_path, "w", encoding='utf-8') as f: + # Write header + if lines: + f.write(lines[0]) + + # Write data lines + for line in lines[1:]: # Skip header + if line.startswith(f"{package_name},"): + updated_line = _update_existing_line( + line, package_name, package_type, status, repo_name, status_file_path + ) + f.write(updated_line) + updated = True + else: + f.write(line) + + if not updated: + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + f.write(f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n") + + +def _create_new_file(status_file_path, package_name, package_type, status, repo_name): + """Create new status file with package status.""" + with open(status_file_path, "w", encoding='utf-8') as f: + f.write(STATUS_CSV_HEADER) + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + f.write(f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n") diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index a915f25f8b..3e06ddc7cd 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -179,7 +179,7 @@ def transform_package_dict(data, arch_val,logger): repo_mapping = {} for item in items: - if item.get("type") == "rpm": + if item.get("type") in ("rpm", "rpm_repo"): rpm_packages.append(item["package"]) # 
Preserve repo_name if available if "repo_name" in item: @@ -832,7 +832,7 @@ def remove_duplicates_from_trans(trans): if group == "default_packages": # Handle nested rpm_list case for pkg in items: - if pkg.get("type") == "rpm" and "rpm_list" in pkg: + if pkg.get("type") in ("rpm", "rpm_repo") and "rpm_list" in pkg: pkg["rpm_list"] = list(dict.fromkeys(pkg["rpm_list"])) continue @@ -856,7 +856,7 @@ def remove_duplicates_from_trans(trans): elif type_ == "git": key = (item.get("url"), item.get("version")) - elif type_ == "rpm" and "rpm_list" in item: + elif type_ in ("rpm", "rpm_repo") and "rpm_list" in item: item["rpm_list"] = list(dict.fromkeys(item["rpm_list"])) key = item.get("package") diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py index 5951a525b2..17c14cf51f 100644 --- a/common/library/modules/parallel_tasks.py +++ b/common/library/modules/parallel_tasks.py @@ -34,7 +34,9 @@ from ansible.module_utils.local_repo.download_image import process_image from ansible.module_utils.local_repo.download_rpm import process_rpm from ansible.module_utils.local_repo.standard_logger import setup_standard_logger -from ansible.module_utils.local_repo.common_functions import generate_vault_key, process_file, is_encrypted +from ansible.module_utils.local_repo.common_functions import ( + generate_vault_key, process_file, is_encrypted +) from ansible.module_utils.local_repo.software_utils import ( load_json, set_version_variables, @@ -125,7 +127,10 @@ def update_status_csv(csv_dir, software, overall_status,slogger): slogger.info(f"Successfully updated status CSV at {status_file}") -def determine_function(task, repo_store_path, csv_file_path, user_data, version_variables, arc, user_registries, docker_username, docker_password): +def determine_function( + task, repo_store_path, csv_file_path, user_data, version_variables, arc, + user_registries, docker_username, docker_password +): """ Determines the appropriate function and its 
arguments to process a given task. @@ -160,27 +165,55 @@ def determine_function(task, repo_store_path, csv_file_path, user_data, version_ task_type = task.get("type") if task_type == "manifest": - return process_manifest, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_manifest, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "git": - return process_git, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_git, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "tarball": - return process_tarball, [task, repo_store_path, status_file, version_variables, cluster_os_type, cluster_os_version, arc] + return process_tarball, [ + task, repo_store_path, status_file, version_variables, + cluster_os_type, cluster_os_version, arc + ] if task_type == "shell": - return process_shell, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_shell, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "ansible_galaxy_collection": - return process_ansible_galaxy_collection, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_ansible_galaxy_collection, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "iso": - return process_iso, [task, repo_store_path, status_file, - cluster_os_type, cluster_os_version, version_variables, arc] + return process_iso, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, version_variables, arc + ] if task_type == "pip_module": - return process_pip, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_pip, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == 
"image": - return process_image, [task, status_file, version_variables, user_registries, docker_username, docker_password] + return process_image, [ + task, status_file, version_variables, user_registries, + docker_username, docker_password + ] if task_type == "rpm_file": - return process_rpm_file, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] - if task_type == "rpm": - return process_rpm, [task, repo_store_path, status_file, - cluster_os_type, cluster_os_version, repo_config_value, arc] + return process_rpm_file, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] + if task_type in ("rpm", "rpm_repo"): + return process_rpm, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, repo_config_value, arc + ] raise ValueError(f"Unknown task type: {task_type}") except Exception as e: @@ -272,57 +305,43 @@ def main(): Args: tasks (list): A list of tasks (dictionaries) that need to be processed in parallel. nthreads (int): The number of worker processes to run in parallel. - timeout (int): The maximum time allowed for all tasks to execute. If `None`, no timeout is enforced. + timeout (int): The maximum time allowed for all tasks to execute. + If `None`, no timeout is enforced. log_dir (str): The directory where log files for the worker processes will be saved. log_file (str): The path to the log file for the overall task execution. slog_file (str): The path to the log file for the standard logger. csv_file_path (str): The path to a CSV file that may be needed for processing some tasks. repo_store_path (str): The path to the repository where task-related files are stored. software (list): A list of software names. - user_json_file (str): The path to the JSON file containing use - show_softwares_status (bool): Whether to display the software status; optional, defaults to False. 
- overall_status_dict (dict): A list containing overall software status information; optional, defaults to an empty dict. - Dictionary containing software status information grouped by software names. - Each key (e.g., 'service_k8s') maps to a list of dictionaries, - where each dictionary contains: - - 'arch' (str): Architecture name, e.g., 'x86_64' or 'aarch64'. - - 'overall_status' (str): Status of the software on that architecture, e.g., 'SUCCESS'. - Example: - { - "service_k8s": [ - {"arch": "x86_64", "overall_status": "SUCCESS"}, - {"arch": "aarch64", "overall_status": "SUCCESS"} - ] - } - Defaults to an empty dict if not provided. + user_json_file (str): The path to the JSON file containing user data. + show_softwares_status (bool): Whether to display the software status; + optional, defaults to False. + overall_status_dict (dict): A dictionary containing overall software status + information; optional, defaults to an empty dict. + Dictionary containing software status information grouped by software names. + Each key (e.g., 'service_k8s') maps to a list of dictionaries, + where each dictionary contains: + - 'arch' (str): Architecture name, e.g., 'x86_64' or 'aarch64'. + - 'overall_status' (str): Status of the software on that architecture, + e.g., 'SUCCESS'. + Example: + { + "service_k8s": [ + {"arch": "x86_64", "overall_status": "SUCCESS"}, + {"arch": "aarch64", "overall_status": "SUCCESS"} + ] + } + Defaults to an empty dict if not provided. Returns: tuple: A tuple containing: - - overall_status (str): The overall status of task execution ("SUCCESS", "FAILED", "PARTIAL", "TIMEOUT"). - - task_results_data (list): A list of dictionaries, each containing the result of an individual task. + - overall_status (str): The overall status of task execution + ("SUCCESS", "FAILED", "PARTIAL", "TIMEOUT"). + - task_results_data (list): A list of dictionaries, each containing + the result of an individual task. Raises: Exception: If an error occurs during execution. 
""" - # module_args = { - # "tasks": {"type": "list", "required": True}, - # "nthreads": {"type": "int", "required": False, "default": DEFAULT_NTHREADS}, - # "timeout": {"type": "int", "required": False, "default": DEFAULT_TIMEOUT}, - # "log_dir": {"type": "str", "required": False, "default": LOG_DIR_DEFAULT}, - # "log_file": {"type": "str", "required": False, "default": DEFAULT_LOG_FILE}, - # "slog_file": {"type": "str", "required": False, "default": DEFAULT_SLOG_FILE}, - # "csv_file_path": {"type": "str", "required": False, "default": CSV_FILE_PATH_DEFAULT}, - # "repo_store_path": {"type": "str", "required": False, "default": DEFAULT_REPO_STORE_PATH}, - # "software": {"type": "list", "elements": "str", "required": True}, - # "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT}, - # "show_softwares_status": {"type": "bool", "required": False, "default": False}, - # "overall_status_dict": {"type": "dict","required": True}, - # "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT}, - # "arch": {"type": "str", "required": False}, - # "user_reg_cred_input": {"type": "str", "required": False, "default": USER_REG_CRED_INPUT}, - # "user_reg_key_path": {"type": "str", "required": False, "default": USER_REG_KEY_PATH}, - # "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH}, - # "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH} - # } module_args = { "tasks": {"type": "list", "required": True}, @@ -337,10 +356,19 @@ def main(): "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT}, "show_softwares_status": {"type": "bool", "required": False, "default": False}, "overall_status_dict": {"type": "dict","required": True}, - "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT}, + 
"local_repo_config_path": { + "type": "str", "required": False, + "default": LOCAL_REPO_CONFIG_PATH_DEFAULT + }, "arch": {"type": "str", "required": False}, - "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH}, - "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH} + "omnia_credentials_yaml_path": { + "type": "str", "required": False, + "default": OMNIA_CREDENTIALS_YAML_PATH + }, + "omnia_credentials_vault_path": { + "type": "str", "required": False, + "default": OMNIA_CREDENTIALS_VAULT_PATH + } } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) tasks = module.params["tasks"] @@ -386,24 +414,29 @@ def main(): cluster_os_type = user_data['cluster_os_type'] cluster_os_version = user_data['cluster_os_version'] - subgroup_dict, software_names = get_subgroup_dict(user_data,slogger) - version_variables = set_version_variables(user_data, software_names, cluster_os_version,slogger) + subgroup_dict, software_names = get_subgroup_dict(user_data, slogger) + version_variables = set_version_variables( + user_data, software_names, cluster_os_version, slogger + ) slogger.info(f"Cluster OS: {cluster_os_type}") slogger.info(f"Version Variables: {version_variables}") # gen_result = {} # if not os.path.isfile(user_reg_key_path): # gen_result = generate_vault_key(user_reg_key_path) # if gen_result is None: - # module.fail_json(msg=f"Unable to generate local_repo key at path: {user_reg_key_path}") + # module.fail_json( + # msg=f"Unable to generate local_repo key at path: {user_reg_key_path}" + # ) overall_status, task_results = execute_parallel( tasks, determine_function, nthreads, repo_store_path, csv_file_path, - log_dir, user_data, version_variables, arc, slogger, local_repo_config_path, - omnia_credentials_yaml_path, omnia_credentials_vault_path, timeout + log_dir, user_data, version_variables, arc, slogger, + local_repo_config_path, 
omnia_credentials_yaml_path, + omnia_credentials_vault_path, timeout ) # if not is_encrypted(user_reg_cred_input): - # process_file(user_reg_cred_input,user_reg_key_path,'encrypt') + # process_file(user_reg_cred_input, user_reg_key_path, 'encrypt') end_time = datetime.now() formatted_end_time = end_time.strftime("%I:%M:%S %p") @@ -442,7 +475,9 @@ def main(): except Exception as e: - result["table_output"] = table_output if "table_output" in locals() else "No table generated." + result["table_output"] = ( + table_output if "table_output" in locals() else "No table generated." + ) slogger.error(f"Execution failed: {str(e)}") module.fail_json(msg=f"Error during execution: {str(e)}", **result) diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index f3da3e2004..a3c155ebdb 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -137,7 +137,10 @@ def validate_container_format(image_name: str) -> Tuple[bool, str]: # Must contain at least one '/' to indicate registry/image format if '/' not in image_name: - return False, f"Invalid format '{image_name}'. Must include registry (e.g., registry.k8s.io/pause, docker.io/library/busybox)" + return False, ( + f"Invalid format '{image_name}'. Must include registry " + "(e.g., registry.k8s.io/pause, docker.io/library/busybox)" + ) # Must have a registry part (contains '.' or is a known registry) parts = image_name.split('/') @@ -145,7 +148,10 @@ def validate_container_format(image_name: str) -> Tuple[bool, str]: # Check if registry looks valid (contains dot or is localhost) if '.' not in registry and registry != 'localhost' and ':' not in registry: - return False, f"Invalid registry '{registry}' in '{image_name}'. Registry must be a domain (e.g., docker.io, registry.k8s.io)" + return False, ( + f"Invalid registry '{registry}' in '{image_name}'. 
" + "Registry must be a domain (e.g., docker.io, registry.k8s.io)" + ) return True, "" @@ -173,7 +179,9 @@ def detect_file_type(name: str) -> str: if '==' in name: return "pip_module" # Ansible Galaxy collection: contains . but no / or == (e.g., community.general, ansible.posix) - if '.' in name and '/' not in name and '==' not in name and any(x in name.lower() for x in ['ansible', 'community', 'galaxy']): + if '.' in name and '/' not in name and '==' not in name and any( + x in name.lower() for x in ['ansible', 'community', 'galaxy'] + ): return "ansible_galaxy_collection" if name.startswith('ansible_galaxy_collection'): return "ansible_galaxy_collection" @@ -296,7 +304,9 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any] # Check existence if not container_exists(pulp_name, logger): - result["message"] = f"Container not found in Pulp (looked for: {pulp_name})" + result["message"] = ( + f"Container not found in Pulp (looked for: {pulp_name})" + ) return result try: @@ -368,7 +378,8 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) # 1. 
Remove content from repository if content_href: remove_result = run_cmd( - f"pulp file repository content remove --repository {repo_name} --href {content_href}", + f"pulp file repository content remove --repository {repo_name} " + f"--href {content_href}", logger ) if remove_result["rc"] == 0: @@ -376,7 +387,8 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) else: # Try alternative: modify repository to remove content run_cmd( - f"pulp file repository content modify --repository {repo_name} --remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'", + f"pulp file repository content modify --repository {repo_name} " + f"--remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'", logger ) @@ -444,7 +456,9 @@ def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger) messages.append("Orphan cleanup completed") else: # Try listing repos to find partial match - repo_list = run_cmd(pulp_python_commands["list_repositories"], logger) + repo_list = run_cmd( + pulp_python_commands["list_repositories"], logger + ) if repo_list["rc"] == 0: repos = safe_json_parse(repo_list["stdout"]) for repo in repos: @@ -533,7 +547,9 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_stor messages.append("Repository deleted") else: # Try listing repos to find partial match - repo_list = run_cmd(pulp_file_commands["list_repositories"], logger) + repo_list = run_cmd( + pulp_file_commands["list_repositories"], logger + ) if repo_list["rc"] == 0: repos = safe_json_parse(repo_list["stdout"]) for repo in repos: @@ -569,7 +585,9 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_stor mark_software_partial(affected, base_path, logger, file_type) # Clean up uploaded content from filesystem - fs_result = cleanup_content_directory(name, file_type, repo_store_path, logger) + fs_result = cleanup_content_directory( + name, file_type, repo_store_path, logger + ) if fs_result["status"] == 
"Success": content_removed = True messages.append(fs_result["message"]) @@ -673,67 +691,82 @@ def cleanup_content_directory(content_name: str, content_type: str, repo_store_p # STATUS FILE UPDATES # ============================================================================= -def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[str]: +def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> Dict[str, List[str]]: """Remove RPMs that belong to a specific repository from status files. - + Uses the repo_name column in status.csv to accurately identify RPMs from the repository. - + Now that all repo_names include architecture prefixes, the logic is simplified. + Args: - repo_name: Repository name (e.g., 'x86_64_appstream') + repo_name: Repository name (e.g., 'x86_64_appstream', 'aarch64_epel') base_path: Base path for status files logger: Logger instance - + Returns: - List of software names that were affected + Dict mapping architecture to list of affected software names """ - affected_software = [] + affected_software = {} logger.info(f"Removing RPMs from status.csv for repository: {repo_name}") - try: - for arch in ARCH_SUFFIXES: - for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"): - rows = [] - removed = False - has_repo_column = False - # Check if file has repo_name column - with open(status_file, 'r', encoding='utf-8') as f: - header = f.readline().strip().lower() - has_repo_column = "repo_name" in header + # Extract architecture from repo_name (all repo_names should now have arch prefixes) + target_arch = None + for arch in ARCH_SUFFIXES: + if repo_name.startswith(f"{arch}_"): + target_arch = arch + break + + if not target_arch: + logger.error(f"Repository name {repo_name} does not have architecture prefix") + return {} + + logger.info(f"Processing architecture: {target_arch}") + affected_software[target_arch] = [] + + try: + for status_file in glob.glob(f"{base_path}/{target_arch}/*/status.csv"): + 
rows = [] + removed = False + has_repo_column = False - with open(status_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - fieldnames = reader.fieldnames - for row in reader: - name = row.get('name', '') - row_type = row.get('type', '') - rpm_repo = row.get('repo_name', '') - - logger.info(f"Processing row: {row}") - # For RPMs, check if they belong to the deleted repository - if row_type == 'rpm' or row_type == 'rpm_file': - if has_repo_column and rpm_repo == repo_name: - removed = True - logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)") - else: - rows.append(row) + # Check if file has repo_name column + with open(status_file, 'r', encoding='utf-8') as f: + header = f.readline().strip().lower() + has_repo_column = "repo_name" in header + + with open(status_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + name = row.get('name', '') + row_type = row.get('type', '') + rpm_repo = row.get('repo_name', '') + + logger.info(f"Processing row: {row}") + # For RPMs, check if they belong to the deleted repository + if row_type in ('rpm', 'rpm_repo', 'rpm_file'): + if has_repo_column and rpm_repo == repo_name: + removed = True + logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)") else: rows.append(row) + else: + rows.append(row) - if removed and fieldnames: - with open(status_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(rows) + if removed and fieldnames: + with open(status_file, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) - # Track affected software - software_name = os.path.basename(os.path.dirname(status_file)) - if software_name not in affected_software: - affected_software.append(software_name) + # Track affected software + 
software_name = os.path.basename(os.path.dirname(status_file)) + if software_name not in affected_software[target_arch]: + affected_software[target_arch].append(software_name) return affected_software except Exception as e: logger.error(f"Failed to remove RPMs from repository {repo_name}: {e}") - return [] + return {} def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> Dict[str, List[str]]: """Remove artifact from status.csv files and return affected software names by architecture. @@ -798,10 +831,10 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: def mark_software_partial(affected_software, base_path: str, logger, artifact_type: str = None): """Mark software entries as partial in software.csv. - + Args: - affected_software: Either a List[str] of software names (from remove_rpms_from_repository) - or a Dict[str, List[str]] mapping arch to software names (from remove_from_status_files) + affected_software: Either a List[str] of software names (legacy support) + or a Dict[str, List[str]] mapping arch to software names base_path: Base path for software.csv logger: Logger instance artifact_type: Type of artifact being removed (for logging purposes) @@ -811,8 +844,11 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty logger.info("No affected software to mark as partial") return - # Normalize input: if a flat list is passed, apply to all architectures + # Normalize input: convert to arch_software_map if needed if isinstance(affected_software, list): + # Legacy list input - this should not happen with new remove_rpms_from_repository + # but we keep it for backward compatibility + logger.warning("Received list input to mark_software_partial, applying to all architectures (legacy behavior)") arch_software_map = {arch: affected_software for arch in ARCH_SUFFIXES} else: arch_software_map = affected_software @@ -869,7 +905,7 @@ def software_has_rpms(software_name: 
str, arch: str, base_path: str, logger) -> with open(status_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: - if row.get('type', '').lower() == 'rpm': + if row.get('type', '').lower() in ('rpm', 'rpm_repo'): return True return False except OSError as e: @@ -892,7 +928,9 @@ def mark_all_software_partial(base_path: str, logger): try: for arch in ARCH_SUFFIXES: software_file = f"{base_path}/{arch}/software.csv" - logger.info(f"Processing software file: {software_file}") + logger.info( + f"Processing software file: {software_file}" + ) if not os.path.exists(software_file): logger.info(f"Software file not found: {software_file}") @@ -948,8 +986,12 @@ def run_module(): cleanup_repos=dict(type='list', elements='str', default=[]), cleanup_containers=dict(type='list', elements='str', default=[]), cleanup_files=dict(type='list', elements='str', default=[]), - base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT), - repo_store_path=dict(type='str', default='/opt/omnia') + base_path=dict( + type='str', default=CLEANUP_BASE_PATH_DEFAULT + ), + repo_store_path=dict( + type='str', default='/opt/omnia' + ) ), supports_check_mode=True ) @@ -966,16 +1008,25 @@ def run_module(): logger = setup_standard_logger(log_dir) # Handle 'all' keyword for repositories only - cleanup_all_repos = cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all' + cleanup_all_repos = ( + cleanup_repos and len(cleanup_repos) == 1 and + cleanup_repos[0].lower() == 'all' + ) #if cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all': if cleanup_all_repos: logger.info("cleanup_repos='all' - fetching all repositories from Pulp") cleanup_repos = get_all_repositories(logger) if not cleanup_repos: - module.fail_json(msg="Failed to retrieve repository list from Pulp. Please check if Pulp services are running.") + module.fail_json( + msg="Failed to retrieve repository list from Pulp. 
" + "Please check if Pulp services are running." + ) logger.info(f"Found {len(cleanup_repos)} repositories to cleanup: {cleanup_repos}") - logger.info(f"Starting cleanup - repos: {cleanup_repos}, containers: {cleanup_containers}, files: {cleanup_files}") + logger.info( + f"Starting cleanup - repos: {cleanup_repos}, " + f"containers: {cleanup_containers}, files: {cleanup_files}" + ) all_results = [] diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml index 93e379833b..6f54e5f45f 100644 --- a/local_repo/pulp_cleanup.yml +++ b/local_repo/pulp_cleanup.yml @@ -15,10 +15,15 @@ # Pulp Cleanup Playbook - Clean Architecture # # Usage: -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_repos": ["epel", "baseos"]}' -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_containers": ["nginx", "redis"]}' -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_files": ["git", "chart-0.48.0"]}' -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_repos": ["epel"], "cleanup_containers": ["nginx"]}' -e force=true +# # Repository cleanup (include architecture prefix) +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel,aarch64_epel" +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_appstream" +# ansible-playbook pulp_cleanup.yml -e "cleanup_containers=nginx,redis" +# ansible-playbook pulp_cleanup.yml -e "cleanup_files=git,chart-0.48.0" +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel -e cleanup_containers=nginx -e force=true" +# +# # Examples: x86_64_epel, aarch64_epel, x86_64_appstream, aarch64_baseos +# # Note: Use architecture prefix (x86_64_ or aarch64_) for repository names - name: Pulp Cleanup hosts: localhost From 2898ff029a86ea9c326bea156f2162d9548e1d86 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Thu, 12 Feb 2026 17:36:48 +0530 Subject: [PATCH 143/172] input config changes Signed-off-by: pullan1 --- input/config/aarch64/rhel/10.0/slurm_custom.json | 5 +---- input/config/x86_64/rhel/10.0/slurm_custom.json | 5 
+---- input/local_repo_config.yml | 4 +++- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 2483775495..2bdfda0ab9 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -9,10 +9,7 @@ {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", - "type": "iso", - "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.aarch64.rpm" - } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 9531239fd2..8781885cca 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -7,10 +7,7 @@ {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", - "type": "iso", - "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm" - } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 2f318f1deb..8428e6d94c 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -138,10 +138,12 @@ omnia_repo_url_rhel_x86_64: - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: 
"https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"} - - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key'", name: "cri-o"} + - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "cri-o"} + - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/repodata/repomd.xml.key", name: "doca"} omnia_repo_url_rhel_aarch64: - { url: "https://download.docker.com/linux/centos/10/aarch64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/aarch64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} + - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/repodata/repomd.xml.key", name: "doca"} # Example: # additional_repos_x86_64: # - { url: "https://rpm.grafana.com/", gpgkey: "", name: "grafana" } From 680aef3efb7c0249d2d88447e9f0d7f83541a80f Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 17:44:57 +0530 Subject: [PATCH 144/172] Fixed ansible lint issues --- .../tasks/display_warnings.yml | 18 ++++------ .../restore_omnia_config_credentials.yml | 23 ++++++++----- .../restore_user_registry_credential.yml | 33 ++++++++++--------- .../import_input_parameters/vars/main.yml | 10 +++--- 4 files changed, 44 
insertions(+), 40 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml index ac1eb69998..2cc6dfed26 100644 --- a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -20,13 +20,11 @@ UPGRADE WARNINGS SUMMARY ================================= - {% if upgrade_warnings | length > 0 %} {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected. You will now be shown the detailed list. - {% else %} - No warnings detected. Upgrade completed successfully! - {% endif %} - when: upgrade_warnings is defined + when: + - upgrade_warnings is defined + - upgrade_warnings | length > 0 - name: Pause for user to review warnings @@ -36,7 +34,6 @@ ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ ╚════════════════════════════════════════════╝ - {% if upgrade_warnings | length > 0 %} {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected: {% for warning in upgrade_warnings %} @@ -45,9 +42,6 @@ Please review these warnings carefully. Press ENTER to continue or CTRL+C to abort. - {% else %} - No warnings detected. Upgrade completed successfully! - - Press ENTER to continue... 
- {% endif %} - when: upgrade_warnings is defined + when: + - upgrade_warnings is defined + - upgrade_warnings | length > 0 diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index 0abafee26b..71e8fb7db2 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -31,16 +31,21 @@ - not backup_omnia_config_credentials_stat.stat.exists - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))" +- name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/omnia_config_credentials.yml" + register: backup_omnia_config_credentials_content + changed_when: false + failed_when: false + no_log: true + when: backup_omnia_config_credentials_stat.stat.exists + - name: Process omnia_config_credentials.yml when present in backup + when: >- + backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout block: - - name: Check if backup file is encrypted - ansible.builtin.command: - cmd: cat "{{ backup_location }}/omnia_config_credentials.yml" - register: backup_omnia_config_credentials_content - changed_when: false - failed_when: false - no_log: true - - name: "Case 1: Key present and file encrypted - Process and update" block: - name: Copy encrypted omnia_config_credentials.yml from backup to temp location @@ -68,6 +73,7 @@ no_log: true register: vault_decrypt_result failed_when: vault_decrypt_result.rc != 0 + changed_when: false - name: Read decrypted content ansible.builtin.slurp: @@ -126,6 +132,7 @@ no_log: true register: vault_encrypt_result failed_when: vault_encrypt_result.rc != 0 + changed_when: false - name: Clean 
up temporary files ansible.builtin.file: diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index de337310b8..fe02a3d750 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -31,25 +31,26 @@ - name: Add warning for missing user_registry_credential.yml to list ansible.builtin.set_fact: upgrade_warnings: >- - {{ upgrade_warnings + [ - "WARNING: user_registry_credential.yml not found in backup at " + - backup_location + "/user_registry_credential.yml. " + - "This might be due to complete Omnia execution not being completed. " + - "Skipping restoration of this file." - ] }} - when: + {{ upgrade_warnings + [msg_user_registry_credential_missing] }} + when: - not backup_user_registry_credential_stat.stat.exists - "'WARNING: user_registry_credential.yml not found in backup at' not in (upgrade_warnings | join(' '))" +- name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/user_registry_credential.yml" + register: backup_user_registry_content + changed_when: false + failed_when: false + no_log: true + when: backup_user_registry_credential_stat.stat.exists + - name: Process user_registry_credential.yml when present in backup + when: >- + backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout block: - - name: Check if backup file is encrypted - ansible.builtin.command: - cmd: cat "{{ backup_location }}/user_registry_credential.yml" - register: backup_user_registry_content - changed_when: false - failed_when: false - no_log: true - name: "Case 1: Key present and file encrypted - Copy both" block: @@ -64,6 +65,7 @@ no_log: true register: vault_decrypt_result failed_when: 
vault_decrypt_result.rc != 0 + changed_when: false - name: Copy encrypted user_registry_credential.yml from backup ansible.builtin.copy: @@ -118,8 +120,7 @@ {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} - Key file exists but file is not encrypted {% endif %} - Please check the backup integrity and ensure both files are present - in consistent states. + Please check the backup integrity and ensure both files are present in consistent states. when: >- (not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 5eee4a2f50..9808da58bc 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -31,14 +31,16 @@ msg_upgrade_backup_dir_missing: "upgrade_backup_dir not found in /opt/omnia/.dat msg_restore_item_name_missing: "restore_item must define 'name'" msg_validation_failed: "Validation failed for {{ restore_item.name }}" msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" -msg_user_registry_credential_missing: |- - \033[93mWARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml\033[0m +msg_user_registry_credential_missing: |- + WARNING: user_registry_credential.yml not found in backup at + {{ backup_location }}/user_registry_credential.yml This might be due to complete Omnia execution not being completed. Skipping restoration of this file. # Omnia config credentials messages -msg_omnia_config_credentials_missing: |- - WARNING: omnia_config_credentials.yml not found in backup at {{ backup_location }}/omnia_config_credentials.yml. 
+msg_omnia_config_credentials_missing: |- + WARNING: omnia_config_credentials.yml not found in backup at + {{ backup_location }}/omnia_config_credentials.yml. This might be due to complete Omnia execution not being completed. Skipping restoration of this file. From ad7a5c08a6cf917814aefea6bef04145ad485534 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 18:05:14 +0530 Subject: [PATCH 145/172] fixed lint issues --- .../restore_omnia_config_credentials.yml | 34 +++++++-------- .../restore_user_registry_credential.yml | 43 +++++++++++-------- .../import_input_parameters/vars/main.yml | 2 +- 3 files changed, 42 insertions(+), 37 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index 71e8fb7db2..a129603dcc 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -27,7 +27,7 @@ ansible.builtin.set_fact: upgrade_warnings: >- {{ upgrade_warnings + [msg_omnia_config_credentials_missing] }} - when: + when: - not backup_omnia_config_credentials_stat.stat.exists - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))" @@ -93,6 +93,10 @@ msg: "{{ msg_omnia_config_decrypt_error }}" - name: "Case 1.1: Apply template and encrypt" + when: > + backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout block: - name: Set template variables from credentials ansible.builtin.set_fact: @@ -150,29 +154,25 @@ - name: Fail with template/encryption error message ansible.builtin.fail: msg: "{{ msg_omnia_config_template_error }}\n{{ msg_omnia_config_encrypt_error }}" - when: >- - backup_omnia_config_credentials_key_stat.stat.exists 
and - backup_omnia_config_credentials_content.stdout is defined and - '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout - name: "Case 2: Both key and file missing - Add info warning" - ansible.builtin.set_fact: - upgrade_warnings: >- - {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }} - when: >- + when: > not backup_omnia_config_credentials_key_stat.stat.exists and - (backup_omnia_config_credentials_content.stdout is not defined or + (backup_omnia_config_credentials_content.stdout is not defined or '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) and "'INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key' not in (upgrade_warnings | join(' '))" + ansible.builtin.set_fact: + upgrade_warnings: > + {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }} - name: "Case 3: Error - Mismatched state" - ansible.builtin.fail: - msg: "{{ msg_omnia_config_credentials_error }}" - when: >- - (not backup_omnia_config_credentials_key_stat.stat.exists and - backup_omnia_config_credentials_content.stdout is defined and + when: > + (not backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout) or - (backup_omnia_config_credentials_key_stat.stat.exists and - backup_omnia_config_credentials_content.stdout is defined and + (backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) + ansible.builtin.fail: + msg: "{{ msg_omnia_config_credentials_error }}" when: backup_omnia_config_credentials_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index fe02a3d750..69a6a391a2 100644 --- 
a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -53,6 +53,10 @@ block: - name: "Case 1: Key present and file encrypted - Copy both" + when: > + backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout block: - name: Decrypt user_registry_credential.yml using the key ansible.builtin.shell: @@ -92,12 +96,13 @@ - name: Fail with decryption error message ansible.builtin.fail: msg: "{{ msg_user_registry_decrypt_error }}" - when: >- - backup_local_repo_credentials_key_stat.stat.exists and - backup_user_registry_content.stdout is defined and - '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout - name: "Case 2: Both key and file missing - Add info warning" + when: >- + not backup_local_repo_credentials_key_stat.stat.exists and + (backup_user_registry_content.stdout is not defined or + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and + "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))" ansible.builtin.set_fact: upgrade_warnings: >- {{ upgrade_warnings + [ @@ -105,27 +110,27 @@ "are not present in backup. This is expected if registry credentials " + "were not configured in the source installation." 
] }} - when: >- - not backup_local_repo_credentials_key_stat.stat.exists and - (backup_user_registry_content.stdout is not defined or - '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and - "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))" - name: "Case 3: Error - Mismatched state" + when: >- + (not backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or + (backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) ansible.builtin.fail: msg: | ERROR: Inconsistent state detected for user_registry_credential.yml: - {% if not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %} + {% if not backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %} - File is encrypted but key file (.local_repo_credentials_key) is missing - {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} + {% elif backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} - Key file exists but file is not encrypted {% endif %} - Please check the backup integrity and ensure both files are present in consistent states. 
- when: >- - (not backup_local_repo_credentials_key_stat.stat.exists and - backup_user_registry_content.stdout is defined and - '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or - (backup_local_repo_credentials_key_stat.stat.exists and - backup_user_registry_content.stdout is defined and - '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) + Please check the backup integrity and ensure both files are present + in consistent states. when: backup_user_registry_credential_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 9808da58bc..3bdf596641 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -32,7 +32,7 @@ msg_restore_item_name_missing: "restore_item must define 'name'" msg_validation_failed: "Validation failed for {{ restore_item.name }}" msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" msg_user_registry_credential_missing: |- - WARNING: user_registry_credential.yml not found in backup at + WARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml This might be due to complete Omnia execution not being completed. Skipping restoration of this file. 
From 31c5600391bad02cd31c9c2d3ad167100371f5d2 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 18:13:29 +0530 Subject: [PATCH 146/172] Fixed ansible lint issues --- .../restore_omnia_config_credentials.yml | 2 +- .../restore_user_registry_credential.yml | 2 +- .../import_input_parameters/vars/main.yml | 46 ++++++++++--------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index a129603dcc..e04964e461 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -175,4 +175,4 @@ '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) ansible.builtin.fail: msg: "{{ msg_omnia_config_credentials_error }}" - when: backup_omnia_config_credentials_stat.stat.exists + diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index 69a6a391a2..47b62fedb1 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -133,4 +133,4 @@ {% endif %} Please check the backup integrity and ensure both files are present in consistent states. - when: backup_user_registry_credential_stat.stat.exists + diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 3bdf596641..2bd20f0076 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -44,48 +44,52 @@ msg_omnia_config_credentials_missing: |- This might be due to complete Omnia execution not being completed. 
Skipping restoration of this file. -msg_omnia_config_credentials_info_missing: |- - INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key - are not present in backup. This is expected if credentials +msg_omnia_config_credentials_info_missing: |- + INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key + are not present in backup. This is expected if credentials were not configured in the source installation. -msg_omnia_config_credentials_success: |- +msg_omnia_config_credentials_success: |- omnia_config_credentials.yml restored and updated from backup. Backup: {{ backup_location }}/omnia_config_credentials.yml Target: {{ input_project_dir }}/omnia_config_credentials.yml Status: Updated with postgres credentials and re-encrypted (key file also restored) -msg_omnia_config_credentials_error: |- +msg_omnia_config_credentials_error: |- ERROR: Inconsistent state detected for omnia_config_credentials.yml: - {% if not backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %} + {% if not backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %} - File is encrypted but key file (.omnia_config_credentials_key) is missing - {% elif backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %} + {% elif backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %} - Key file exists but file is not encrypted {% endif %} Please check the backup integrity and ensure both files are present in consistent states. 
# Rescue warning messages -msg_user_registry_decrypt_error: |- - ERROR: Failed to decrypt user_registry_credential.yml. - The backup key file may be corrupted or incompatible. - Please check the backup integrity and ensure the key file +msg_user_registry_decrypt_error: |- + ERROR: Failed to decrypt user_registry_credential.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file matches the encrypted file. -msg_omnia_config_decrypt_error: |- - ERROR: Failed to decrypt omnia_config_credentials.yml. - The backup key file may be corrupted or incompatible. - Please check the backup integrity and ensure the key file +msg_omnia_config_decrypt_error: |- + ERROR: Failed to decrypt omnia_config_credentials.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file matches the encrypted file. -msg_omnia_config_template_error: |- - ERROR: Failed to generate updated omnia_config_credentials.yml. - Template processing may have failed due to invalid data format. +msg_omnia_config_template_error: |- + ERROR: Failed to generate updated omnia_config_credentials.yml. + Template processing may have failed due to invalid data format. Please check the backup file format and ensure it contains valid YAML. -msg_omnia_config_encrypt_error: |- - ERROR: Failed to encrypt updated omnia_config_credentials.yml. - The key file may be corrupted or there may be permission issues. +msg_omnia_config_encrypt_error: |- + ERROR: Failed to encrypt updated omnia_config_credentials.yml. + The key file may be corrupted or there may be permission issues. Please check the key file integrity and file permissions. msg_decryption_failed: "Decryption failed. Check warnings for details." 
From da5423411cb969b8ddfd41856c195c4e8e443ac1 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 18:21:52 +0530 Subject: [PATCH 147/172] fixed ansible lint issues --- .../tasks/restore_omnia_config_credentials.yml | 1 - .../tasks/restore_user_registry_credential.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index e04964e461..6a20f371f8 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -175,4 +175,3 @@ '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) ansible.builtin.fail: msg: "{{ msg_omnia_config_credentials_error }}" - diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index 47b62fedb1..158b029ed3 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -133,4 +133,3 @@ {% endif %} Please check the backup integrity and ensure both files are present in consistent states. 
- From cdaa98d829d7e32ee0a13955145a96c6b67f25db Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Thu, 12 Feb 2026 19:05:57 +0530 Subject: [PATCH 148/172] offline build-image and discovery updates (#3956) * Use Pulp-hosted builder images for x86_64 builds * added x86_64 image-builder image * Update default_packages.json Signed-off-by: balajikumaran.cs * Refine image build prereqs and regctl handling * Update omnia_metadata_file path to use variable Signed-off-by: balajikumaran.cs * Airgap: move telemetry/NFS prep offline and package installs to prepare_oim * added nolog true * Update prepare_oim_completion.yml Signed-off-by: balajikumaran.cs * Update aarch64_prereq.yml Signed-off-by: balajikumaran.cs * Update main.yml Signed-off-by: balajikumaran.cs * Update main.yml Signed-off-by: balajikumaran.cs * Update main.yml Signed-off-by: balajikumaran.cs * Replace command with podman_image module for image tasks Signed-off-by: balajikumaran.cs * Replace Podman command with Ansible module Signed-off-by: balajikumaran.cs * Align podman image pull with retries and tagging for x86_64 and aarch64 * Fix podman tagging for x86_64 and aarch64 images --------- Signed-off-by: balajikumaran.cs --- .../roles/image_creation/vars/main.yml | 5 +- .../roles/prepare_arm_node/tasks/main.yml | 58 ++++++++------ .../roles/prepare_arm_node/vars/main.yml | 10 ++- build_image_x86_64/build_image_x86_64.yml | 4 +- .../image_creation/tasks/build_image_tag.yml | 28 ------- .../tasks/prepare_pulp_image.yml | 79 +++++++++++++++++++ .../roles/image_creation/vars/main.yml | 10 ++- .../roles/nfs_client/tasks/nfs_client.yml | 5 -- discovery/roles/nfs_client/vars/main.yml | 7 -- discovery/roles/telemetry/tasks/main.yml | 4 + .../telemetry/tasks/telemetry_prereq.yml | 27 ++++--- .../tasks/update_ldms_agg_config.yml | 5 -- discovery/roles/telemetry/vars/main.yml | 14 ++-- .../x86_64/rhel/10.0/default_packages.json | 3 +- prepare_oim/prepare_oim.yml | 10 +++ .../common/tasks/aarch64_prereq.yml | 26 
++++++ .../deploy_containers/common/tasks/main.yml | 2 +- .../common/tasks/package_installation.yml | 29 +++++++ .../common/tasks/prepare_oim_completion.yml | 20 ++++- .../deploy_containers/common/vars/main.yml | 28 ++++++- 20 files changed, 272 insertions(+), 102 deletions(-) delete mode 100644 build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml create mode 100644 build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml create mode 100644 prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml create mode 100644 prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml diff --git a/build_image_aarch64/roles/image_creation/vars/main.yml b/build_image_aarch64/roles/image_creation/vars/main.yml index 67d11422ef..984f2497d8 100644 --- a/build_image_aarch64/roles/image_creation/vars/main.yml +++ b/build_image_aarch64/roles/image_creation/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" omnia_metadata_file: "/opt/omnia/.data/oim_metadata.yml" dir_permissions_644: "0644" dir_permissions_755: "0755" +aarch64_local_tag: "aarch64-image-builder/ochami" openchami_dir: "/opt/omnia/openchami" openchami_clone_path: /opt/omnia/openchami/deployment-recipes job_retry: "120" @@ -32,7 +33,7 @@ ochami_compute_mounts: - -v {{ openchami_work_dir }}/images/rhel-{{ item.key }}-{{ rhel_tag }}.yaml:/home/builder/config.yaml:z ochami_aarch64_image: - --entrypoint /bin/bash - - localhost/arm-image/ochami + - "localhost/{{ aarch64_local_tag }}" ochami_base_command: - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG' diff --git a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml index 1801448611..4a9d150850 100644 --- a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml @@ -167,32 +167,42 @@ - name: Build full Podman image path ansible.builtin.set_fact: - pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:1.1" - -- name: Pull aarch64 image using Podman - ansible.builtin.command: - cmd: "podman pull {{ pulp_aarch_image }}" - register: podman_pull_result - ignore_errors: true - changed_when: false + pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/{{ pulp_aarch64_image_name }}" + +- name: Pull and tag aarch64 image + block: + - name: Pull aarch64 image using Podman + containers.podman.podman_image: + name: "{{ pulp_aarch_image }}" + state: present + register: podman_pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: podman_pull_result is not failed + changed_when: false + + - name: Tag pulled image + containers.podman.podman_tag: + image: "{{ pulp_aarch_image }}" + target_names: + - "{{ 
aarch64_local_tag }}" + changed_when: false + + rescue: + - name: Fail if Podman pull failed + ansible.builtin.fail: + msg: "Failed to pull image {{ pulp_aarch_image }}" + +- name: Check if regctl binary exists + ansible.builtin.stat: + path: "{{ ochami_aarch_64_dir }}/regctl" + register: regctl_stat + delegate_to: localhost -- name: Fail if Podman pull failed +- name: Fail if regctl binary not found ansible.builtin.fail: - msg: "{{ aarch64_image_fail_msg }}" - when: podman_pull_result.rc != 0 - -- name: Tag pulled image - ansible.builtin.command: - cmd: "podman tag {{ pulp_aarch_image }} arm-image/ochami" - when: podman_pull_result.rc == 0 - changed_when: false - -- name: Download regctl binary to NFS shared path - ansible.builtin.get_url: - url: "{{ aarch64_regctl_url }}" - dest: "{{ ochami_aarch_64_dir }}/regctl" - mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" - delegate_to: localhost + msg: "{{ regctl_not_found_msg }}" + when: not regctl_stat.stat.exists - name: Copy regctl binary to /usr/local/bin on target host ansible.builtin.copy: diff --git a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml index d240f27de4..c0ce2868aa 100644 --- a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,10 +15,13 @@ # input files input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" +pulp_aarch64_image_name: "dellhpcomniaaisolution/image-build-aarch64:1.1" +aarch64_local_tag: "aarch64-image-builder/ochami" +pull_image_retries: "3" +pull_image_delay: "10" network_spec: "{{ input_project_dir }}/network_spec.yml" ochami_aarch_64_dir: "/opt/omnia/openchami/aarch64" pulp_repo_store_path: "{{ ochami_aarch_64_dir }}/pulp.repo" -aarch64_regctl_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" pulp_repo_file_path: "/etc/yum.repos.d/pulp.repo" pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" @@ -39,3 +42,6 @@ aarch64_image_fail_msg: > Unable to pull the Ochami aarch64 image builder image. Make sure you have added the default package for aarch64 in the software_config.json file and ran local_repo.yml. If not, add that package and rerun local_repo.yml. +regctl_not_found_msg: > + regctl binary not found at {{ ochami_aarch_64_dir }}/regctl. + Please run prepare_oim.yml playbook to download the regctl binary. diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml index 85ecaf93cd..676d8adbd6 100644 --- a/build_image_x86_64/build_image_x86_64.yml +++ b/build_image_x86_64/build_image_x86_64.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -80,7 +80,7 @@ - name: Tag OpenCHAMI image ansible.builtin.include_role: name: image_creation - tasks_from: build_image_tag.yml + tasks_from: prepare_pulp_image.yml - name: OpenCHAMI build image for x86_64 hosts: localhost diff --git a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml deleted file mode 100644 index 0b7a56072d..0000000000 --- a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Pull image-build image - ansible.builtin.command: - cmd: "podman pull {{ image_build_el10 }}" - register: pull_result - retries: "{{ pull_image_retries }}" - delay: "{{ pull_image_delay }}" - until: pull_result.rc == 0 - changed_when: "'Image is up to date' not in pull_result.stdout" - -- name: Fail if image not pulled successfully - ansible.builtin.fail: - msg: "{{ pull_result.stdout }}" - when: pull_result.rc != 0 diff --git a/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml b/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml new file mode 100644 index 0000000000..22f336b849 --- /dev/null +++ b/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml @@ -0,0 +1,79 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Load network specification +- name: Load network spec file + ansible.builtin.include_vars: + file: "{{ network_spec }}" + register: include_network_spec + no_log: true + +- name: Fail if network spec cannot be loaded + ansible.builtin.fail: + msg: "{{ network_spec_syntax_fail_msg }} Error: {{ include_network_spec.message }}" + when: include_network_spec is failed + +# Parse network spec data +- name: Parse network spec + ansible.builtin.set_fact: + network_data: "{{ network_data | default({}) | combine({item.key: item.value}) }}" + with_dict: "{{ Networks }}" + +# Set PXE IP fact +- name: Set PXE IP fact + ansible.builtin.set_fact: + oim_pxe_ip: "{{ network_data.admin_network.primary_oim_admin_ip }}" + cacheable: true + +# Copy pulp certificate and update CA trust +- name: Copy pulp webserver certificate to anchors + ansible.builtin.copy: + src: "{{ pulp_webserver_cert_path }}" + dest: "{{ anchors_path }}" + mode: "{{ dir_permissions_644 }}" + become: true + +- name: Update CA trust + ansible.builtin.command: update-ca-trust + register: update_ca + changed_when: false + +- name: Build full Podman image path for x86_64 + ansible.builtin.set_fact: + pulp_x86_image: "{{ oim_pxe_ip }}:2225/{{ pulp_x86_64_image_name }}" + +- name: Pull and tag x86_64 image + block: + - name: Pull x86_64 image using Podman + containers.podman.podman_image: + name: "{{ pulp_x86_image }}" + state: present + register: pull_result + 
retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: pull_result is not failed + changed_when: false + + - name: Tag pulled image for x86_64 build + containers.podman.podman_tag: + image: "{{ pulp_x86_image }}" + target_names: + - "{{ x86_64_local_tag }}" + changed_when: false + + rescue: + - name: Fail if Podman pull failed + ansible.builtin.fail: + msg: "Failed to pull image {{ pulp_x86_image }}." diff --git a/build_image_x86_64/roles/image_creation/vars/main.yml b/build_image_x86_64/roles/image_creation/vars/main.yml index a05a39d37d..60dcf0bc6f 100644 --- a/build_image_x86_64/roles/image_creation/vars/main.yml +++ b/build_image_x86_64/roles/image_creation/vars/main.yml @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -image_build_el10: "docker.io/dellhpcomniaaisolution/image-build-el10:1.0" +pulp_x86_64_image_name: "dellhpcomniaaisolution/image-build-el10:1.0" +x86_64_local_tag: "x86_64-image-builder/ochami" pull_image_retries: "3" pull_image_delay: "10" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" @@ -23,6 +24,9 @@ openchami_dir: "/opt/omnia/openchami" openchami_clone_path: /opt/omnia/openchami/deployment-recipes job_retry: "120" job_delay: "30" +network_spec: "{{ input_project_dir }}/network_spec.yml" +pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" +anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir" ochami_mounts: - --user 0 --privileged @@ -35,7 +39,7 @@ ochami_compute_mounts: ochami_x86_64_image: - --entrypoint /bin/bash - - docker.io/dellhpcomniaaisolution/image-build-el10:1.0 + - "localhost/{{ x86_64_local_tag }}" ochami_base_command: - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG' @@ -54,3 +58,5 @@ compute_image_failure_msg: | # build_compute_image.yml 
openchami_compute_image_vars_template: "{{ role_path }}/templates/compute_images_templates.j2" openchami_compute_image_vars_path: "/opt/omnia/openchami/compute_images_template.yaml" + +network_spec_syntax_fail_msg: "Failed to load network_spec.yml due to syntax error" diff --git a/discovery/roles/nfs_client/tasks/nfs_client.yml b/discovery/roles/nfs_client/tasks/nfs_client.yml index 079933c26b..ca8a3c7660 100644 --- a/discovery/roles/nfs_client/tasks/nfs_client.yml +++ b/discovery/roles/nfs_client/tasks/nfs_client.yml @@ -32,11 +32,6 @@ nfs_server_ip: "{{ hostvars['127.0.0.1']['admin_nic_ip'] }}" when: item.server_ip == "localhost" -- name: Package installation for NFS - ansible.builtin.package: - name: "{{ nfs_packages[ansible_os_family] }}" - state: present - - name: Mount facts items to dict ansible.builtin.set_fact: nfs_src: "{{ nfs_server_ip }}:{{ item.server_share_path }}" diff --git a/discovery/roles/nfs_client/vars/main.yml b/discovery/roles/nfs_client/vars/main.yml index b5e01fd82a..a3c20c054c 100644 --- a/discovery/roles/nfs_client/vars/main.yml +++ b/discovery/roles/nfs_client/vars/main.yml @@ -20,13 +20,6 @@ software_config_file: "{{ hostvars['localhost']['input_project_dir'] }}/software # Usage: nfs_client.yml mounted_dir_perm: "0755" default_client_mount_options: "nosuid,rw,sync,hard,intr" -nfs_packages: - RedHat: - - nfs-utils - - nfs4-acl-tools - Debian: - - nfs-common - - nfs4-acl-tools slurm_nfs_fail_msg: "Failed to mount NFS share. Please check if the NFS server is reachable or NFS is configured properly." 
omnia_config_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" diff --git a/discovery/roles/telemetry/tasks/main.yml b/discovery/roles/telemetry/tasks/main.yml index c5a3dbefba..825c3988d7 100644 --- a/discovery/roles/telemetry/tasks/main.yml +++ b/discovery/roles/telemetry/tasks/main.yml @@ -28,6 +28,10 @@ when: - hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support'] block: + - name: Set NFS info fact + ansible.builtin.set_fact: + oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" + - name: Service cluster prerequisite ansible.builtin.include_tasks: telemetry_prereq.yml diff --git a/discovery/roles/telemetry/tasks/telemetry_prereq.yml b/discovery/roles/telemetry/tasks/telemetry_prereq.yml index d720c57822..7eb45a89ab 100644 --- a/discovery/roles/telemetry/tasks/telemetry_prereq.yml +++ b/discovery/roles/telemetry/tasks/telemetry_prereq.yml @@ -47,23 +47,24 @@ state: directory mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" -- name: Git clone for iDRAC Telemetry script +- name: Ensure iDRAC Telemetry scripting destination exists + ansible.builtin.file: + path: "{{ idrac_telemetry_scripting_git_clone_path }}" + state: directory + mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" + +- name: Copy iDRAC Telemetry Scripting to NFS share block: - - name: Checkout iDRAC Telemetry GitHub repo - ansible.builtin.git: - repo: "{{ idrac_telemetry_scripting_repo }}" + - name: Copy pre-cloned iDRAC Telemetry Scripting directory + ansible.builtin.copy: + src: "{{ idrac_telemetry_scripting_src_path }}/" dest: "{{ idrac_telemetry_scripting_git_clone_path }}" - version: "{{ idrac_telemetry_scripting_stable_commit }}" - update: false - register: clone_idrac_script - until: clone_idrac_script is succeeded - retries: "{{ max_retries }}" - delay: "{{ delay_count }}" + remote_src: true + mode: preserve rescue: - - name: Fail if iDRAC telemetry Git clone fails + - name: Fail if iDRAC telemetry 
copy fails ansible.builtin.fail: - msg: "{{ idrac_script_git_clone_error_msg.splitlines() | join(' ') }}" - when: clone_idrac_script is failed + msg: "{{ idrac_telemetry_scripting_copy_fail_msg.splitlines() | join(' ') }}" - name: Set kafka_support to true ansible.builtin.set_fact: diff --git a/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml b/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml index db4d4b1d3f..ee6c0c7d75 100644 --- a/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml +++ b/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml @@ -13,11 +13,6 @@ # limitations under the License. --- -- name: Install make - ansible.builtin.package: - name: make - state: present - - name: Verify values.yaml exists ansible.builtin.stat: path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/values.yaml" diff --git a/discovery/roles/telemetry/vars/main.yml b/discovery/roles/telemetry/vars/main.yml index 473fd74e19..5c5838ce29 100644 --- a/discovery/roles/telemetry/vars/main.yml +++ b/discovery/roles/telemetry/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -32,14 +32,12 @@ telemetry_namespace: "telemetry" idrac_telemetry_k8s_name: idrac-telemetry # iDRAC Telemetry scripting repository -idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git" -idrac_telemetry_scripting_stable_commit: "f6999f5" +idrac_telemetry_scripting_src_path: "{{ oim_shared_path }}/omnia/telemetry/iDRAC-Telemetry-Scripting" idrac_telemetry_scripting_git_clone_path: "{{ service_cluster_idrac_telemetry_dir_path }}/iDRAC-Telemetry-Scripting" -idrac_script_git_clone_error_msg: | - Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_scripting_repo }} - to {{ idrac_telemetry_scripting_git_clone_path }} directory in NFS share. -max_retries: 10 -delay_count: 5 +idrac_telemetry_scripting_copy_fail_msg: | + Failed to copy iDRAC Telemetry Scripting from {{ idrac_telemetry_scripting_src_path }} + to {{ idrac_telemetry_scripting_git_clone_path }}. Please ensure prepare_oim.yml has been + executed successfully before running discovery. 
# Pre-built container images for iDRAC telemetry components # These default to your published images but can be overridden via telemetry_images diff --git a/input/config/x86_64/rhel/10.0/default_packages.json b/input/config/x86_64/rhel/10.0/default_packages.json index 813f9ad993..6002894568 100644 --- a/input/config/x86_64/rhel/10.0/default_packages.json +++ b/input/config/x86_64/rhel/10.0/default_packages.json @@ -34,7 +34,8 @@ {"package": "wget", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "cloud-init", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "glibc-langpack-en", "type": "rpm", "repo_name": "x86_64_baseos"}, - {"package": "gedit", "type": "rpm", "repo_name": "epel"} + {"package": "gedit", "type": "rpm", "repo_name": "epel"}, + {"package": "docker.io/dellhpcomniaaisolution/image-build-el10", "tag": "1.0", "type": "image" } ] } } diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index a78d21e8d9..50c48fd3e5 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -63,6 +63,11 @@ name: deploy_containers/common tasks_from: add_known_hosts.yml + - name: Download aarch64 prerequisites # noqa:role-name[path] + ansible.builtin.include_role: + name: deploy_containers/common + tasks_from: aarch64_prereq.yml + - name: OpenLDAP Pre_req generate ssha password hosts: localhost connection: local @@ -156,6 +161,11 @@ name: deploy_containers/common tasks_from: omnia_service.yml + - name: Install required packages # noqa:role-name[path] + ansible.builtin.include_role: + name: deploy_containers/common + tasks_from: package_installation.yml + - name: Prepare oim completion hosts: localhost connection: local diff --git a/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml b/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml new file mode 100644 index 0000000000..f5eae768bb --- /dev/null +++ b/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml @@ -0,0 +1,26 
@@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Create openchami aarch64 directory if not exists + ansible.builtin.file: + path: "{{ ochami_aarch64_dir }}" + state: directory + mode: "{{ dir_permissions_755 }}" + +- name: Download regctl binary (aarch64) + ansible.builtin.get_url: + url: "{{ regctl_aarch64_url }}" + dest: "{{ ochami_aarch64_dir }}/regctl" + mode: "{{ dir_permissions_755 }}" diff --git a/prepare_oim/roles/deploy_containers/common/tasks/main.yml b/prepare_oim/roles/deploy_containers/common/tasks/main.yml index 78c28e98ba..00287c628c 100644 --- a/prepare_oim/roles/deploy_containers/common/tasks/main.yml +++ b/prepare_oim/roles/deploy_containers/common/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml b/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml new file mode 100644 index 0000000000..1d84877307 --- /dev/null +++ b/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml @@ -0,0 +1,29 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Install required packages + block: + - name: Install required packages + ansible.builtin.package: + name: "{{ item }}" + state: present + loop: "{{ oim_packages }}" + register: oim_pkg_result + rescue: + - name: Fail if required package installation fails + ansible.builtin.fail: + msg: >- + {{ prepare_oim_pkg_fail_msg.splitlines() | join(' ') }} + Failed package(s): {{ oim_pkg_result.results | selectattr('failed', 'defined') | selectattr('failed') | map(attribute='item') | list | join(', ') }} + Error: {{ (oim_pkg_result.results | selectattr('failed', 'defined') | selectattr('failed') | map(attribute='msg') | list | first) | default('') }} diff --git a/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml b/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml index 7c86cfaf6b..52e4009219 100644 --- a/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml +++ b/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -32,6 +32,24 @@ mode: "{{ file_permissions }}" when: not bmc_group_data_status.stat.exists +- name: Clone iDRAC Telemetry Scripting repository + block: + - name: Checkout iDRAC Telemetry GitHub repo + ansible.builtin.git: + repo: "{{ idrac_telemetry_scripting_repo }}" + dest: "{{ idrac_telemetry_scripting_clone_dest }}" + version: "{{ idrac_telemetry_scripting_stable_commit }}" + update: false + register: clone_idrac_script + until: clone_idrac_script is succeeded + retries: "{{ max_retries }}" + delay: "{{ delay_count }}" + rescue: + - name: Fail if iDRAC telemetry Git clone fails + ansible.builtin.fail: + msg: "{{ idrac_script_git_clone_fail_msg.splitlines() | join(' ') }}" + when: clone_idrac_script is failed + - name: Prepare oim completion ansible.builtin.debug: msg: "{{ prepare_oim_completion_msg.splitlines() | join(' ') }}" diff --git a/prepare_oim/roles/deploy_containers/common/vars/main.yml b/prepare_oim/roles/deploy_containers/common/vars/main.yml index 30bb7b8125..855e7350b1 100644 --- a/prepare_oim/roles/deploy_containers/common/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/common/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,12 +28,34 @@ internal_nfs_services: ntp_firewall_service: ntp +# Packages required on OIM +oim_packages: + - nfs-utils + - nfs4-acl-tools + - git + - make +prepare_oim_pkg_fail_msg: | + Failed to install required packages. Please ensure the repository is + configured on OIM and rerun the playbook. 
+ # Usage: prepare_oim_completion.yml telemetry_dir: "/opt/omnia/telemetry" dir_permissions_755: "0755" bmc_group_data_filename: "{{ telemetry_dir }}/bmc_group_data.csv" bmc_group_data_template: "bmc_group_data.j2" file_permissions: "0644" +idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git" +idrac_telemetry_scripting_stable_commit: "f6999f5" +idrac_telemetry_scripting_clone_dest: "{{ telemetry_dir }}/iDRAC-Telemetry-Scripting" +max_retries: 10 +delay_count: 5 +git_install_timeout: 300 +git_install_fail_msg: | + Failed to install git. Please ensure the OS repository is configured on OIM. + Configure the repository and rerun the playbook. +idrac_script_git_clone_fail_msg: | + Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_scripting_repo }} + to {{ idrac_telemetry_scripting_clone_dest }}. Please check network connectivity and rerun the playbook. prepare_oim_completion_msg: | The playbook prepare_oim.yml has completed successfully. To create the offline repositories and registry for the cluster nodes, please execute the playbook local_repo/local_repo.yml as the next step. @@ -58,3 +80,7 @@ network_services: # Usage: configure_chrony.yml chrony_conf_path: "/etc/chrony.conf" chrony_no_sources_msg: "No chrony sources are reachable. Please give a valid NTP server configuration in network_spec.yml and re-run prepare_oim playbook." 
+ +# Usage: aarch64_prereq.yml +ochami_aarch64_dir: "/opt/omnia/openchami/aarch64" +regctl_aarch64_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" From 01dece90e8c421745419a1b81a46df85a3fa15eb Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 12 Feb 2026 19:24:06 +0530 Subject: [PATCH 149/172] Added flow if any munge key update, will be useful if munge key changes --- .../slurm_config/tasks/check_ctld_running.yml | 19 +---- discovery/roles/slurm_config/tasks/confs.yml | 2 +- .../slurm_config/tasks/create_slurm_dir.yml | 19 ++++- .../tasks/read_slurm_hostnames.yml | 1 + .../slurm_config/tasks/update_hosts_munge.yml | 84 +++++++++++++++++++ discovery/roles/slurm_config/vars/main.yml | 2 +- 6 files changed, 106 insertions(+), 21 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/update_hosts_munge.yml diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 0c7626f3dd..5f2d41a904 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -61,22 +61,11 @@ ansible.builtin.set_fact: reachable_hosts: "{{ ip_map_ssh_check.results | rejectattr('failed', 'true') | map(attribute='host') | list }}" - - name: Update /etc/hosts with controller hostname and IP - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}' - line: "{{ host_entry.value }} {{ host_entry.key }}" - state: present - loop: "{{ reachable_hosts | product(ip_name_map | dict2items) | list }}" + - name: Update basics on reachable_hosts + ansible.builtin.include_tasks: update_hosts_munge.yml + loop: "{{ reachable_hosts }}" loop_control: - loop_var: host_combo - vars: - target_host: "{{ host_combo[0] }}" - host_entry: "{{ host_combo[1] }}" - delegate_to: "{{ target_host }}" - when: reachable_hosts | length > 0 - ignore_unreachable: true - 
failed_when: false + loop_var: slurmhost_ip - name: Trigger the scontrol reconfigure ansible.builtin.command: scontrol reconfigure diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 12236d6ed8..799d4cd757 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -172,7 +172,7 @@ ansible.builtin.copy: content: "{{ item.ini_lines | join('\n') }}\n" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - mode: "{{ conf_file_mode }}" + mode: "0640" owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 81a08adfca..45e37ac243 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -84,11 +84,21 @@ share_prefix: "{{ slurm_config_path }}" when: conf_in_nfs -- name: Clear the share directory +- name: Clear Slurm-related files and directories ansible.builtin.file: - path: "{{ slurm_config_path }}" + path: "{{ slurm_config_path }}/{{ slurm_item }}" state: absent - when: clear_slurm_files + loop: "{{ (ctld_list | default([]) + + cmpt_list | default([]) + + login_list | default([]) + + compiler_login_list | default([]) + + dbd_list | default([]) + + ['munge.key']) | flatten }}" + loop_control: + loop_var: slurm_item + failed_when: false + when: + - clear_slurm_files - name: Create the slurm directory in share ansible.builtin.file: @@ -151,8 +161,9 @@ ansible.builtin.copy: src: "{{ slurm_config_path }}/munge.key" dest: "{{ slurm_config_path }}/{{ item }}/etc/munge/munge.key" - mode: "{{ common_mode }}" + mode: "0600" remote_src: true + register: munge_key_copy loop: "{{ (ctld_list | default([])) + (cmpt_list | default([])) + (compiler_login_list | default([])) + diff --git 
a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml index df19821983..0f7b3a16b2 100644 --- a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml +++ b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml @@ -46,6 +46,7 @@ - name: Get bmc_ip ansible.builtin.set_fact: bmc_ip_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='bmc_ip') }}" + name_ip_map: "{{ dict(ip_name_map.values() | zip(ip_name_map.keys())) }}" - name: Assign slurm lists ansible.builtin.set_fact: diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml new file mode 100644 index 0000000000..ecaaad2beb --- /dev/null +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -0,0 +1,84 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Update /etc/hosts with controller hostname and IP + ansible.builtin.lineinfile: + path: /etc/hosts + regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}' + line: "{{ host_entry.value }} {{ host_entry.key }}" + state: present + loop: "{{ ip_name_map | dict2items | list }}" + loop_control: + loop_var: host_entry + ignore_unreachable: true + failed_when: false + delegate_to: "{{ slurmhost_ip }}" + +- name: Get munge changes + ansible.builtin.set_fact: + munge_key_changed: "{{ munge_key_copy.results | default([]) | rekey_on_member('item') }}" + when: munge_key_copy is defined + +- name: Block when munge key changed + when: + - munge_key_changed is defined + - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false) + - restart_slurm_services + delegate_to: "{{ slurmhost_ip }}" + ignore_errors: true + ignore_unreachable: true + block: + - name: Update munge key permissions + ansible.builtin.file: + path: /etc/munge/munge.key + owner: munge + group: munge + mode: '0600' + register: munge_key_permissions_result + + - name: Restart munge service if key changed + ansible.builtin.service: + name: munge + state: restarted + register: munge_restart_result + when: + - munge_key_permissions_result is defined + - munge_key_permissions_result is success + + - name: Restart slurmctld if munge restarted + ansible.builtin.service: + name: slurmctld + state: restarted + when: + - name_ip_map[slurmhost_ip] in ctld_list + - munge_restart_result is defined + - munge_restart_result is success + + - name: Restart slurmd if munge restarted + ansible.builtin.service: + name: slurmd + state: restarted + when: + - name_ip_map[slurmhost_ip] in (cmpt_list + login_list + compiler_login_list) + - munge_restart_result is defined + - munge_restart_result is success + + - name: Restart slurmdbd if munge restarted + ansible.builtin.service: + name: slurmdbd + state: restarted + when: + - name_ip_map[slurmhost_ip] in dbd_list + - munge_restart_result is defined + - 
munge_restart_result is success diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 43ee995e5a..93aa0d2786 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -79,7 +79,7 @@ cluster_name: cluster # TODO: direct load vars omnia_config.yml slurm_uid: 6001 slurm_user: slurm slurm_user_group: slurm -restart_slurm_services: "{{ hostvars['localhost']['restart_slurm_services'] }}" +restart_slurm_services: "{{ hostvars['localhost']['restart_slurm_services'] | default(true) }}" slurm_db_username: "{{ hostvars['localhost']['slurm_db_username'] | default('dbuser') }}" slurm_db_password: "{{ hostvars['localhost']['slurm_db_password'] }}" slurm_db_host: "{{ hostvars['localhost']['slurm_db_host'] | default(false) }}" From 19a000cb663e94ed23a2e15c866c67b2bf4b7d26 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 12 Feb 2026 19:44:38 +0530 Subject: [PATCH 150/172] lint issue fix --- discovery/roles/slurm_config/tasks/update_hosts_munge.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml index ecaaad2beb..a326fa820d 100644 --- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -36,7 +36,6 @@ - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false) - restart_slurm_services delegate_to: "{{ slurmhost_ip }}" - ignore_errors: true ignore_unreachable: true block: - name: Update munge key permissions @@ -82,3 +81,7 @@ - name_ip_map[slurmhost_ip] in dbd_list - munge_restart_result is defined - munge_restart_result is success + rescue: + - name: Handle munge restart failure + ansible.builtin.debug: + msg: "Failed task {{ ansible_failed_task.name }} on {{ slurmhost_ip }}" From 471d4e781435703aa2dba6d55e41139ca9a8ede7 Mon Sep 17 00:00:00 2001 
From: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> Date: Thu, 12 Feb 2026 20:12:46 +0530 Subject: [PATCH 151/172] Update main.yml for copyright Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> --- discovery/roles/k8s_config/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml index a80fb9b257..601cc07097 100644 --- a/discovery/roles/k8s_config/vars/main.yml +++ b/discovery/roles/k8s_config/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 94a244fe9534c5feb3d950116c19e8f9b701aee9 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 21:55:11 +0530 Subject: [PATCH 152/172] centralize oim_metadata.yml path and remove static backup_location variable --- .../import_input_parameters/tasks/set_backup_location.yml | 2 +- upgrade/roles/import_input_parameters/vars/main.yml | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml index 4f6a96e83f..94156606e5 100644 --- a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml +++ b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml @@ -15,7 +15,7 @@ - name: Read oim_metadata.yml to get upgrade_backup_dir ansible.builtin.slurp: - src: /opt/omnia/.data/oim_metadata.yml + src: "{{ oim_metadata_path }}" register: oim_metadata_slurp - name: Parse oim_metadata.yml diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 
2bd20f0076..ebaa33e492 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -15,7 +15,10 @@ # backup_location will be set from oim_metadata.yml upgrade_backup_dir # Format: /opt/omnia/backups/upgrade/version_2.0.0.0/input/project_default -backup_location: "" +# Set dynamically from metadata, no static variable needed + +# Path to oim_metadata.yml +oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" backup_dir_mode: '0755' default_file_mode: '0644' From b64916bd08990d83d4f5cf0cd6895604c20f7d14 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Fri, 13 Feb 2026 10:02:03 +0530 Subject: [PATCH 153/172] Update omnia.sh --- omnia.sh | 77 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/omnia.sh b/omnia.sh index 9c46a04dc9..81e2094ccc 100755 --- a/omnia.sh +++ b/omnia.sh @@ -766,7 +766,7 @@ Description=${container_name^} Container [Container] ContainerName=${container_name} HostName=${container_name} -Image=${container_name}:1.1 +Image=${container_name}:2.1 Network=host # Capabilities @@ -1001,16 +1001,16 @@ install_omnia_core() { fi fi - local omnia_core_tag="1.1" + local omnia_core_tag="2.1" local omnia_core_registry="" - # Check if local omnia_core:1.1 exists + # Check if local omnia_core:2.1 exists if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" # Check if latest exists for backward compatibility elif podman inspect omnia_core:latest >/dev/null 2>&1; then echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}" - # Tag it as 1.1 for consistency + # Tag it as 2.1 for consistency podman tag omnia_core:latest omnia_core:${omnia_core_tag} else echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}" @@ -1018,11 +1018,11 @@ install_omnia_core() { echo "" echo 
-e "${YELLOW}One way to build the image locally:${NC}" echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" echo -e "2. Navigate to the repository directory:" echo -e " cd omnia-artifactory" echo -e "3. Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core omnia_branch=" + echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" echo "" echo -e "${YELLOW}Then re-run:${NC}" echo -e " ./omnia.sh --install" @@ -1200,6 +1200,7 @@ phase1_validate() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running" + display_cleanup_instructions return 1 fi @@ -1249,9 +1250,19 @@ phase1_validate() { return 1 fi - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" - echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." + if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" + echo "" + echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" + echo "" + echo -e "${YELLOW}To build the core image locally:${NC}" + echo -e "1. Clone the Omnia Artifactory repository:" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" + echo -e "2. Navigate to the repository directory:" + echo -e " cd omnia-artifactory" + echo -e "3. 
Build the core image locally (loads into local Podman by default):" + echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" + echo "" return 1 fi @@ -1267,7 +1278,7 @@ phase2_approval() { echo "OMNIA UPGRADE SUMMARY" echo "============================================" echo "Current Container Tag: 1.0" - echo "Target Container Tag: 1.1" + echo "Target Container Tag: 2.1" echo "Current Omnia Release: 2.0.0.0" echo "Target Omnia Release: 2.1.0.0" echo "New Features:" @@ -1386,17 +1397,17 @@ phase4_container_swap() { return 1 fi - echo "[INFO] [ORCHESTRATOR] Starting omnia_core 1.1 Quadlet unit" - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 image not available" + echo "[INFO] [ORCHESTRATOR] Starting omnia_core 2.1 Quadlet unit" + if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 image not available" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." rollback_omnia_core return 1 fi - if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' "$quadlet_file"; then - echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 1.1 in quadlet file" + if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:2.1/' "$quadlet_file"; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 2.1 in quadlet file" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
rollback_omnia_core @@ -1413,13 +1424,13 @@ phase4_container_swap() { systemctl start omnia_core.service || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 1.1 container" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 2.1 container" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." rollback_omnia_core return 1 } - echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 1.1 health check (60s)" + echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 2.1 health check (60s)" for i in $(seq 1 60); do if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then break @@ -1429,7 +1440,7 @@ phase4_container_swap() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 container failed health check" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 container failed health check" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." rollback_omnia_core return 1 @@ -1607,6 +1618,23 @@ restore_from_backup() { return 0 } +# Display cleanup instructions for failed upgrade/rollback +display_cleanup_instructions() { + echo "" + echo -e "${RED}================================================================================${NC}" + echo -e "${RED} ROLLBACK FAILED${NC}" + echo -e "${RED}================================================================================${NC}" + echo "" + echo -e "${YELLOW}Rollback failed. Manual cleanup is required to restore a clean state before retrying.${NC}" + echo "" + echo -e "${YELLOW}Run the following on the OIM host:${NC}" + echo -e "${YELLOW}1. Clean Omnia shared path: rm -rf ${NC}" + echo -e "${YELLOW}2. Stop Omnia core system service: systemctl stop omnia_core${NC}" + echo -e "${YELLOW}3. 
Remove the Omnia core container: podman rm -f omnia_core${NC}" + echo -e "${YELLOW}4. Perform a fresh Omnia core install: ./omnia.sh --install${NC}" + echo "" +} + # Main rollback function rollback_omnia_core() { echo -e "${GREEN}================================================================================${NC}" @@ -1695,11 +1723,12 @@ rollback_omnia_core() { echo "" echo "[INFO] [ROLLBACK] Starting rollback process..." - # Step 1: Stop 1.1 container gracefully + # Step 1: Stop 2.1 container gracefully echo "" - echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 1.1 container..." + echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 2.1 container..." if ! stop_container_gracefully "omnia_core" 30; then echo -e "${RED}ERROR: Failed to stop container.${NC}" + display_cleanup_instructions exit 1 fi @@ -1715,6 +1744,7 @@ rollback_omnia_core() { podman tag omnia_core:latest omnia_core:1.0 else echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}" + display_cleanup_instructions exit 1 fi fi @@ -1725,6 +1755,7 @@ rollback_omnia_core() { systemctl daemon-reload if ! systemctl start omnia_core.service; then echo -e "${RED}ERROR: Failed to start container service.${NC}" + display_cleanup_instructions exit 1 fi @@ -1747,6 +1778,7 @@ rollback_omnia_core() { if [ $health_count -ge $health_timeout ]; then echo "" echo -e "${RED}ERROR: Container failed to become healthy within 60 seconds.${NC}" + display_cleanup_instructions exit 1 fi @@ -1755,6 +1787,7 @@ rollback_omnia_core() { echo "[INFO] [ROLLBACK] Step 5: Validating backup directory structure..." if ! validate_backup_directory "$selected_backup"; then echo -e "${RED}ERROR: Backup validation failed.${NC}" + display_cleanup_instructions exit 1 fi @@ -1763,6 +1796,7 @@ rollback_omnia_core() { echo "[INFO] [ROLLBACK] Step 6: Restoring files from backup..." if ! 
restore_from_backup "$selected_backup"; then echo -e "${RED}ERROR: Failed to restore from backup.${NC}" + display_cleanup_instructions exit 1 fi @@ -1773,6 +1807,7 @@ rollback_omnia_core() { if [ "$verify_version" != "$backup_version" ]; then echo -e "${RED}ERROR: Version verification failed. Expected: $backup_version, Found: $verify_version${NC}" + display_cleanup_instructions exit 1 fi From a39e26f82cbe954e492e6438a745dce13e042b1f Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 13 Feb 2026 06:38:40 +0000 Subject: [PATCH 154/172] updating /etc/hosts entries --- .../discovery_validations/tasks/update_hosts.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml index 43e7d3fc63..85c9ecf611 100644 --- a/discovery/roles/discovery_validations/tasks/update_hosts.yml +++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml @@ -13,16 +13,22 @@ # limitations under the License. 
--- -- name: Add hosts file entry for cluster +- name: Ensure 127.0.0.1 localhost entry exists ansible.builtin.shell: | set -o pipefail - grep -qxF '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' {{ hosts_file_path }} || \ - echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} + grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} + changed_when: true + +- name: Remove stale entries for IPs that are being updated + ansible.builtin.shell: | + set -o pipefail + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp && cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} && rm -f {{ hosts_file_path }}.tmp changed_when: true loop: "{{ read_mapping_file.dict | dict2items }}" -- name: Ensure 127.0.0.1 localhost entry exists uniquely using echo +- name: Add hosts file entry for cluster ansible.builtin.shell: | set -o pipefail - grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} + echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" From 00fd2e2942b97d2610cb720ba4b647bde3d876c6 Mon Sep 17 00:00:00 2001 From: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:43:26 +0530 Subject: [PATCH 155/172] Update service_k8s.json Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> --- input/config/x86_64/rhel/10.0/service_k8s.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index afc073a19f..0ef4408a7f 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ 
b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -33,7 +33,7 @@ { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm"} + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "service_kube_control_plane": { From 7b98e5ecd47d1d46b51aba587d4ee6eb99feeb7e Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 13 Feb 2026 07:19:48 +0000 Subject: [PATCH 156/172] lint issue fixed --- discovery/roles/discovery_validations/tasks/update_hosts.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml index 85c9ecf611..f040dd997f 100644 --- a/discovery/roles/discovery_validations/tasks/update_hosts.yml +++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml @@ -22,7 +22,9 @@ - name: Remove stale entries for IPs that are being updated ansible.builtin.shell: | set -o pipefail - grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp && cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} && rm -f {{ hosts_file_path }}.tmp + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp + cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} + rm -f {{ hosts_file_path }}.tmp changed_when: true loop: "{{ read_mapping_file.dict | dict2items }}" From 6ff5423831736dc86ea5227bd1702b553ccf81af Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Fri, 13 Feb 2026 07:26:03 +0000 Subject: [PATCH 157/172] Add 
user registry to crio.conf Signed-off-by: Vrinda_Marwah --- .../tasks/fetch_additional_images.yml | 9 +++++++++ ...ervice_kube_control_plane_first_x86_64.yaml.j2 | 15 ++++++++++++--- ...roup-service_kube_control_plane_x86_64.yaml.j2 | 15 ++++++++++++--- .../ci-group-service_kube_node_x86_64.yaml.j2 | 14 +++++++++++--- discovery/roles/configure_ochami/vars/main.yml | 1 + 5 files changed, 45 insertions(+), 9 deletions(-) diff --git a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml index 2fecb895e8..ca13f0c414 100644 --- a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml +++ b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml @@ -42,3 +42,12 @@ ansible.builtin.debug: var: additional_images_dict verbosity: 2 + +- name: Read local_repo_config.yml + ansible.builtin.include_vars: + file: "{{ local_repo_config_path }}" + name: local_repo_config + +- name: Set fact for user_registry + ansible.builtin.set_fact: + user_registry: "{{ local_repo_config.user_registry | default([]) }}" \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index b8b71bf099..b98df53d7d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -169,6 +169,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} - path: 
/tmp/kube-vip.yaml owner: root:root @@ -415,13 +425,12 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - mv /tmp/generate-control-plane-join.sh {{ k8s_client_mount_path }} - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_control_plane_first' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index f3ba7a7330..922f63f852 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -147,6 +147,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} - path: /tmp/kube-vip.yaml owner: root:root permissions: '0644' @@ -323,12 +333,11 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf 
/etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_control_plane' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index b380030ddd..df98035baa 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -146,7 +146,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} runcmd: - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" @@ -226,12 +235,11 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_node' %} {% include 'pull_additional_images.yaml.j2' %} diff --git 
a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml index 7f75daa01d..053ee15c0d 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -108,3 +108,4 @@ cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cud # Usage: fetch_additional_images.yml input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" software_config_file_path: "{{ input_project_dir }}/software_config.json" +local_repo_config_path: "{{ input_project_dir }}/local_repo_config.yml" From a70b838c3a7e4707d0f0235b0c350e13d598c36f Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 13 Feb 2026 08:14:30 +0000 Subject: [PATCH 158/172] duplicated hostnames --- discovery/roles/discovery_validations/tasks/update_hosts.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml index f040dd997f..bd046032bc 100644 --- a/discovery/roles/discovery_validations/tasks/update_hosts.yml +++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml @@ -19,10 +19,11 @@ grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} changed_when: true -- name: Remove stale entries for IPs that are being updated +- name: Remove stale entries for IPs and hostnames that are being updated ansible.builtin.shell: | set -o pipefail - grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ + grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} rm -f {{ hosts_file_path }}.tmp changed_when: true From aba17ded12da3c66de984e0cabb6dce24f7ca1a4 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 
Date: Fri, 13 Feb 2026 14:05:55 +0530 Subject: [PATCH 159/172] Update omnia.sh --- omnia.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/omnia.sh b/omnia.sh index 81e2094ccc..b7a086545d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -164,7 +164,7 @@ setup_omnia_core() { # It removes the container and performs the necessary cleanup steps. cleanup_omnia_core() { # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}" echo "$critical_running" @@ -272,7 +272,7 @@ cleanup_config(){ # Otherwise, it prints an error message. remove_container() { # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. 
There are other critical service containers still running:${NC}" echo "$critical_running" @@ -1083,7 +1083,7 @@ install_omnia_core() { # If the user wants to reinstall, call the remove_container function, and then call the setup_omnia_core function if [ "$choice" = "2" ]; then # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}" echo "$critical_running" From 7c79b599c8fd89b75cdaf2eb082d9b95449cf84a Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Fri, 13 Feb 2026 08:47:06 +0000 Subject: [PATCH 160/172] resolve input validation + lint Signed-off-by: Vrinda_Marwah --- .../validation_flows/common_validation.py | 13 +++++++++++++ .../tasks/fetch_additional_images.yml | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 198c527440..f577a4e9b8 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -233,6 +233,19 @@ def validate_software_config( ) ) + # Check for required subgroups when specific software names are present + software_requiring_subgroups = ["additional_packages", "slurm_custom", "service_k8s"] + for software_name in software_requiring_subgroups: + if 
software_name in software_names: + if software_name not in data or not data[software_name]: + errors.append( + create_error_msg( + "Validation Error: ", + software_name, + f"is present in softwares but corresponding subgroup '{software_name}' is missing or empty in software_config.json. Please refer examples directory for the correct format." + ) + ) + for software_pkg in data['softwares']: software = software_pkg['name'] arch_list = software_pkg.get('arch') diff --git a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml index ca13f0c414..d4e8425749 100644 --- a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml +++ b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml @@ -43,11 +43,11 @@ var: additional_images_dict verbosity: 2 -- name: Read local_repo_config.yml +- name: Read local_repo_config.yml ansible.builtin.include_vars: file: "{{ local_repo_config_path }}" name: local_repo_config - name: Set fact for user_registry ansible.builtin.set_fact: - user_registry: "{{ local_repo_config.user_registry | default([]) }}" \ No newline at end of file + user_registry: "{{ local_repo_config.user_registry | default([]) }}" From 40f1595cd15c9f59b4c653c679a0acfaa1eb6c57 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Fri, 13 Feb 2026 16:09:23 +0530 Subject: [PATCH 161/172] Removed slurmd dependency issue where ssh key changes on slurmctld, live --- ...-group-login_compiler_node_aarch64.yaml.j2 | 8 +++-- ...i-group-login_compiler_node_x86_64.yaml.j2 | 8 +++-- .../ci-group-login_node_aarch64.yaml.j2 | 7 +++- .../ci-group-login_node_x86_64.yaml.j2 | 7 +++- .../ci-group-slurm_node_aarch64.yaml.j2 | 8 +++-- .../ci-group-slurm_node_x86_64.yaml.j2 | 7 ++-- .../slurm_config/tasks/check_ctld_running.yml | 32 +++++++++++++------ discovery/roles/slurm_config/tasks/confs.yml | 2 ++ .../slurm_config/tasks/create_slurm_dir.yml | 12 +------ 
.../slurm_config/tasks/update_hosts_munge.yml | 1 + .../slurm_config/templates/slurmd.service.j2 | 22 ------------- 11 files changed, 62 insertions(+), 52 deletions(-) delete mode 100644 discovery/roles/slurm_config/templates/slurmd.service.j2 diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index dc2ddf9dcd..8918f03050 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -209,6 +209,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -278,12 +284,10 @@ {% if hostvars['localhost']['ldms_support'] %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - - /root/ldms_sampler.sh {% endif %} - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 2c23b868c0..51121a2e82 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ 
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -209,6 +209,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -282,12 +288,10 @@ {% if hostvars['localhost']['ldms_support'] %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - - /root/ldms_sampler.sh {% endif %} - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 8b3d771592..4aacc2222d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -102,6 +102,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -131,7 +137,6 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - 
/usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 4e68ba8d81..524553bd55 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -108,6 +108,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -142,7 +148,6 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 06a04a6068..dacade639b 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -277,8 +277,6 @@ echo "[INFO] ===== Starting slurmd setup (service file, directories, epilog) (aarch64) =====" - 
echo "[INFO] Copying slurmd.service into /usr/lib/systemd/system/" - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ bash /usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" @@ -415,6 +413,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index c1b532908e..d21fcf9c5c 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -244,6 +244,11 @@ {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' @@ -288,8 +293,6 @@ echo "[INFO] ===== Starting slurmd setup (service file, directories, epilog) =====" - echo "[INFO] Copying slurmd.service into /usr/lib/systemd/system/" - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ bash /usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 5f2d41a904..7d908169ab 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -14,30 +14,37 @@ --- - name: Check if remote host is reachable via SSH 
ansible.builtin.wait_for: - host: "{{ item }}" + host: "{{ ctld }}" port: 22 # TODO: make it configurable timeout: 10 state: started delegate_to: localhost register: ssh_check ignore_errors: true - ignore_unreachable: true -- name: Block when ssh_check is success - when: ssh_check is success +- name: Enter slurm controller when pingable + when: + - ssh_check is success + ignore_unreachable: true block: - name: Initialize ctld_state dict ansible.builtin.set_fact: - ctld_state: "{{ ctld_state | default({}) | combine({item: false}) }}" + ctld_state: "{{ ctld_state | default({}) | combine({ctld: false}) }}" - name: Check if slurmctld is running on remote host ansible.builtin.service_facts: - delegate_to: "{{ item }}" + delegate_to: "{{ ctld }}" register: service_facts + ignore_unreachable: true + + - name: Fail if slurmctld is unreachable + ansible.builtin.fail: + msg: "Failed to connect to {{ ctld }}." + when: service_facts is unreachable - name: Update ctld_state if slurmctld is running ansible.builtin.set_fact: - ctld_state: "{{ ctld_state | combine({item: true}) }}" + ctld_state: "{{ ctld_state | combine({ctld: true}) }}" when: - service_facts is success - ansible_facts.services['slurmctld.service'] is defined @@ -72,6 +79,13 @@ changed_when: scontrol_reconfig.rc == 0 failed_when: false register: scontrol_reconfig - delegate_to: "{{ item }}" + delegate_to: "{{ ctld }}" when: - - ctld_state[item] is true + - ctld_state[ctld] is true + + rescue: + - name: Fail if slurmctld is not running on any host + ansible.builtin.debug: + msg: "Failed to 'scontrol reconfigure' on {{ ctld }}. + As task '{{ ansible_failed_task.name }}' failed. 
+ results: {{ ansible_failed_result }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 799d4cd757..c5f7953b0d 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -197,3 +197,5 @@ - ctld_list - ctld_conf_files is changed loop: "{{ ctld_list }}" + loop_control: + loop_var: ctld diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 45e37ac243..e4ac760d77 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -194,17 +194,7 @@ group: "{{ root_group }}" mode: "{{ common_mode }}" when: cmpt_list - loop: "{{ cmpt_list | product(['logout_user.sh', 'slurmd.service']) }}" - -- name: Create logout_user.sh and slurmd.service in login and login_compiler - ansible.builtin.template: - src: "{{ item.1 }}.j2" - dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}" - owner: "{{ root_user }}" - group: "{{ root_group }}" - mode: "{{ conf_file_mode }}" - when: login_list or compiler_login_list - loop: "{{ (login_list + compiler_login_list) | product(['slurmd.service']) }}" + loop: "{{ cmpt_list | product(['logout_user.sh']) }}" - name: Get the slurm NFS path ansible.builtin.debug: diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml index a326fa820d..64c36dbeaf 100644 --- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -30,6 +30,7 @@ munge_key_changed: "{{ munge_key_copy.results | default([]) | rekey_on_member('item') }}" when: munge_key_copy is defined +# TODO: Clean unreachable handling - name: Block when munge key changed when: - munge_key_changed is defined diff --git a/discovery/roles/slurm_config/templates/slurmd.service.j2 
b/discovery/roles/slurm_config/templates/slurmd.service.j2 deleted file mode 100644 index 294d1fda75..0000000000 --- a/discovery/roles/slurm_config/templates/slurmd.service.j2 +++ /dev/null @@ -1,22 +0,0 @@ -[Unit] -Description=Slurm node daemon -After=munge.service network-online.target remote-fs.target sssd.service -Wants=network-online.target - -[Service] -Type=notify -EnvironmentFile=-/etc/sysconfig/slurmd -EnvironmentFile=-/etc/default/slurmd -RuntimeDirectory=slurm -RuntimeDirectoryMode=0755 -ExecStart=/usr/sbin/slurmd --systemd $SLURMD_OPTIONS {{ conf_server }} -ExecReload=/bin/kill -HUP $MAINPID -KillMode=process -LimitNOFILE=131072 -LimitMEMLOCK=infinity -LimitSTACK=infinity -Delegate=yes -TasksMax=infinity - -[Install] -WantedBy=multi-user.target \ No newline at end of file From 3f516a3dd38d4923dd318e9600fb110f457700cf Mon Sep 17 00:00:00 2001 From: pullan1 Date: Fri, 13 Feb 2026 20:32:27 +0530 Subject: [PATCH 162/172] Fix for local repo is failing as cuda run package download issue Signed-off-by: pullan1 --- .../local_repo/parse_and_download.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 72efd4566b..c8b8278eef 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -64,6 +64,26 @@ def execute_command(cmd_string, logger, type_json=False): stderr=subprocess.PIPE, shell=True, ) + status["returncode"] = cmd.returncode + status["stdout"] = cmd.stdout.strip() if cmd.stdout else None + status["stderr"] = cmd.stderr.strip() if cmd.stderr else None + + if cmd.returncode != 0: + logger.error(f"Command failed with return code {cmd.returncode}") + logger.error(f"Error: {status['stderr']}") + return False + + if type_json: + if not status["stdout"]: + logger.error("Command succeeded but returned empty output when JSON was 
expected") + return False + try: + status["stdout"] = json.loads(status["stdout"]) + except json.JSONDecodeError as error: + logger.error(f"Failed to parse JSON output: {error}") + logger.error(f"Raw output was: {status['stdout']}") + return False + logger.info(f"Command succeeded: {cmd_string}") return True except subprocess.CalledProcessError as e: From f531576a0a3ff35bb969225716f15b73c1329ce7 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 14:27:10 +0530 Subject: [PATCH 163/172] Addition of user guidance messages for cluster reprovisioning and rollback after upgrade to 2.1 (#3978) * Added user guidance messages in rollback_omnia.yml and upgrade_cluster.yml * Modification of Rollback guidance message * Update main.yml * Update main.yml * Update main.yml * Update main.yml * Update main.yml * Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 55 ++++++++++++++++++-- upgrade/rollback_omnia.yml | 54 +++++++++++++++++++ 2 files changed, 106 insertions(+), 3 deletions(-) create mode 100644 upgrade/rollback_omnia.yml diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 196366870b..90b25611b5 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -13,6 +13,55 @@ # limitations under the License. --- -- name: Include import input parameters - ansible.builtin.include_role: - name: import_input_parameters + +- name: Display cluster reprovision guidance + ansible.builtin.pause: + prompt: "{{ '\x1b[32m' }}=================================================== + CLUSTER REPROVISION REQUIRED + =========================================================== + + Cluster reprovisioning is required after upgrade to enable new features. + + Review and update new 2.1 input fields present at /opt/omnia/input/project_default/ directory before reprovisioning: + + 1. 
local_repo_config.yml + + - Set additional_repos_x86_64 (list of extra repo URLs or file paths for x86_64) + + - Set additional_repos_aarch64 (list of extra repo URLs or file paths for aarch64) + + 2. network_spec.yml (ib_network section) + + - Define InfiniBand fabric settings (subnet manager/BMC, IP ranges, VLAN if applicable) + + - Ensure host IB interfaces map to the IB network entries + + 3. omnia_config.yml (slurm_cluster.config_source) + + - Use the new structure: config_source: { type: , location: } + + - Populate location to point to your Slurm config bundle (local path or remote URL) + + Do NFS cleanup (if NFS share is used for k8s/slurm) + + - Clean stale mounts and ensure the NFS share is accessible before reprovision + + - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment + + + Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster: + + 1. ansible-playbook local_repo/local_repo.yml + + 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml + + 3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64: + + ansible-playbook build_image_aarch64/build_image_aarch64.yml + + 4. ansible-playbook discovery/discovery.yml + + Please follow the omnia documentation for steps in more detail. + + {{ '\x1b[0m' }}" + seconds: 1 diff --git a/upgrade/rollback_omnia.yml b/upgrade/rollback_omnia.yml new file mode 100644 index 0000000000..c0d5080c22 --- /dev/null +++ b/upgrade/rollback_omnia.yml @@ -0,0 +1,54 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Rollback Omnia guidance + hosts: localhost + connection: local + gather_facts: false + vars: + oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" + tasks: + - name: Read oim_metadata.yml for backup details + ansible.builtin.slurp: + src: "{{ oim_metadata_path }}" + register: oim_metadata_slurp + ignore_errors: true + + - name: Parse oim_metadata.yml + ansible.builtin.set_fact: + oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}" + when: oim_metadata_slurp is defined and oim_metadata_slurp.content is defined + + - name: Derive backup_version from upgrade_backup_dir + ansible.builtin.set_fact: + backup_version: "{{ (oim_metadata.upgrade_backup_dir | regex_search('version_([^/]+)', '\\1')) + | default('previous version', true) }}" + when: oim_metadata is defined and oim_metadata.upgrade_backup_dir is defined + + - name: Display rollback guidance (green) + ansible.builtin.debug: + msg: + - "=================================" + - " OMNIA ROLLBACK" + - "=================================" + - "" + - "[Rollback Actions]" + - "1. Purpose: restore Omnia core to the last backup version (includes configs and container state)." + - "2. Target version: {{ backup_version | default('previous version from the backup location') }}." + - "3. How to run:" + - " - Exit the Omnia core container shell if you are inside it." + - " - From the OIM host prompt, execute: ./omnia.sh --rollback" + - "4. Note: ensure the backup location is accessible on the OIM host before running rollback." 
+ - name: End play + ansible.builtin.meta: end_play From 8066a19d5542f2acaaf042e8dd5ccb92cdbb9b32 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Mon, 16 Feb 2026 12:04:27 +0000 Subject: [PATCH 164/172] fix status return in execute command Signed-off-by: Vrinda_Marwah --- common/library/module_utils/local_repo/parse_and_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index c8b8278eef..15bed1efb3 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -85,7 +85,7 @@ def execute_command(cmd_string, logger, type_json=False): return False logger.info(f"Command succeeded: {cmd_string}") - return True + return status except subprocess.CalledProcessError as e: logger.error(f"Command failed: {cmd_string} - {e}") return False From f0928443075d08a01973bb8b6f3921d9b16c0ea4 Mon Sep 17 00:00:00 2001 From: Nethravathi M G <146437298+nethramg@users.noreply.github.com> Date: Mon, 16 Feb 2026 23:12:44 +0530 Subject: [PATCH 165/172] Initial iDRAC Telemetry Node addition and deletion changes (#3972) * Initial set of changes for iDRAC Telemetry add and remove node * Ansible link and pylint fixes * Ansible lint fixes * Updated Copyrights to 2026 * Addressed the comments --- .../modules/delete_idracips_from_mysqldb.py | 251 ++++++++++++++++++ .../modules/disable_idrac_telemetry.py | 184 +++++++++++++ .../initiate_telemetry_service_cluster.yml | 5 +- .../tasks/remove_deleted_nodes.yml | 101 +++++++ .../templates/telemetry_report.j2 | 18 ++ telemetry/roles/idrac_telemetry/vars/main.yml | 24 +- 6 files changed, 581 insertions(+), 2 deletions(-) create mode 100644 common/library/modules/delete_idracips_from_mysqldb.py create mode 100644 common/library/modules/disable_idrac_telemetry.py create mode 100644 
telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml diff --git a/common/library/modules/delete_idracips_from_mysqldb.py b/common/library/modules/delete_idracips_from_mysqldb.py new file mode 100644 index 0000000000..cd81b943e2 --- /dev/null +++ b/common/library/modules/delete_idracips_from_mysqldb.py @@ -0,0 +1,251 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +"""Module to delete iDRAC IPs from MySQL database. +This module connects to a Kubernetes pod running MySQL and deletes iDRAC IPs +that are not present in bmc_data.csv. It handles retries and delays for robustness.""" + +import time +from ansible.module_utils.basic import AnsibleModule +from kubernetes import client, config +from kubernetes.stream import stream +from kubernetes.config.config_exception import ConfigException + + +def load_kube_context(): + """Load Kubernetes configuration for accessing the cluster.""" + try: + config.load_kube_config() + except ConfigException: + config.load_incluster_config() + + +def run_mysql_query_in_pod(namespace, pod, container, mysql_user, mysql_password, query): + """Run a MySQL query in the specified pod. 
+ + Args: + namespace: Kubernetes namespace + pod: Pod name + container: Container name + mysql_user: MySQL username + mysql_password: MySQL password + query: MySQL query to execute + + Returns: + dict: Result containing return code and output + """ + core_v1 = client.CoreV1Api() + mysql_command = [ + "mysql", + "-u", mysql_user, + "-N", "-B", + f"-p{mysql_password}", + "-e", query + ] + + try: + ws = stream( + core_v1.connect_get_namespaced_pod_exec, + name=pod, + namespace=namespace, + container=container, + command=mysql_command, + stderr=True, + stdin=False, + stdout=True, + tty=False, + _preload_content=False + ) + + stdout = "" + stderr = "" + + while ws.is_open(): + ws.update(timeout=1) + if ws.peek_stdout(): + stdout += ws.read_stdout() + if ws.peek_stderr(): + stderr += ws.read_stderr() + ws.close() + + rc = ws.returncode + + if rc != 0: + return { + "rc": rc, + "result": stderr.strip() if stderr else "Unknown error" + } + + query_result = [ + line.strip() for line in stdout.strip().splitlines() + if line.strip() and not line.strip().startswith("mysql:") + ] + + return { + "rc": rc, + "result": query_result + } + + except (ConfigException, OSError) as e: + return { + "rc": 1, + "result": str(e) + } + + +def delete_idrac_from_mysql( + namespace, + pod, + container, + mysqldb_name, + mysql_user, + mysql_password, + ip_to_delete, + retries=3, + delay=3 +): + """Delete a single iDRAC IP from MySQL database. 
+ + Args: + namespace: Kubernetes namespace + pod: Pod name + container: Container name + mysqldb_name: MySQL database name + mysql_user: MySQL username + mysql_password: MySQL password + ip_to_delete: IP address to delete + retries: Number of retry attempts + delay: Delay between retries in seconds + + Returns: + dict: Result containing success status and message + """ + query = ( + f"DELETE FROM {mysqldb_name}.services " + f"WHERE ip = '{ip_to_delete}';" + ) + + for attempt in range(retries): + result = run_mysql_query_in_pod( + namespace=namespace, + pod=pod, + container=container, + mysql_user=mysql_user, + mysql_password=mysql_password, + query=query + ) + + if result.get("rc") == 0: + return { + "success": True, + "ip": ip_to_delete, + "msg": f"Successfully deleted iDRAC IP {ip_to_delete} from MySQL." + } + + if attempt < retries - 1: + time.sleep(delay) + + return { + "success": False, + "ip": ip_to_delete, + "msg": f"Failed to delete iDRAC IP {ip_to_delete} after {retries} attempts: {result.get('result')}" + } + + +def main(): + """Main function to execute the module logic.""" + module_args = { + "telemetry_namespace": {"type": "str", "required": True}, + "idrac_podnames": {"type": "list", "required": True}, + "mysqldb_k8s_name": {"type": "str", "required": True}, + "mysqldb_name": {"type": "str", "required": True}, + "mysqldb_user": {"type": "str", "required": True, "no_log": True}, + "mysqldb_password": {"type": "str", "required": True, "no_log": True}, + "ips_to_delete": {"type": "list", "required": True}, + "pod_to_db_idrac_ips": {"type": "dict", "required": True}, + "db_retries": {"type": "int", "default": 3}, + "db_delay": {"type": "int", "default": 3}, + } + + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) + + telemetry_namespace = module.params["telemetry_namespace"] + idrac_podnames = module.params["idrac_podnames"] + mysqldb_k8s_name = module.params["mysqldb_k8s_name"] + mysqldb_name = module.params["mysqldb_name"] + 
mysqldb_user = module.params["mysqldb_user"] + mysqldb_password = module.params["mysqldb_password"] + ips_to_delete = module.params["ips_to_delete"] + pod_to_db_idrac_ips = module.params["pod_to_db_idrac_ips"] + db_retries = module.params["db_retries"] + db_delay = module.params["db_delay"] + + load_kube_context() + + deleted_ips = [] + failed_ips = [] + changed = False + + try: + for pod in idrac_podnames: + pod_ips = pod_to_db_idrac_ips.get(pod, []) + ips_to_delete_from_pod = list(set(pod_ips) & set(ips_to_delete)) + + if not ips_to_delete_from_pod: + module.warn(f"No IPs to delete from pod {pod}. Skipping.") + continue + + module.warn(f"Deleting IPs from pod {pod}: {ips_to_delete_from_pod}") + + for ip in ips_to_delete_from_pod: + result = delete_idrac_from_mysql( + namespace=telemetry_namespace, + pod=pod, + container=mysqldb_k8s_name, + mysqldb_name=mysqldb_name, + mysql_user=mysqldb_user, + mysql_password=mysqldb_password, + ip_to_delete=ip, + retries=db_retries, + delay=db_delay + ) + + if result.get("success"): + deleted_ips.append(ip) + changed = True + else: + failed_ips.append({ + "pod": pod, + "ip": ip, + "msg": result.get("msg", "Unknown error") + }) + + module.exit_json( + changed=changed, + deleted_ips=deleted_ips, + failed_ips=failed_ips, + msg=f"Deleted {len(deleted_ips)} iDRAC IPs from MySQL database." + ) + + except (OSError, ValueError) as e: + module.fail_json( + msg=f"An error occurred while deleting iDRAC IPs from MySQL: {str(e)}", + deleted_ips=deleted_ips, + failed_ips=failed_ips + ) + + +if __name__ == "__main__": + main() diff --git a/common/library/modules/disable_idrac_telemetry.py b/common/library/modules/disable_idrac_telemetry.py new file mode 100644 index 0000000000..cb7b885e1e --- /dev/null +++ b/common/library/modules/disable_idrac_telemetry.py @@ -0,0 +1,184 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +"""Module to disable telemetry on iDRAC nodes via Redfish API. +This module connects to iDRAC nodes and disables telemetry collection +by sending PATCH requests to the Redfish API endpoint.""" + +import requests +import urllib3 +from ansible.module_utils.basic import AnsibleModule + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + +def disable_telemetry_on_idrac(idrac_ip, username, password, timeout=30): + """ + Disable telemetry on a single iDRAC node using Redfish API. 
+ + Args: + idrac_ip: IP address of the iDRAC + username: iDRAC username + password: iDRAC password + timeout: Request timeout in seconds + + Returns: + dict: Result containing success status and message + """ + url = ( + f"https://{idrac_ip}/redfish/v1/Managers/" + f"iDRAC.Embedded.1/Attributes" + ) + + # Try different telemetry property names in order of preference + telemetry_properties = [ + "Telemetry.1.EnableTelemetry", + "TelemetryService.1.EnableTelemetry", + "Telemetry.2.EnableTelemetry", + "Redfish.1.TelemetryServiceEnabled" + ] + + headers = { + "Content-Type": "application/json" + } + + for property_name in telemetry_properties: + payload = { + "Attributes": { + property_name: "Disabled" + } + } + + try: + response = requests.patch( + url, + json=payload, + headers=headers, + auth=(username, password), + verify=False, + timeout=timeout + ) + + if response.status_code in [200, 202, 204]: + return { + "success": True, + "ip": idrac_ip, + "status_code": response.status_code, + "msg": f"Successfully disabled telemetry on iDRAC {idrac_ip} using {property_name}" + } + elif response.status_code == 400: + # Property not supported, try next one + continue + else: + return { + "success": False, + "ip": idrac_ip, + "status_code": response.status_code, + "msg": ( + f"Failed to disable telemetry on iDRAC {idrac_ip}. 
" + f"Status: {response.status_code}, Response: {response.text}" + ) + } + + except requests.exceptions.Timeout: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Timeout while connecting to iDRAC {idrac_ip}" + } + + except requests.exceptions.ConnectionError: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Connection error while connecting to iDRAC {idrac_ip}" + } + + except (requests.exceptions.RequestException, OSError) as e: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Error disabling telemetry on iDRAC {idrac_ip}: {str(e)}" + } + + # All properties failed + return { + "success": False, + "ip": idrac_ip, + "msg": ( + f"Failed to disable telemetry on iDRAC {idrac_ip}. " + f"None of the supported telemetry properties were found: {', '.join(telemetry_properties)}" + ) + } + + +def main(): + """Main function to execute the module logic.""" + module_args = { + "idrac_ips": {"type": "list", "required": True, "elements": "str"}, + "username": {"type": "str", "required": True, "no_log": True}, + "password": {"type": "str", "required": True, "no_log": True}, + "timeout": {"type": "int", "default": 30}, + } + + module = AnsibleModule( + argument_spec=module_args, + supports_check_mode=True + ) + + idrac_ips = module.params["idrac_ips"] + username = module.params["username"] + password = module.params["password"] + timeout = module.params["timeout"] + + disabled_ips = [] + failed_ips = [] + changed = False + + try: + for idrac_ip in idrac_ips: + result = disable_telemetry_on_idrac( + idrac_ip=idrac_ip, + username=username, + password=password, + timeout=timeout + ) + + if result.get("success"): + disabled_ips.append(idrac_ip) + changed = True + else: + failed_ips.append({ + "ip": idrac_ip, + "msg": result.get("msg", "Unknown error") + }) + + module.exit_json( + changed=changed, + disabled_ips=disabled_ips, + failed_ips=failed_ips, + msg=f"Disabled telemetry on {len(disabled_ips)} iDRAC nodes." 
+ ) + + except (requests.exceptions.RequestException, OSError) as e: + module.fail_json( + msg=f"An error occurred while disabling telemetry: {str(e)}", + disabled_ips=disabled_ips, + failed_ips=failed_ips + ) + + +if __name__ == "__main__": + main() diff --git a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml index 8615897205..7078a2f056 100644 --- a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml +++ b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -87,6 +87,9 @@ ansible.builtin.debug: msg: "Filtered BMC IPs: {{ filtered_bmc_ip_list }}" +- name: Remove deleted nodes from telemetry (nodes not in bmc_data.csv) + ansible.builtin.include_tasks: remove_deleted_nodes.yml + - name: Convert filtered_bmc_ip_list to a dictionary with bmc_ip ansible.builtin.set_fact: filtered_bmc_ip_dict_list: "{{ filtered_bmc_ip_list | map('community.general.dict_kv', 'bmc_ip') | list }}" diff --git a/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml b/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml new file mode 100644 index 0000000000..4c82abf9e1 --- /dev/null +++ b/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml @@ -0,0 +1,101 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Identify iDRAC IPs to remove (present in DB but not in bmc_data.csv) + ansible.builtin.set_fact: + ips_to_remove: "{{ db_idrac_ip_list | difference(bmc_ips) }}" + +- name: Show iDRAC IPs to be removed + ansible.builtin.debug: + msg: "iDRAC IPs to be removed: {{ ips_to_remove }}" + when: ips_to_remove | length > 0 + +- name: Skip removal if no IPs to remove + ansible.builtin.debug: + msg: "{{ no_idracips_to_remove_msg }}" + when: ips_to_remove | length == 0 + +- name: Disable telemetry on iDRAC nodes before removal + when: ips_to_remove | length > 0 + block: + - name: Disable telemetry service on iDRAC nodes + disable_idrac_telemetry: + idrac_ips: "{{ ips_to_remove }}" + username: "{{ hostvars['localhost']['bmc_username'] }}" + password: "{{ hostvars['localhost']['bmc_password'] }}" + timeout: "{{ redfish_timeout }}" + register: disable_telemetry_result + ignore_errors: true + + - name: Show successfully disabled telemetry IPs + ansible.builtin.debug: + msg: "Successfully disabled telemetry on: {{ disable_telemetry_result.disabled_ips | default([]) }}" + when: + - disable_telemetry_result.disabled_ips is defined + - disable_telemetry_result.disabled_ips | length > 0 + + - name: Show failed to disable telemetry IPs + ansible.builtin.debug: + msg: "Failed to disable telemetry on: {{ disable_telemetry_result.failed_ips | default([]) }}" + when: + - disable_telemetry_result.failed_ips is defined + - disable_telemetry_result.failed_ips | length > 0 + +- name: Remove iDRAC IPs from MySQL database + when: ips_to_remove | length > 0 + block: + - 
name: Delete iDRAC IPs from mysqldb + delete_idracips_from_mysqldb: + telemetry_namespace: "{{ telemetry_namespace }}" + idrac_podnames: "{{ idrac_podname_idracips.idrac_podname_ips.keys() | list }}" + mysqldb_k8s_name: "{{ mysqldb_k8s_name }}" + mysqldb_name: "{{ mysqldb_name }}" + mysqldb_user: "{{ hostvars['localhost']['mysqldb_user'] }}" + mysqldb_password: "{{ hostvars['localhost']['mysqldb_password'] }}" + ips_to_delete: "{{ ips_to_remove }}" + pod_to_db_idrac_ips: "{{ existing_pod_to_db_idrac_ips }}" + db_retries: "{{ db_retries }}" + db_delay: "{{ db_delay }}" + register: delete_idrac_result + rescue: + - name: Failed to delete iDRAC IPs from mysqldb + ansible.builtin.fail: + msg: "{{ mysqldb_delete_fail_msg }}" + +- name: Show deleted iDRAC IPs + ansible.builtin.debug: + msg: "Successfully deleted iDRAC IPs from mysqldb: {{ delete_idrac_result.deleted_ips | default([]) }}" + when: + - ips_to_remove | length > 0 + - delete_idrac_result.deleted_ips is defined + - delete_idrac_result.deleted_ips | length > 0 + +- name: Show failed to delete iDRAC IPs + ansible.builtin.debug: + msg: "Failed to delete iDRAC IPs from mysqldb: {{ delete_idrac_result.failed_ips | default([]) }}" + when: + - ips_to_remove | length > 0 + - delete_idrac_result.failed_ips is defined + - delete_idrac_result.failed_ips | length > 0 + +- name: Update telemetry report variables with deletion info + ansible.builtin.set_fact: + deleted_idrac_count: "{{ delete_idrac_result.deleted_ips | default([]) | length }}" + deleted_idrac_ips: "{{ delete_idrac_result.deleted_ips | default([]) }}" + failed_delete_count: "{{ delete_idrac_result.failed_ips | default([]) | length }}" + failed_delete_ips: "{{ delete_idrac_result.failed_ips | default([]) }}" + disabled_telemetry_count: "{{ disable_telemetry_result.disabled_ips | default([]) | length }}" + disabled_telemetry_ips: "{{ disable_telemetry_result.disabled_ips | default([]) }}" + when: ips_to_remove | length > 0 diff --git 
a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 index 4d8554cab3..06bf230980 100644 --- a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 +++ b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 @@ -14,5 +14,23 @@ Telemetry not supported IPs List: - {{ item }} {% endfor %} +{% if deleted_idrac_count is defined and deleted_idrac_count | int > 0 %} +----- Node Deletion Report ----- + +Total IP count removed from telemetry (not in bmc_data.csv): {{ deleted_idrac_count | int }} +Removed IPs List: +{% for item in deleted_idrac_ips %} + - {{ item }} +{% endfor %} + +{% if disabled_telemetry_count is defined and disabled_telemetry_count | int > 0 %} +IPs with telemetry disabled via Redfish: {{ disabled_telemetry_count | int }} +Disabled telemetry IPs List: +{% for item in disabled_telemetry_ips %} + - {{ item }} +{% endfor %} +{% endif %} +{% endif %} + ===== Telemetry Report End ===== diff --git a/telemetry/roles/idrac_telemetry/vars/main.yml b/telemetry/roles/idrac_telemetry/vars/main.yml index d2696f4ac8..7fe6730789 100644 --- a/telemetry/roles/idrac_telemetry/vars/main.yml +++ b/telemetry/roles/idrac_telemetry/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -67,6 +67,13 @@ idrac_telemetry_statefulset_restart_failure_msg: | Failed to restart the {{ idrac_telemetry_k8s_name }} StatefulSet. Please check the logs using the command kubectl logs -n {{ telemetry_namespace }} {{ idrac_telemetry_k8s_name }}- and try again. +# Usage: remove_deleted_nodes.yml +redfish_timeout: 30 +mysqldb_delete_fail_msg: | + Failed to delete iDRAC IPs from the mysql database. 
+ This could be due to the tables in the mysqldb not being accessible at the moment. Please try running the playbook again after some time. +no_idracips_to_remove_msg: "No iDRAC IPs to remove. All DB entries are present in bmc_data.csv." + # Usage: create_telemetry_report.yml telemetry_report_path: "/opt/omnia/telemetry/idrac_telemetry_report.yml" telemetry_report_template: "telemetry_report.j2" @@ -75,6 +82,9 @@ telemetry_report: | IP count with Telemetry not supported: {{ failed_idrac_count | int + invalid_idrac_count | int }} IP count with Telemetry activated in current execution: {{ telemetry_idrac_count | int }} + {% if deleted_idrac_count is defined %} + IP count removed from telemetry (not in bmc_data.csv): {{ deleted_idrac_count | int }} + {% endif %} {% if (failed_idrac_count | int + invalid_idrac_count | int) > 0 %} Potential reasons for telemetry not being initiated include Redfish connectivity problems, timeout issues, @@ -105,3 +115,15 @@ telemetry_report: | - {{ item }} {% endfor %} {% endif %} + {% if deleted_idrac_ips is defined and deleted_idrac_ips | length > 0 %} + IPs removed from telemetry database (not present in bmc_data.csv): + {% for item in deleted_idrac_ips %} + - {{ item }} + {% endfor %} + {% endif %} + {% if disabled_telemetry_ips is defined and disabled_telemetry_ips | length > 0 %} + IPs with telemetry disabled via Redfish: + {% for item in disabled_telemetry_ips %} + - {{ item }} + {% endfor %} + {% endif %} From 128cac669d133c7c6eb1f52b37b1d201e1a3810a Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Tue, 17 Feb 2026 08:37:15 +0530 Subject: [PATCH 166/172] support multiple Omnia versions (2.1.0.0, 2.1.0.1) using a single core container tag (2.1) (#3983) --- omnia.sh | 782 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 622 insertions(+), 160 deletions(-) diff --git a/omnia.sh b/omnia.sh index b7a086545d..3b320b0bf6 100755 --- a/omnia.sh +++ b/omnia.sh @@ -52,11 +52,226 @@ is_local_ip() { fi } +# 
Version configuration variables +OMNIA_CORE_CONTAINER_TAG="2.1" # Default container tag +OMNIA_VERSION="" # Will be read from metadata +TARGET_OMNIA_VERSION="" # Target version for upgrade +TARGET_CONTAINER_TAG="" # Target container tag for upgrade + +# Centralized version list (in chronological order) +ALL_OMNIA_VERSIONS=("2.0.0.0" "2.1.0.0") + # Container-side paths (used inside podman exec commands) CONTAINER_INPUT_DIR="/opt/omnia/input" CONTAINER_BACKUPS_DIR="/opt/omnia/backups" CONTAINER_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml" +# Function to get available upgrade versions (higher than current) +get_available_upgrade_versions() { + local current_version="$1" + local available_versions=() + local version_descriptions=() + + # Find versions higher than current + local found_current=false + for version in "${ALL_OMNIA_VERSIONS[@]}"; do + if [ "$version" = "$current_version" ]; then + found_current=true + continue + fi + + if [ "$found_current" = true ]; then + available_versions+=("$version") + + # Generate description based on upgrade type + local current_tag=$(get_container_tag_from_version "$current_version") + local target_tag=$(get_container_tag_from_version "$version") + + if [ "$current_tag" = "$target_tag" ]; then + version_descriptions+=("Patch upgrade to $version (container restart only)") + else + version_descriptions+=("Major upgrade to $version (container swap required)") + fi + fi + done + + # Return arrays + printf '%s\n' "${available_versions[@]}" + printf '%s\n' "${version_descriptions[@]}" +} + +# Function to get available rollback versions (lower than current) +get_available_rollback_versions() { + local current_version="$1" + local available_versions=() + + # Find versions lower than current + for version in "${ALL_OMNIA_VERSIONS[@]}"; do + if [ "$version" = "$current_version" ]; then + break + fi + available_versions+=("$version") + done + + # Return array (reverse order for rollback - newest first) + local reversed_versions=() + for 
((i=${#available_versions[@]}-1; i>=0; i--)); do + reversed_versions+=("${available_versions[$i]}") + done + + printf '%s\n' "${reversed_versions[@]}" +} + +# Function to perform same-tag rollback (container restart only) +rollback_same_tag() { + local target_version="$1" + local current_version="$2" + + echo "[INFO] [ROLLBACK] Phase: Same-Tag Rollback" + echo "[INFO] [ROLLBACK] Rolling back to $target_version within same container tag" + + # Verify container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ROLLBACK] Container is not running for same-tag rollback" + return 1 + fi + + echo "[INFO] [ROLLBACK] Updating metadata to version $target_version" + + # Update version metadata + if ! podman exec -u root omnia_core bash -c " + set -e + if [ ! -f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE' + else + echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE' + fi + "; then + echo "[ERROR] [ROLLBACK] Failed to update metadata version" + echo "[ERROR] [ROLLBACK] Rollback failed: Could not update version metadata" + return 1 + fi + + echo "[INFO] [ROLLBACK] Restarting container to apply changes..." + + # Restart container to apply changes + if ! 
systemctl restart omnia_core.service; then + echo "[ERROR] [ROLLBACK] Failed to restart container service" + echo "[ERROR] [ROLLBACK] Rollback failed: Container restart failed" + return 1 + fi + + # Wait for container to be healthy after restart + echo "[INFO] [ROLLBACK] Waiting for container health check after restart (30s)" + local health_timeout=30 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ROLLBACK] Container is healthy after restart" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." + done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo "[ERROR] [ROLLBACK] Container failed to become healthy within 30 seconds after restart" + echo "[ERROR] [ROLLBACK] Rollback failed: Container health check failed" + return 1 + fi + + # Verify version update + local updated_version=$(get_current_omnia_version) + if [ "$updated_version" != "$target_version" ]; then + echo "[ERROR] [ROLLBACK] Version update verification failed" + echo "[ERROR] [ROLLBACK] Expected: $target_version, Found: $updated_version" + return 1 + fi + + echo "[INFO] [ROLLBACK] Same-tag rollback completed successfully" + echo "[INFO] [ROLLBACK] Version rolled back to: $target_version" + return 0 +} + +# Function to validate container image availability and show build instructions +validate_container_image() { + local target_version="$1" + local target_container_tag="$2" + local operation="${3:-upgrade}" + + echo -e "${BLUE}Validating target container image: omnia_core:$target_container_tag${NC}" + if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then + echo -e "${RED}ERROR: Target image missing locally: omnia_core:$target_container_tag${NC}" + echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" + echo -e "1. 
Clone the Omnia Artifactory repository:" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-$target_version" + echo -e "2. Navigate to the repository directory:" + echo -e " cd omnia-artifactory" + echo -e "3. Build the core image locally (loads into local Podman by default):" + echo -e " ./build_images.sh core core_tag=$target_container_tag omnia_branch=$target_version" + echo -e "Then re-run:" + echo -e " ./omnia.sh --$operation" + return 1 + fi + + echo -e "${GREEN}✓ Target image available locally: omnia_core:$target_container_tag${NC}" + return 0 +} + +# Function to get container tag from omnia version +get_container_tag_from_version() { + local version="$1" + case "$version" in + 2.0.*) + echo "1.0" + ;; + *) + echo "$(echo "$version" | awk -F. '{print $1"."$2}')" + ;; + esac +} + +# Function to read current omnia version from metadata +get_current_omnia_version() { + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + podman exec omnia_core cat /opt/omnia/.data/oim_metadata.yml 2>/dev/null | grep "omnia_version:" | awk '{print $2}' | tr -d '"' + else + echo "" + fi +} + +show_post_upgrade_instructions() { + local upgraded_version="$1" + + echo "" + echo -e "${YELLOW}================================================================================${NC}" + echo -e "${YELLOW} IMPORTANT POST-UPGRADE STEP${NC}" + echo -e "${YELLOW}================================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Omnia core container has been successfully upgraded${NC}" + echo -e "${GREEN}✓ Version updated to: $upgraded_version${NC}" + echo "" + echo -e "${BLUE}NEXT REQUIRED ACTION:${NC}" + echo -e "${YELLOW}You must now run the upgrade playbook inside the omnia_core container:${NC}" + echo "" + echo -e "${GREEN}podman exec -it omnia_core ansible-playbook /omnia/upgrade/upgrade_omnia.yml${NC}" + echo "" + echo -e "${BLUE}This playbook will:${NC}" + echo -e "• Update input files" + echo 
-e "• Update internal configurations" + echo "" + echo -e "${YELLOW}Note: Run this command after the container is fully healthy and stable${NC}" + echo -e "${YELLOW}================================================================================${NC}" + echo "" +} + # Host-side paths (initialized dynamically after omnia_path is set) OMNIA_INPUT_DIR="" OMNIA_METADATA_DIR="" @@ -1004,29 +1219,9 @@ install_omnia_core() { local omnia_core_tag="2.1" local omnia_core_registry="" - # Check if local omnia_core:2.1 exists - if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then + # Check if local omnia_core image exists using validate function + if validate_container_image "" "$omnia_core_tag" "install"; then echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" - # Check if latest exists for backward compatibility - elif podman inspect omnia_core:latest >/dev/null 2>&1; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}" - # Tag it as 2.1 for consistency - podman tag omnia_core:latest omnia_core:${omnia_core_tag} - else - echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}" - echo -e "${YELLOW}Omnia no longer pulls images from Docker Hub. Build/load the image locally and retry.${NC}" - echo "" - echo -e "${YELLOW}One way to build the image locally:${NC}" - echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" - echo -e "2. Navigate to the repository directory:" - echo -e " cd omnia-artifactory" - echo -e "3. 
Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" - echo "" - echo -e "${YELLOW}Then re-run:${NC}" - echo -e " ./omnia.sh --install" - exit 1 fi # Check if any other containers with 'omnia' in their name are running @@ -1148,9 +1343,6 @@ install_omnia_core() { # If core container is not present else - - # Start the container setup - echo -e "${GREEN}Starting Omnia core container setup.${NC}" setup_omnia_core fi } @@ -1216,16 +1408,6 @@ phase1_validate() { return 1 fi - if [ "$previous_omnia_version" = "2.1.0.0" ]; then - echo "[ERROR] [ORCHESTRATOR] Upgrade already performed. Current Omnia version is 2.1.0.0. No further upgrade required." - return 1 - fi - - if [ "$previous_omnia_version" != "2.0.0.0" ]; then - echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version" - return 1 - fi - shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r') if [ -z "$shared_path" ]; then echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml" @@ -1244,28 +1426,6 @@ phase1_validate() { return 1 fi - current_image=$(podman inspect omnia_core --format '{{.ImageName}}' 2>/dev/null) - if [ -z "$current_image" ]; then - echo "[ERROR] [ORCHESTRATOR] Unable to inspect omnia_core container image" - return 1 - fi - - if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" - echo "" - echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" - echo "" - echo -e "${YELLOW}To build the core image locally:${NC}" - echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" - echo -e "2. Navigate to the repository directory:" - echo -e " cd omnia-artifactory" - echo -e "3. 
Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" - echo "" - return 1 - fi - echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed" return 0 } @@ -1277,13 +1437,18 @@ phase2_approval() { echo "============================================" echo "OMNIA UPGRADE SUMMARY" echo "============================================" - echo "Current Container Tag: 1.0" - echo "Target Container Tag: 2.1" - echo "Current Omnia Release: 2.0.0.0" - echo "Target Omnia Release: 2.1.0.0" - echo "New Features:" - echo " - Add and remove node for slurm cluster" - echo " - Additional Package Installation" + echo "Current Container Tag: $OMNIA_CORE_CONTAINER_TAG" + echo "Target Container Tag: $TARGET_CONTAINER_TAG" + echo "Current Omnia Release: $OMNIA_VERSION" + echo "Target Omnia Release: $TARGET_OMNIA_VERSION" + + # Show upgrade type + if [ "$OMNIA_CORE_CONTAINER_TAG" = "$TARGET_CONTAINER_TAG" ]; then + echo "Upgrade Type: Same-tag upgrade (container restart)" + else + echo "Upgrade Type: Cross-tag upgrade (container swap)" + fi + echo "============================================" current_omnia_version=$(podman exec -u root omnia_core /bin/bash -c "grep '^omnia_version:' '$CONTAINER_METADATA_FILE' | cut -d':' -f2 | tr -d ' \t\n\r'" 2>/dev/null) @@ -1367,6 +1532,85 @@ phase3_backup_creation() { return 0 } +phase4_same_tag_upgrade() { + local target_version="$1" + + echo "[INFO] [ORCHESTRATOR] Phase 4: Same-Tag Upgrade" + echo "[INFO] [ORCHESTRATOR] Upgrading to $target_version within same container tag" + + # Verify container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Container is not running for same-tag upgrade" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Updating metadata to version $target_version" + + # Update version metadata + if ! podman exec -u root omnia_core bash -c " + set -e + if [ ! 
-f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE' + else + echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE' + fi + "; then + echo "[ERROR] [ORCHESTRATOR] Failed to update metadata version" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Restarting container to apply changes..." + + # Restart container to apply changes + if ! systemctl restart omnia_core.service; then + echo "[ERROR] [ORCHESTRATOR] Failed to restart container service" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Container restart failed" + return 1 + fi + + # Wait for container to be healthy after restart + echo "[INFO] [ORCHESTRATOR] Waiting for container health check after restart (30s)" + local health_timeout=30 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ORCHESTRATOR] Container is healthy after restart" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." 
+ done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo "[ERROR] [ORCHESTRATOR] Container failed to become healthy within 30 seconds after restart" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Container health check failed" + return 1 + fi + + # Verify version update + local updated_version=$(get_current_omnia_version) + if [ "$updated_version" != "$target_version" ]; then + echo "[ERROR] [ORCHESTRATOR] Version update verification failed" + echo "[ERROR] [ORCHESTRATOR] Expected: $target_version, Found: $updated_version" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Same-tag upgrade completed successfully" + echo "[INFO] [ORCHESTRATOR] Version updated to: $target_version" + + show_post_upgrade_instructions "$target_version" + + return 0 +} + phase4_container_swap() { local quadlet_file="/etc/containers/systemd/omnia_core.container" local i @@ -1376,12 +1620,12 @@ phase4_container_swap() { if [ ! -f "$quadlet_file" ]; then echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Quadlet configuration file missing" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - echo "[INFO] [ORCHESTRATOR] Stopping omnia_core 1.0 container" + echo "[INFO] [ORCHESTRATOR] Stopping omnia_core $OMNIA_CORE_CONTAINER_TAG container" systemctl stop omnia_core.service >/dev/null 2>&1 || true if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then @@ -1391,25 +1635,25 @@ phase4_container_swap() { if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop 1.0 container" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
+ echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop $OMNIA_CORE_CONTAINER_TAG container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - echo "[INFO] [ORCHESTRATOR] Starting omnia_core 2.1 Quadlet unit" - if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 image not available" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[INFO] [ORCHESTRATOR] Starting omnia_core $TARGET_CONTAINER_TAG Quadlet unit" + if ! podman inspect "omnia_core:$TARGET_CONTAINER_TAG" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:$TARGET_CONTAINER_TAG" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG image not available" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:2.1/' "$quadlet_file"; then - echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 2.1 in quadlet file" + if ! sed -i "s/^Image=omnia_core:.*/Image=omnia_core:$TARGET_CONTAINER_TAG/" "$quadlet_file"; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to $TARGET_CONTAINER_TAG in quadlet file" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi @@ -1417,20 +1661,20 @@ phase4_container_swap() { systemctl daemon-reload || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: System daemon reload failed" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
+ echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 } systemctl start omnia_core.service || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 2.1 container" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start $TARGET_CONTAINER_TAG container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 } - echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 2.1 health check (60s)" + echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core $TARGET_CONTAINER_TAG health check (60s)" for i in $(seq 1 60); do if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then break @@ -1440,13 +1684,13 @@ phase4_container_swap() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 container failed health check" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG container failed health check" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to 2.1.0.0" + echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to $TARGET_OMNIA_VERSION" if ! podman exec -u root omnia_core bash -c " set -e if [ ! 
-f '$CONTAINER_METADATA_FILE' ]; then @@ -1454,14 +1698,14 @@ phase4_container_swap() { exit 1 fi if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then - sed -i 's/^omnia_version:.*/omnia_version: 2.1.0.0/' '$CONTAINER_METADATA_FILE' + sed -i 's/^omnia_version:.*/omnia_version: $TARGET_OMNIA_VERSION/' '$CONTAINER_METADATA_FILE' else - echo 'omnia_version: 2.1.0.0' >> '$CONTAINER_METADATA_FILE' + echo 'omnia_version: $TARGET_OMNIA_VERSION' >> '$CONTAINER_METADATA_FILE' fi "; then echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi @@ -1471,21 +1715,129 @@ phase4_container_swap() { } upgrade_omnia_core() { - local lock_file="/var/lock/omnia_core_upgrade.lock" - local backup_base - - if [ -e "$lock_file" ]; then - echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. 
Another upgrade may be running.${NC}" + echo -e "${BLUE}=================== Omnia Core Upgrade ====================${NC}" + echo -e "${BLUE}This script will upgrade Omnia core container.${NC}" + echo -e "${BLUE}Current version will be backed up and upgraded to target version.${NC}" + echo -e "${BLUE}=============================================================${NC}" + + # Read current version + OMNIA_VERSION=$(get_current_omnia_version) + if [ -z "$OMNIA_VERSION" ]; then + echo -e "${RED}ERROR: Could not determine current Omnia version${NC}" + echo -e "${YELLOW}Please ensure omnia_core container is running and metadata is accessible${NC}" exit 1 fi - - mkdir -p "$(dirname "$lock_file")" 2>/dev/null || true - echo "$$" > "$lock_file" || { - echo -e "${RED}ERROR: Failed to create lock file: $lock_file${NC}" + + # Get current container tag + OMNIA_CORE_CONTAINER_TAG=$(get_container_tag_from_version "$OMNIA_VERSION") + + echo -e "${GREEN}Current Omnia version: $OMNIA_VERSION${NC}" + echo -e "${GREEN}Current container tag: $OMNIA_CORE_CONTAINER_TAG${NC}" + + # Show available upgrade options + echo "" + echo "Available upgrade options:" + echo "=========================" + + # Get available upgrade versions dynamically + local upgrade_output + upgrade_output=$(get_available_upgrade_versions "$OMNIA_VERSION") + + # Parse output into versions and descriptions + local available_versions=() + local version_descriptions=() + local line_count=0 + local total_lines + + # Count total lines + total_lines=$(echo "$upgrade_output" | wc -l) + + # Split into versions and descriptions (first half = versions, second half = descriptions) + local mid_line=$((total_lines / 2)) + local line_num=0 + + while IFS= read -r line; do + line_num=$((line_num + 1)) + if [ $line_num -le $mid_line ]; then + available_versions+=("$line") + else + version_descriptions+=("$line") + fi + done <<< "$upgrade_output" + + # Check if any upgrade options are available + if [ ${#available_versions[@]} -eq 0 ]; 
then + echo -e "${GREEN}Already at latest version $OMNIA_VERSION${NC}" + echo "No upgrade options available." + exit 0 + fi + + # Display upgrade options + for i in "${!available_versions[@]}"; do + local target_version="${available_versions[$i]}" + local target_container_tag=$(get_container_tag_from_version "$target_version") + + # Check if target image exists locally + local image_status="✓ Available" + if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then + image_status="✗ Missing (build required)" + fi + + echo "$((i+1)). Upgrade to $target_version (container tag: $target_container_tag) [$image_status]" + done + + # Prompt user to select upgrade version + echo -n "Select upgrade option (1-${#available_versions[@]}) or press Enter to cancel: " + read -r selection + + # Validate selection + if [ -z "$selection" ]; then + echo "Upgrade cancelled by user." + exit 0 + fi + + if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#available_versions[@]} ]; then + echo -e "${RED}ERROR: Invalid selection.${NC}" exit 1 - } + fi + + # Set target version based on user selection + TARGET_OMNIA_VERSION="${available_versions[$((selection-1))]}" + TARGET_CONTAINER_TAG=$(get_container_tag_from_version "$TARGET_OMNIA_VERSION") + + # Pre-validation: Check if target container image exists locally + if ! 
validate_container_image "$TARGET_OMNIA_VERSION" "$TARGET_CONTAINER_TAG" "upgrade"; then + exit 1 + fi + + echo -e "${GREEN}Target Omnia version: $TARGET_OMNIA_VERSION${NC}" + echo -e "${GREEN}Target container tag: $TARGET_CONTAINER_TAG${NC}" + + # Check if container tag change is needed + if [ "$OMNIA_CORE_CONTAINER_TAG" = "$TARGET_CONTAINER_TAG" ]; then + echo -e "${BLUE}Upgrade within same container tag ($TARGET_CONTAINER_TAG)${NC}" + echo -e "${BLUE}Will restart container instead of swapping${NC}" + SAME_TAG_UPGRADE=true + else + echo -e "${BLUE}Container tag change required ($OMNIA_CORE_CONTAINER_TAG -> $TARGET_CONTAINER_TAG)${NC}" + echo -e "${BLUE}Will perform full container swap${NC}" + SAME_TAG_UPGRADE=false + fi + + # Pre-validation: Check if target container image exists locally + if ! validate_container_image "$TARGET_OMNIA_VERSION" "$TARGET_CONTAINER_TAG" "upgrade"; then + exit 1 + fi + local lock_file="/tmp/omnia_upgrade.lock" + if [ -f "$lock_file" ]; then + echo -e "${RED}ERROR: Another upgrade process is already running${NC}" + echo -e "${YELLOW}If this is incorrect, remove the lock file: rm -f $lock_file${NC}" + exit 1 + fi + touch "$lock_file" trap 'rm -f "$lock_file"' EXIT + # Run upgrade phases if ! phase1_validate; then echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" exit 1 @@ -1495,7 +1847,7 @@ upgrade_omnia_core() { exit 0 fi - backup_base="$OMNIA_UPGRADE_BACKUP_PATH" + local backup_base="$OMNIA_UPGRADE_BACKUP_PATH" if [ -z "$backup_base" ]; then echo "[ERROR] [ORCHESTRATOR] Backup path is empty" exit 1 @@ -1506,13 +1858,26 @@ upgrade_omnia_core() { exit 1 fi - if ! phase4_container_swap; then - echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4" - exit 1 + # Choose upgrade path based on container tag + if [ "$SAME_TAG_UPGRADE" = "true" ]; then + if ! phase4_same_tag_upgrade "$TARGET_OMNIA_VERSION"; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in same-tag upgrade" + exit 1 + fi + else + if ! 
phase4_container_swap; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4" + exit 1 + fi fi echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base" + + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" + + # Initialize SSH config and start container session + init_ssh_config start_container_session exit 0 } @@ -1622,16 +1987,31 @@ restore_from_backup() { display_cleanup_instructions() { echo "" echo -e "${RED}================================================================================${NC}" - echo -e "${RED} ROLLBACK FAILED${NC}" + echo -e "${RED} UPGRADE/ROLLBACK FAILED${NC}" echo -e "${RED}================================================================================${NC}" echo "" - echo -e "${YELLOW}Rollback failed. Manual cleanup is required to restore a clean state before retrying.${NC}" + echo -e "${YELLOW}Operation failed. Manual cleanup is required to restore a clean state before retrying.${NC}" + echo "" + echo -e "${BLUE}Choose the appropriate cleanup scenario:${NC}" + echo "" + echo -e "${GREEN}CASE 1: If you can log into omnia_core container:${NC}" + echo -e "${YELLOW}1. Enter omnia_core container: podman exec -it omnia_core bash${NC}" + echo -e "${YELLOW}2. Run oim cleanup: ansible-playbook /omnia/oim_cleanup.yml${NC}" + echo -e "${YELLOW}3. Run uninstall inside container: ./omnia.sh --uninstall${NC}" + echo -e "${YELLOW}4. Exit container: exit${NC}" + echo -e "${YELLOW}5. Clean shared path: rm -rf ${NC}" + echo -e "${YELLOW}6. Install required version: ./omnia.sh --install${NC}" echo "" - echo -e "${YELLOW}Run the following on the OIM host:${NC}" - echo -e "${YELLOW}1. Clean Omnia shared path: rm -rf ${NC}" - echo -e "${YELLOW}2. Stop Omnia core system service: systemctl stop omnia_core${NC}" - echo -e "${YELLOW}3. Remove the Omnia core container: podman rm -f omnia_core${NC}" - echo -e "${YELLOW}4. 
Perform a fresh Omnia core install: ./omnia.sh --install${NC}" + echo -e "${GREEN}CASE 2: If you cannot log into omnia_core container (but other containers are running):${NC}" + echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}" + echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}" + echo -e "${YELLOW}3. Reload systemd daemon: systemctl daemon-reload${NC}" + echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}" + echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}" + echo -e "${YELLOW}6. Clean shared path: rm -rf ${NC}" + echo -e "${YELLOW}7. Install required version: ./omnia.sh --install${NC}" + echo "" + echo -e "${BLUE}Note: Replace with your actual Omnia shared path.${NC}" echo "" } @@ -1652,6 +2032,27 @@ rollback_omnia_core() { exit 1 fi + # Create lock file to prevent concurrent rollbacks + local lock_file="/tmp/omnia_rollback.lock" + if [ -f "$lock_file" ]; then + local existing_pid + existing_pid=$(cat "$lock_file" 2>/dev/null | tr -d ' \t\n\r') + + if [ -n "$existing_pid" ] && kill -0 "$existing_pid" >/dev/null 2>&1; then + echo -e "${RED}ERROR: Another rollback process is already running (PID: $existing_pid)${NC}" + echo -e "${YELLOW}If this is incorrect, remove the lock file: rm -f $lock_file${NC}" + exit 1 + fi + + if [ -n "$existing_pid" ]; then + echo -e "${YELLOW}[WARN] Stale rollback lock file found (PID: $existing_pid); removing: $lock_file${NC}" + fi + rm -f "$lock_file" >/dev/null 2>&1 || true + fi + + echo "$$" > "$lock_file" + trap 'rm -f "$lock_file"' EXIT INT TERM + # Get current version if ! 
podman exec -u root omnia_core test -f "/opt/omnia/.data/oim_metadata.yml"; then echo -e "${RED}ERROR: Metadata file not found: /opt/omnia/.data/oim_metadata.yml${NC}" @@ -1659,48 +2060,56 @@ rollback_omnia_core() { fi local current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') - if [ "$current_version" != "2.1.0.0" ]; then - echo -e "${RED}ERROR: Cannot rollback from version $current_version. Rollback is only supported from version 2.1.0.0.${NC}" - exit 1 - fi - # List available backups - echo "[INFO] [ROLLBACK] Scanning for available backups..." - local backup_dirs=() + # Get available rollback versions dynamically + local rollback_versions + rollback_versions=$(get_available_rollback_versions "$current_version") + + # Convert to array + local available_versions=() while IFS= read -r line; do - backup_dirs+=("$line") - done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_*" 2>/dev/null | sort -r) + available_versions+=("$line") + done <<< "$rollback_versions" - if [ ${#backup_dirs[@]} -eq 0 ]; then - echo -e "${RED}ERROR: No backup directories found.${NC}" + # Check if any rollback options are available + if [ ${#available_versions[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No rollback versions available from $current_version.${NC}" exit 1 fi echo "" - echo "Available backup versions:" - for i in "${!backup_dirs[@]}"; do - local version=$(basename "${backup_dirs[$i]}" | sed 's/version_//') - local backup_date=$(podman exec -u root omnia_core stat -c '%y' "${backup_dirs[$i]}" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) - echo " $((i+1)). 
Version $version (created: $backup_date)" + echo "Available rollback versions:" + echo "===========================" + for i in "${!available_versions[@]}"; do + local version="${available_versions[$i]}" + local container_tag=$(get_container_tag_from_version "$version") + + # Check if target image exists locally + local image_status="✓ Available" + if ! podman inspect "omnia_core:$container_tag" >/dev/null 2>&1; then + image_status="✗ Missing (build required)" + fi + + echo " $((i+1)). Rollback to version $version (container tag: $container_tag) [$image_status]" done - # Prompt for backup selection + # Prompt for rollback selection echo "" - echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " + echo -n "Select rollback version (1-${#available_versions[@]}): " read -r selection # Validate selection - if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#backup_dirs[@]} ]; then + if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#available_versions[@]} ]; then echo -e "${RED}ERROR: Invalid selection.${NC}" exit 1 fi - local selected_backup="${backup_dirs[$((selection-1))]}" - local backup_version=$(basename "$selected_backup" | sed 's/version_//') + local selected_version="${available_versions[$((selection-1))]}" + local selected_container_tag=$(get_container_tag_from_version "$selected_version") echo "" - echo "Selected backup: Version $backup_version" - echo -n "Are you sure you want to rollback to version $backup_version? [y/N]: " + echo "Selected rollback: Version $selected_version" + echo -n "Are you sure you want to rollback to version $selected_version? [y/N]: " read -r confirm if [[ ! "$confirm" =~ ^[yY] ]]; then @@ -1708,50 +2117,99 @@ rollback_omnia_core() { exit 0 fi - # Validate selected backup - only check if directory exists without podman exec - if ! 
podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then - # Try to check on host if container check fails - # Get shared path from metadata to check on host - local shared_path=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') - local host_backup_path="${selected_backup#/opt/omnia}" - if [ -z "$shared_path" ] || [ ! -d "$shared_path$host_backup_path" ]; then - echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}" + # Pre-validation: Check if target container image exists locally + if ! validate_container_image "$selected_version" "$selected_container_tag" "rollback"; then + exit 1 + fi + + # Check if container tag change is needed + local current_container_tag=$(get_container_tag_from_version "$current_version") + if [ "$current_container_tag" = "$selected_container_tag" ]; then + echo -e "${BLUE}Rollback within same container tag ($selected_container_tag)${NC}" + echo -e "${BLUE}Will restart container instead of swapping${NC}" + + # Perform same-tag rollback (container restart only) + if ! rollback_same_tag "$selected_version" "$current_version"; then + echo "[ERROR] [ROLLBACK] Rollback failed in same-tag rollback" exit 1 fi + + echo "[INFO] [ROLLBACK] Rollback completed successfully" + echo "[INFO] [ROLLBACK] Version rolled back to: $selected_version" + exit 0 + else + echo -e "${BLUE}Container tag change required ($current_container_tag -> $selected_container_tag)${NC}" + echo -e "${BLUE}Will perform full container swap${NC}" + # Continue with existing container swap logic + fi + + # List available backups for selected version + echo "[INFO] [ROLLBACK] Scanning for available backups for version $selected_version..." 
+ local backup_dirs=() + while IFS= read -r line; do + backup_dirs+=("$line") + done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_${selected_version}*" 2>/dev/null | sort -r) + + if [ ${#backup_dirs[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No backup directories found for version $selected_version.${NC}" + exit 1 + fi + + echo "" + echo "Available backups for version $selected_version:" + for i in "${!backup_dirs[@]}"; do + local backup_path="${backup_dirs[$i]}" + local backup_date=$(podman exec -u root omnia_core stat -c '%y' "$backup_path" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) + echo " $((i+1)). Backup created: $backup_date" + done + + # Prompt for backup selection + echo "" + echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " + read -r backup_selection + + # Validate backup selection + if ! [[ "$backup_selection" =~ ^[0-9]+$ ]] || [ "$backup_selection" -lt 1 ] || [ "$backup_selection" -gt ${#backup_dirs[@]} ]; then + echo -e "${RED}ERROR: Invalid backup selection.${NC}" + exit 1 + fi + + local selected_backup="${backup_dirs[$((backup_selection-1))]}" + + # Validate selected backup exists + if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then + echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}" + exit 1 fi echo "" echo "[INFO] [ROLLBACK] Starting rollback process..." - # Step 1: Stop 2.1 container gracefully + # Step 1: Stop current container gracefully echo "" - echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 2.1 container..." + echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core $current_container_tag container..." if ! 
stop_container_gracefully "omnia_core" 30; then echo -e "${RED}ERROR: Failed to stop container.${NC}" display_cleanup_instructions exit 1 fi - # Step 2: Check for 1.0 image + # Step 2: Update Quadlet file to use target container tag echo "" - echo "[INFO] [ROLLBACK] Step 2: Checking for Omnia core 1.0 image..." - if ! podman inspect omnia_core:1.0 >/dev/null 2>&1; then - echo -e "${YELLOW}WARNING: Omnia core 1.0 image not found locally.${NC}" - echo -e "${YELLOW}Attempting to tag image...${NC}" - - # Try to tag latest as 1.0 if available - if podman inspect omnia_core:latest >/dev/null 2>&1; then - podman tag omnia_core:latest omnia_core:1.0 - else - echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}" - display_cleanup_instructions - exit 1 - fi + echo "[INFO] [ROLLBACK] Step 2: Updating Quadlet file to use container tag $selected_container_tag..." + local quadlet_file="/etc/containers/systemd/omnia_core.container" + + if ! sed -i "s/^Image=omnia_core:.*/Image=omnia_core:$selected_container_tag/" "$quadlet_file"; then + echo -e "${RED}ERROR: Failed to update Image to $selected_container_tag in quadlet file${NC}" + display_cleanup_instructions + exit 1 fi - # Step 3: Start 1.0 container + echo "[INFO] [ROLLBACK] Quadlet file updated to use omnia_core:$selected_container_tag" + + # Step 3: Start target container echo "" - echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core 1.0 container..." + echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core $selected_container_tag container..." systemctl daemon-reload if ! systemctl start omnia_core.service; then echo -e "${RED}ERROR: Failed to start container service.${NC}" @@ -1805,8 +2263,8 @@ rollback_omnia_core() { echo "[INFO] [ROLLBACK] Step 7: Verifying container version..." 
local verify_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') - if [ "$verify_version" != "$backup_version" ]; then - echo -e "${RED}ERROR: Version verification failed. Expected: $backup_version, Found: $verify_version${NC}" + if [ "$verify_version" != "$selected_version" ]; then + echo -e "${RED}ERROR: Version verification failed. Expected: $selected_version, Found: $verify_version${NC}" display_cleanup_instructions exit 1 fi @@ -1814,18 +2272,22 @@ rollback_omnia_core() { # Audit log end local rollback_end=$(date -Iseconds) echo "[AUDIT] Rollback operation completed at: $rollback_end" - echo "[AUDIT] Rolled back from version $current_version to $backup_version" + echo "[AUDIT] Rolled back from version $current_version to $selected_version" echo "" echo -e "${GREEN}================================================================================${NC}" echo -e "${GREEN} ROLLBACK COMPLETED SUCCESSFULLY${NC}" echo -e "${GREEN}================================================================================${NC}" echo "" - echo -e "${GREEN}✓ Omnia core has been rolled back to version $backup_version${NC}" + echo -e "${GREEN}✓ Omnia core has been rolled back to version $selected_version${NC}" echo -e "${GREEN}✓ Container is running and healthy${NC}" echo -e "${GREEN}✓ Configuration restored from backup${NC}" echo "" + # Clean up lock file before starting long-running ssh session + rm -f "$lock_file" >/dev/null 2>&1 || true + echo "[INFO] Rollback lock file removed before starting container session" + # Initialize SSH config and start container session init_ssh_config start_container_session From 2078496e82aa5525bfc6255373f8f42ca4a51fa2 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Tue, 17 Feb 2026 12:35:42 +0530 Subject: [PATCH 167/172] LDMS Slurm node add /delete (#3976) * LDMS slurm node add/delete * pr review comments update --- 
.../telemetry/tasks/check_pxe_changes.yml     |  88 ++++++++++
 discovery/roles/telemetry/tasks/main.yml      |  10 ++
 .../telemetry/tasks/restart_ldms_configs.yml  | 151 ++++++++++++++++++
 discovery/roles/telemetry/vars/main.yml       |  21 +++
 4 files changed, 270 insertions(+)
 create mode 100644 discovery/roles/telemetry/tasks/check_pxe_changes.yml
 create mode 100644 discovery/roles/telemetry/tasks/restart_ldms_configs.yml

diff --git a/discovery/roles/telemetry/tasks/check_pxe_changes.yml b/discovery/roles/telemetry/tasks/check_pxe_changes.yml
new file mode 100644
index 0000000000..398c831961
--- /dev/null
+++ b/discovery/roles/telemetry/tasks/check_pxe_changes.yml
@@ -0,0 +1,88 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Sets pxe_changed by comparing the PXE mapping file with the copy saved on the previous run
+
+- name: Check if current PXE mapping file exists
+  ansible.builtin.stat:
+    path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}"
+  delegate_to: localhost
+  register: current_pxe_file
+
+- name: Check if backup PXE mapping file exists
+  ansible.builtin.stat:
+    path: "{{ backup_pxe_mapping_ldms_path }}"
+  delegate_to: localhost
+  register: backup_pxe_file
+
+- name: Handle first discovery run (no backup exists)
+  when:
+    - current_pxe_file.stat.exists
+    - not backup_pxe_file.stat.exists
+  block:
+    - name: Create backup of PXE mapping file
+      ansible.builtin.copy:
+        src: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}"
+        dest: "{{ backup_pxe_mapping_ldms_path }}"
+        remote_src: true  # src is read on the delegated host (localhost)
+        mode: preserve
+      delegate_to: localhost
+
+    - name: Set pxe_changed to false for first run
+      ansible.builtin.set_fact:
+        pxe_changed: false  # nothing to compare against yet
+
+    - name: Display first run message
+      ansible.builtin.debug:
+        msg: "{{ pxe_first_run_msg }}"
+
+- name: Compare PXE mapping files when backup exists
+  when:
+    - current_pxe_file.stat.exists
+    - backup_pxe_file.stat.exists
+  block:
+    - name: Get checksum of current PXE mapping file
+      ansible.builtin.stat:
+        path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}"
+        checksum_algorithm: sha256
+      delegate_to: localhost
+      register: current_pxe_checksum
+
+    - name: Get checksum of backup PXE mapping file
+      ansible.builtin.stat:
+        path: "{{ backup_pxe_mapping_ldms_path }}"
+        checksum_algorithm: sha256
+      delegate_to: localhost
+      register: backup_pxe_checksum
+
+    - name: Set pxe_changed based on checksum comparison
+      ansible.builtin.set_fact:  # true when the two SHA-256 digests differ
+        pxe_changed: "{{ current_pxe_checksum.stat.checksum != backup_pxe_checksum.stat.checksum }}"
+
+    - name: Update backup PXE mapping file when changed
+      ansible.builtin.copy:
+        src: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}"
+        dest: "{{ backup_pxe_mapping_ldms_path }}"
+        remote_src: true
+        mode: preserve
+      delegate_to: localhost
+      when: pxe_changed | bool  # keep the saved copy in step with the current mapping
+
+    - name: Display PXE change status
+      ansible.builtin.debug:
+        msg: "{{ pxe_changed_msg if (pxe_changed | bool) else pxe_no_change_msg }}"
+
+- name: Set pxe_changed to false when PXE file is missing
+  ansible.builtin.set_fact:
+    pxe_changed: false
+  when: not current_pxe_file.stat.exists  # no mapping file, so no change is reported
diff --git a/discovery/roles/telemetry/tasks/main.yml b/discovery/roles/telemetry/tasks/main.yml
index 825c3988d7..e4e3d1846a 100644
--- a/discovery/roles/telemetry/tasks/main.yml
+++ b/discovery/roles/telemetry/tasks/main.yml
@@ -55,3 +55,13 @@
 - name: Update ldms agg configuration
   ansible.builtin.include_tasks: update_ldms_agg_config.yml
   when: hostvars['localhost']['ldms_support']
+
+- name: Check if PXE mapping has changed since last run
+  ansible.builtin.include_tasks: check_pxe_changes.yml
+  when: hostvars['localhost']['ldms_support']
+
+- name: Restart LDMS configs for node addition and deletion
+  ansible.builtin.include_tasks: restart_ldms_configs.yml
+  when:
+    - hostvars['localhost']['ldms_support']
+    - pxe_changed | default(false) | bool
diff --git a/discovery/roles/telemetry/tasks/restart_ldms_configs.yml b/discovery/roles/telemetry/tasks/restart_ldms_configs.yml
new file mode 100644
index 0000000000..0a176118f0
--- /dev/null
+++ b/discovery/roles/telemetry/tasks/restart_ldms_configs.yml
@@ -0,0 +1,151 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---  # Re-applies the LDMS ConfigMaps and restarts the aggregator StatefulSet after a PXE mapping change
+- name: Load high availability config
+  ansible.builtin.include_vars:
+    file: "{{ hostvars['localhost']['input_project_dir'] }}/high_availability_config.yml"
+    name: ha_config
+
+- name: Set kube_vip fact
+  ansible.builtin.set_fact:  # falls back to '' when no virtual IP is configured
+    kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}"
+
+- name: Test SSH connectivity to kube VIP only when PXE has changed
+  when:
+    - kube_vip | length > 0
+    - pxe_changed | default(false) | bool
+  block:
+    - name: SSH test to kube VIP
+      ansible.builtin.command:
+        cmd: "ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes {{ kube_vip }} echo reachable"
+      delegate_to: localhost
+      register: kube_vip_ssh_check
+      changed_when: false  # connectivity probe only
+
+    - name: Set kube VIP reachable fact
+      ansible.builtin.set_fact:
+        kube_vip_reachable: "{{ kube_vip_ssh_check.rc == 0 }}"
+
+  rescue:  # runs when a task in the block above fails, e.g. the SSH probe
+    - name: Display kube VIP unreachable message
+      ansible.builtin.debug:
+        msg: "{{ kube_vip_unreachable_msg }}"
+
+    - name: Set kube VIP reachable fact to false
+      ansible.builtin.set_fact:
+        kube_vip_reachable: false
+
+- name: Restart LDMS aggregator when PXE has changed
+  when: pxe_changed | default(false) | bool
+  block:
+    - name: Check if LDMS aggregator is running on service k8s cluster
+      kubernetes.core.k8s_info:
+        api_version: apps/v1
+        kind: StatefulSet
+        name: nersc-ldms-aggr
+        namespace: "{{ telemetry_namespace }}"
+      delegate_to: "{{ kube_vip }}"
+      register: ldms_statefulset_info
+      failed_when: false
+      when:
+        - kube_vip_reachable | default(false) | bool  # unset when the SSH probe block was skipped (empty kube_vip)
+
+    - name: Set LDMS running state
+      ansible.builtin.set_fact:
+        ldms_running: "{{ ldms_statefulset_info.resources is defined and ldms_statefulset_info.resources | length > 0 }}"
+      when:
+        - kube_vip_reachable | default(false) | bool  # same guard as above; skip instead of failing on an undefined fact
+
+    - name: Check if LDMS conf ConfigMap file exists
+      ansible.builtin.stat:
+        path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-conf.yaml"
+      register: ldms_conf_file
+      when: ldms_running | default(false) | bool
+
+    - name: Check if LDMS bin ConfigMap file exists
+      ansible.builtin.stat:
+        path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-bin.yaml"
+      register: ldms_bin_file
+      when: ldms_running | default(false) | bool
+
+    - name: Apply LDMS configuration ConfigMap
+      kubernetes.core.k8s:
+        state: present
+        src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-conf.yaml"
+        namespace: "{{ telemetry_namespace }}"
+      delegate_to: "{{ kube_vip }}"
+      failed_when: false
+      when:
+        - ldms_running | default(false) | bool
+        - ldms_conf_file.stat.exists | default(false)
+
+    - name: Apply LDMS scripts ConfigMap
+      kubernetes.core.k8s:
+        state: present
+        src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-bin.yaml"
+        namespace: "{{ telemetry_namespace }}"
+      delegate_to: "{{ kube_vip }}"
+      failed_when: false
+      when:
+        - ldms_running | default(false) | bool
+        - ldms_bin_file.stat.exists | default(false)
+
+    - name: Restart LDMS aggregator StatefulSet
+      kubernetes.core.k8s:
+        state: present
+        definition:
+          apiVersion: apps/v1
+          kind: StatefulSet
+          metadata:
+            name: nersc-ldms-aggr
+            namespace: "{{ telemetry_namespace }}"
+          spec:
+            template:
+              metadata:
+                annotations:  # NOTE(review): ansible_date_time needs gathered facts - confirm the play gathers them
+                  kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 }}"
+      delegate_to: "{{ kube_vip }}"
+      failed_when: false
+      when:
+        - ldms_running | default(false) | bool
+        - ldms_conf_file.stat.exists | default(false)
+        - ldms_bin_file.stat.exists | default(false)
+
+    - name: Wait for LDMS aggregator pod to be ready after restart
+      kubernetes.core.k8s_info:
+        api_version: v1
+        kind: Pod
+        namespace: "{{ telemetry_namespace }}"
+        label_selectors:
+          - "app=nersc-ldms-aggr"
+        wait: true
+        wait_condition:
+          type: Ready
+          status: "True"
+        wait_timeout: 120
+      delegate_to: "{{ kube_vip }}"
+      register: ldms_pod_ready
+      failed_when: false  # a timeout is reported by the debug task below, not as a failure
+      when:
+        - ldms_running | default(false) | bool
+        - ldms_conf_file.stat.exists | default(false)
+        - ldms_bin_file.stat.exists | default(false)
+
+    - name: Display LDMS aggregator restart status
+      ansible.builtin.debug:
+        msg: "{{ ldms_pod_ready_msg if (ldms_pod_ready.resources | default([]) | length > 0) else ldms_pod_not_ready_msg }}"
+      when:
+        - ldms_running | default(false) | bool
+        - ldms_conf_file.stat.exists | default(false)
+        - ldms_bin_file.stat.exists | default(false)
diff --git a/discovery/roles/telemetry/vars/main.yml b/discovery/roles/telemetry/vars/main.yml
index 5c5838ce29..69b0c0c0ac 100644
--- a/discovery/roles/telemetry/vars/main.yml
+++ b/discovery/roles/telemetry/vars/main.yml
@@ -252,3 +252,24 @@ common_templates:
     skip_when: "{{ cluster_id_present | default(false) }}"
   - src: 'telemetry/kustomization.yaml.j2'
     dest: 'kustomization.yaml'
+
+# Usage: check_pxe_changes.yml
+backup_pxe_mapping_ldms_path: "/opt/omnia/telemetry/backup_pxe_mapping_ldms.csv"
+pxe_first_run_msg: "First discovery run detected. Saving PXE mapping backup. LDMS restart not required."
+pxe_no_change_msg: "PXE mapping file has not changed since last run. Skipping LDMS restart."
+pxe_changed_msg: "PXE mapping file has changed. LDMS restart will be triggered."
+
+# Usage: restart_ldms_configs.yml
+kube_vip_unreachable_msg: >-
+  Kube VIP ({{ kube_vip }}) is not reachable via SSH.
+  There might be issues with the k8s cluster.
+  LDMS aggregator restart will be skipped.
+
+  After discovery completes, manually restart the LDMS aggregator pod with:
+
+    ssh {{ kube_vip }}
+    kubectl rollout restart statefulset nersc-ldms-aggr -n {{ telemetry_namespace }}
+    kubectl get pods -n {{ telemetry_namespace }} -l app=nersc-ldms-aggr -w
+
+ldms_pod_ready_msg: "LDMS aggregator pod is ready."
+ldms_pod_not_ready_msg: "WARNING: LDMS aggregator pod did not become ready within 120s."
From 4dbc6a978fdbcbd74c7a7c62e75ab47c399784be Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Wed, 18 Feb 2026 07:32:41 +0000 Subject: [PATCH 168/172] mask docker credentials in local_repo logs Signed-off-by: Vrinda_Marwah --- .../library/module_utils/local_repo/parse_and_download.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 15bed1efb3..d5192e2bbe 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -84,16 +84,16 @@ def execute_command(cmd_string, logger, type_json=False): logger.error(f"Raw output was: {status['stdout']}") return False - logger.info(f"Command succeeded: {cmd_string}") + logger.info(f"Command succeeded: {safe_cmd_string}") return status except subprocess.CalledProcessError as e: - logger.error(f"Command failed: {cmd_string} - {e}") + logger.error(f"Command failed: {safe_cmd_string} - {e}") return False except subprocess.TimeoutExpired as e: - logger.error(f"Command timed out: {cmd_string} - {e}") + logger.error(f"Command timed out: {safe_cmd_string} - {e}") return False except OSError as e: - logger.error(f"OS error during command: {cmd_string} - {e}") + logger.error(f"OS error during command: {safe_cmd_string} - {e}") return False finally: From 76d7f3cd0c9c77fd0467a18249be12edc4236b34 Mon Sep 17 00:00:00 2001 From: Nethravathi M G <146437298+nethramg@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:04:12 +0530 Subject: [PATCH 169/172] Removing the IP's from the Activated IP list (#3992) --- telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 index 06bf230980..54986f418f 100644 --- 
a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 +++ b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 @@ -2,9 +2,9 @@ ----- Telemetry Report for Cluster ----- -Total IP count with Telemetry activated: {{ (db_idrac_ip_list | length) + (telemetry_idrac | length) }} +Total IP count with Telemetry activated: {{ ((db_idrac_ip_list + telemetry_idrac) | difference(deleted_idrac_ips | default([]))) | length }} Telemetry activated IPs List: -{% for item in db_idrac_ip_list + telemetry_idrac %} +{% for item in (db_idrac_ip_list + telemetry_idrac) | difference(deleted_idrac_ips | default([])) %} - {{ item }} {% endfor %} From 272bfb51c94fe7283bc3256c32894882b7b032e8 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Thu, 19 Feb 2026 14:41:35 +0530 Subject: [PATCH 170/172] Fix for local_repo.yml allows passes even with invalid package names in JSON files. Signed-off-by: pullan1 --- .../library/module_utils/local_repo/config.py | 6 +- .../local_repo/container_repo_utils.py | 161 ++++++++++-------- .../module_utils/local_repo/download_rpm.py | 89 +++++++++- 3 files changed, 178 insertions(+), 78 deletions(-) diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index a731c8528d..7bfea4b301 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -64,6 +64,10 @@ "x86_64": ["dnf", "download", "--resolve", "--alldeps", "--arch=x86_64,noarch"], "aarch64": ["dnf", "download", "--forcearch", "aarch64", "--resolve", "--alldeps", "--exclude=*.x86_64"] } +DNF_INFO_COMMANDS = { + "x86_64": ["dnf", "info", "--quiet"], + "aarch64": ["dnf", "info", "--quiet", "--forcearch=aarch64"] +} # ---------------------------- # Used by download_common.py @@ -222,7 +226,7 @@ # Naming convention: _omnia-additional to match existing filter patterns # ---------------------------- ADDITIONAL_REPOS_KEY = "additional_repos" -AGGREGATED_REPO_NAME_TEMPLATE = 
"{arch}_omnia-additional-repo" +AGGREGATED_REPO_NAME_TEMPLATE = "{arch}_omnia-additional" AGGREGATED_REMOTE_NAME_TEMPLATE = "{arch}_omnia-additional-{name}" AGGREGATED_DISTRIBUTION_NAME_TEMPLATE = "{arch}_omnia-additional" AGGREGATED_BASE_PATH_TEMPLATE = "opt/omnia/offline_repo/cluster/{arch}/rhel/10.0/rpms/omnia-additional" diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py index 0a4abb35fb..e3f47869af 100644 --- a/common/library/module_utils/local_repo/container_repo_utils.py +++ b/common/library/module_utils/local_repo/container_repo_utils.py @@ -13,6 +13,13 @@ # limitations under the License. #pylint: disable=import-error,no-name-in-module +""" +Container repository utilities for Pulp operations. + +This module provides functions for creating, syncing, and managing +container repositories and distributions in Pulp. +""" + import multiprocessing from ansible.module_utils.local_repo.parse_and_download import execute_command from ansible.module_utils.local_repo.config import ( @@ -114,109 +121,119 @@ def sync_container_repository(repo_name, remote_name, package_content, logger, t logger.info(f"Getting repository version before sync for {repo_name}") verify_command = pulp_container_commands["show_container_repo"] % repo_name verify_result_before = execute_command(verify_command, logger, type_json=True) - + version_before = None - if verify_result_before and isinstance(verify_result_before, dict) and "stdout" in verify_result_before: + if (verify_result_before and isinstance(verify_result_before, dict) and + "stdout" in verify_result_before): repo_data_before = verify_result_before["stdout"] if isinstance(repo_data_before, dict): version_before = repo_data_before.get("latest_version_href") logger.info(f"Repository version before sync: {version_before}") - + command = pulp_container_commands["sync_container_repository"] % (repo_name, remote_name) result = 
execute_command(command,logger) if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): logger.error(f"Sync command failed for repository {repo_name}") return False - + logger.info(f"Validating sync result for repository {repo_name}") verify_result_after = execute_command(verify_command, logger, type_json=True) - - if verify_result_after and isinstance(verify_result_after, dict) and "stdout" in verify_result_after: + + if (verify_result_after and isinstance(verify_result_after, dict) and + "stdout" in verify_result_after): repo_data_after = verify_result_after["stdout"] if isinstance(repo_data_after, dict): version_after = repo_data_after.get("latest_version_href") logger.info(f"Repository version after sync: {version_after}") - + if not version_after or version_after.endswith("/versions/0/"): logger.error(f"Sync completed but no content was downloaded for {repo_name}. " f"The specified image tag likely does not exist in the upstream registry.") return False - + if version_before and version_after and version_before == version_after: # Check if tag actually exists using precise Pulp commands try: # Step 1: Get distribution to find repository href dist_command = f"pulp container distribution show --name {repo_name}" dist_result = execute_command(dist_command, logger, type_json=True) - + if not dist_result or not isinstance(dist_result, dict) or "stdout" not in dist_result: - logger.error(f"Failed to get distribution info for {repo_name}. Assuming tag doesn't exist.") - return False - - dist_data = dist_result["stdout"] - if not isinstance(dist_data, dict) or "repository" not in dist_data: - logger.error(f"Invalid distribution data for {repo_name}. 
Assuming tag doesn't exist.") - return False - - repo_href = dist_data["repository"] - logger.info(f"Found repository href: {repo_href}") - - # Step 2: Get repository version href - repo_command = f"pulp container repository show --href {repo_href}" - repo_result = execute_command(repo_command, logger, type_json=True) - - if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result: - logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.") - return False - - repo_data = repo_result["stdout"] - if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data: - logger.error(f"Invalid repository data for {repo_href}. Assuming tag doesn't exist.") - return False - - repo_ver_href = repo_data["latest_version_href"] - logger.info(f"Found repository version href: {repo_ver_href}") - - # Step 3: Check if tag exists in content - tags_command = f"pulp show --href '/pulp/api/v3/content/container/tags/?repository_version={repo_ver_href}'" - tags_result = execute_command(tags_command, logger, type_json=True) - - if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result: - logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.") - return False - - tags_data = tags_result["stdout"] - if not isinstance(tags_data, dict) or "results" not in tags_data: - logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.") - return False - - tags = tags_data["results"] - tag_exists = False - - # Use the tag parameter if provided, otherwise fall back to checking package_content - tag_to_check = tag if tag else package_content - - for tag_item in tags: - if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check: - tag_exists = True - break - - if tag_exists: - logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. 
No sync needed - image is already available.") + logger.info(f"Distribution {repo_name} does not exist yet - skipping tag validation, will create distribution") + # Skip tag validation but continue to create distribution at line 221 else: - logger.error(f"Sync completed but repository version did not change for {repo_name}. " - f"Version remained at {version_after}. " - f"Tag '{tag_to_check}' does not exist in Pulp repository content. " - f"This indicates the tag likely does not exist in the upstream registry.") - return False + # Distribution exists, validate the tag + dist_data = dist_result["stdout"] + if not isinstance(dist_data, dict) or "repository" not in dist_data: + logger.error(f"Invalid distribution data for {repo_name}. Assuming tag doesn't exist.") + return False + repo_href = dist_data["repository"] + logger.info(f"Found repository href: {repo_href}") + + # Step 2: Get repository version href + repo_command = f"pulp container repository show --href {repo_href}" + repo_result = execute_command(repo_command, logger, type_json=True) + + if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result: + logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.") + return False + + repo_data = repo_result["stdout"] + if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data: + logger.error(f"Invalid repository data for {repo_href}. 
Assuming tag doesn't exist.") + return False + + repo_ver_href = repo_data["latest_version_href"] + logger.info(f"Found repository version href: {repo_ver_href}") + + # Step 3: Check if tag exists in content + tags_command = ( + f"pulp show --href " + f"'/pulp/api/v3/content/container/tags/" + f"?repository_version={repo_ver_href}'" + ) + tags_result = execute_command(tags_command, logger, type_json=True) + + if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result: + logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.") + return False + + tags_data = tags_result["stdout"] + if not isinstance(tags_data, dict) or "results" not in tags_data: + logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.") + return False + + tags = tags_data["results"] + tag_exists = False + + # Use the tag parameter if provided, otherwise fall back to checking package_content + tag_to_check = tag if tag else package_content + + for tag_item in tags: + if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check: + tag_exists = True + break + + if tag_exists: + logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. No sync needed - image is already available.") + else: + logger.error(f"Sync completed but repository version did not change for {repo_name}. " + f"Version remained at {version_after}. " + f"Tag '{tag_to_check}' does not exist in Pulp repository content. " + f"This indicates the tag likely does not exist in the upstream registry.") + return False except Exception as e: - logger.error(f"Error checking repository tag existence: {e}. Assuming tag doesn't exist.") + logger.error( + f"Error checking repository tag existence: {e}. Assuming tag doesn't exist." 
+ ) return False - - logger.info(f"Sync validation successful: repository {repo_name} version changed from {version_before} to {version_after}") - - result = create_container_distribution(repo_name,package_content,logger) + + logger.info( + f"Sync validation successful: repository {repo_name} version changed " + f"from {version_before} to {version_after}" + ) + result = create_container_distribution(repo_name, package_content, logger) return result except Exception as e: logger.error(f"Failed to synchronize repository {repo_name} with remote {remote_name}. Error: {e}") diff --git a/common/library/module_utils/local_repo/download_rpm.py b/common/library/module_utils/local_repo/download_rpm.py index 95b354dd6b..44b56c1799 100644 --- a/common/library/module_utils/local_repo/download_rpm.py +++ b/common/library/module_utils/local_repo/download_rpm.py @@ -20,7 +20,8 @@ import shutil from pathlib import Path from ansible.module_utils.local_repo.config import ( - DNF_COMMANDS + DNF_COMMANDS, + DNF_INFO_COMMANDS ) from multiprocessing import Lock from ansible.module_utils.local_repo.parse_and_download import write_status_to_file @@ -95,11 +96,30 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, for pkg in rpm_list: # Get repo_name for this specific RPM from mapping pkg_repo_name = repo_mapping.get(pkg, "") - if any(pkg in line and ".rpm" in line for line in stdout_lines + stderr_lines): + # Check if package was downloaded successfully + # Look for "Already downloaded" or actual .rpm file in output + pkg_downloaded = False + for line in stdout_lines + stderr_lines: + if pkg in line and (".rpm" in line or "Already downloaded" in line): + pkg_downloaded = True + break + + # Also check for "No match for argument" or "No package" errors + pkg_not_found = False + for line in stderr_lines: + if pkg in line and ("No match for argument" in line or + "No package" in line or + "not found" in line.lower()): + pkg_not_found = True + break + + if 
pkg_downloaded and not pkg_not_found: downloaded.append(pkg) write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) else: failed.append(pkg) + if pkg_not_found: + logger.warning(f"Package '{pkg}' not found in configured repositories") # Retry failed ones individually if failed: @@ -110,6 +130,15 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, # Get repo_name for this specific RPM from mapping pkg_repo_name = repo_mapping.get(pkg, "") + # Check for package not found errors + retry_stderr = retry_res.stderr.lower() + pkg_invalid = any(err in retry_stderr for err in [ + "no match for argument", + "no package", + "not found", + "unable to find a match" + ]) + if retry_res.returncode == 0 and ".rpm" in retry_res.stdout + retry_res.stderr: downloaded.append(pkg) failed.remove(pkg) @@ -117,7 +146,10 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, logger.info(f"Package '{pkg}' downloaded successfully on retry.") else: write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock, pkg_repo_name) - logger.error(f"Package '{pkg}' still failed after retry.") + if pkg_invalid: + logger.error(f"Package '{pkg}' does not exist in configured repositories.") + else: + logger.error(f"Package '{pkg}' still failed after retry.") # Determine final status if not failed: @@ -128,12 +160,59 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, status = "Failed" else: - status = "Success" logger.info("RPM won't be downloaded when repo_config is partial or never") + logger.info("Validating package availability using dnf info...") + + arch_key = "x86_64" if arc.lower() in ("x86_64") else "aarch64" + valid_packages = [] + invalid_packages = [] + for pkg in package["rpm_list"]: + # Validate package using dnf info + dnf_info_command = DNF_INFO_COMMANDS[arch_key] + [ + "--repo=*", # Search all enabled repositories + pkg + ] + result = subprocess.run( 
+ dnf_info_command, + check=False, + capture_output=True, + text=True + ) # Get repo_name for this specific RPM from mapping pkg_repo_name = repo_mapping.get(pkg, "") - write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) + if result.returncode == 0: + # Package exists and is available + valid_packages.append(pkg) + write_status_to_file( + status_file_path, pkg, "rpm", "Success", + logger, file_lock, pkg_repo_name + ) + logger.info(f"Package '{pkg}' validated successfully") + else: + # Package not found or invalid + invalid_packages.append(pkg) + write_status_to_file( + status_file_path, pkg, "rpm", "Failed", + logger, file_lock, pkg_repo_name + ) + logger.error( + f"Package '{pkg}' validation failed. " + f"Package may not exist in configured repositories." + ) + + # Determine final status based on validation results + if not invalid_packages: + status = "Success" + elif valid_packages: + status = "Partial" + else: + status = "Failed" + + logger.info( + f"Validation complete - Valid: {len(valid_packages)}, " + f"Invalid: {len(invalid_packages)}" + ) except Exception as e: logger.error(f"Exception occurred: {e}") From c42782c8481703c5d0c10ba3e36ee7e242bd0304 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 20 Feb 2026 17:54:24 +0530 Subject: [PATCH 171/172] Lock Mechanism for Upgrade Sequence Integrity (#3994) --- build_image_aarch64/build_image_aarch64.yml | 3 + build_image_x86_64/build_image_x86_64.yml | 3 + discovery/discovery.yml | 3 + local_repo/local_repo.yml | 3 + omnia.sh | 82 +++++++++++++------ prepare_oim/prepare_oim.yml | 3 + .../tasks/display_warnings.yml | 2 + upgrade/upgrade_omnia.yml | 10 +++ .../get_config_credentials.yml | 4 + utils/oim_cleanup.yml | 4 + utils/upgrade_checkup.yml | 33 ++++++++ 11 files changed, 126 insertions(+), 24 deletions(-) create mode 100644 utils/upgrade_checkup.yml diff --git a/build_image_aarch64/build_image_aarch64.yml b/build_image_aarch64/build_image_aarch64.yml index 
08ee0b4ad8..d5dc76a82d 100644 --- a/build_image_aarch64/build_image_aarch64.yml +++ b/build_image_aarch64/build_image_aarch64.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml index 676d8adbd6..8f56b86ef6 100644 --- a/build_image_x86_64/build_image_x86_64.yml +++ b/build_image_x86_64/build_image_x86_64.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 75efadb47c..40fd00123c 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../utils/include_input_dir.yml diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml index 3a743c3f47..963715b5e3 100644 --- a/local_repo/local_repo.yml +++ b/local_repo/local_repo.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/omnia.sh b/omnia.sh index 3b320b0bf6..530c168e7d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright © 2025 Dell Inc. 
or its subsidiaries. All Rights Reserved. +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -299,7 +299,18 @@ update_metadata_upgrade_backup_dir() { " } - +# Resolve the upgrade guard lock path (container or host shared path) +get_upgrade_guard_lock_path() { + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + echo "$upgrade_guard_lock_host" +} check_internal_nfs_export() { nfs_server_ip=$1 @@ -398,6 +409,11 @@ cleanup_omnia_core() { # Fetch the configuration from the Omnia core container. fetch_config + # Clear upgrade guard lock if present (shared path visible to container and host) + local upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_path" + # Remove the container remove_container @@ -1837,6 +1853,16 @@ upgrade_omnia_core() { touch "$lock_file" trap 'rm -f "$lock_file"' EXIT + # Create upgrade guard lock in shared path so other playbooks can block during upgrade + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + + mkdir -p "$(dirname "$upgrade_guard_lock_path")" 2>/dev/null || true + echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." 
> "$upgrade_guard_lock_path" || { + echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_path${NC}" + exit 1 + } + # Run upgrade phases if ! phase1_validate; then echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" @@ -1874,8 +1900,10 @@ upgrade_omnia_core() { echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base" + # Seed inputs and defaults after upgrade + post_setup_config + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" - # Initialize SSH config and start container session init_ssh_config start_container_session @@ -1885,15 +1913,15 @@ upgrade_omnia_core() { # Validate backup directory structure and files validate_backup_directory() { local backup_path="$1" - + echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path" - + # Check if backup directory exists if ! podman exec -u root omnia_core test -d "$backup_path"; then echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path" return 1 fi - + # Check for required subdirectories for subdir in input metadata configs; do if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then @@ -1901,24 +1929,24 @@ validate_backup_directory() { return 1 fi done - + # Check for required files if ! podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml" return 1 fi - + if ! podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container" return 1 fi - + # Verify metadata contains version information if ! 
podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then echo "[ERROR] [ROLLBACK] Metadata file does not contain version information" return 1 fi - + echo "[INFO] [ROLLBACK] Backup validation successful" return 0 } @@ -1927,15 +1955,15 @@ validate_backup_directory() { stop_container_gracefully() { local container_name="$1" local timeout="${2:-30}" - + echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..." - + # Try graceful stop first if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then echo "[INFO] [ROLLBACK] Container stopped gracefully" return 0 fi - + # Check if container is still running if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..." @@ -1947,16 +1975,16 @@ stop_container_gracefully() { return 1 fi fi - + return 0 } # Restore files from backup restore_from_backup() { local backup_path="$1" - + echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path" - + # Restore input files if ! podman exec -u root omnia_core bash -c " set -e @@ -1966,19 +1994,19 @@ restore_from_backup() { echo "[ERROR] [ROLLBACK] Failed to restore input files" return 1 fi - + # Restore metadata if ! podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then echo "[ERROR] [ROLLBACK] Failed to restore metadata" return 1 fi - + # Restore container config on host if ! podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then echo "[ERROR] [ROLLBACK] Failed to restore container config" return 1 fi - + echo "[INFO] [ROLLBACK] Files restored successfully" return 0 } @@ -2006,8 +2034,8 @@ display_cleanup_instructions() { echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}" echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}" echo -e "${YELLOW}3. 
Reload systemd daemon: systemctl daemon-reload${NC}" - echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}" - echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}" + echo -e "${YELLOW}4. Stop all containers: podman stop $(podman ps -aq)${NC}" + echo -e "${YELLOW}5. Remove all containers: podman rm -f $(podman ps -aq)${NC}" echo -e "${YELLOW}6. Clean shared path: rm -rf ${NC}" echo -e "${YELLOW}7. Install required version: ./omnia.sh --install${NC}" echo "" @@ -2015,7 +2043,6 @@ display_cleanup_instructions() { echo "" } -# Main rollback function rollback_omnia_core() { echo -e "${GREEN}================================================================================${NC}" echo -e "${GREEN} OMNIA CORE ROLLBACK${NC}" @@ -2287,7 +2314,14 @@ rollback_omnia_core() { # Clean up lock file before starting long-running ssh session rm -f "$lock_file" >/dev/null 2>&1 || true echo "[INFO] Rollback lock file removed before starting container session" - + + # Clear upgrade guard lock if it exists (shared path visible to container and host) + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [ROLLBACK] Cleared upgrade guard lock: $upgrade_guard_lock_path" + # Initialize SSH config and start container session init_ssh_config start_container_session @@ -2325,4 +2359,4 @@ main() { } # Call the main function -main "$1" +main "$1" \ No newline at end of file diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index 50c48fd3e5..f5ea607994 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -13,6 +13,9 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml index 2cc6dfed26..444869291b 100644 --- a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -29,6 +29,7 @@ - name: Pause for user to review warnings ansible.builtin.pause: + seconds: 30 prompt: | ╔════════════════════════════════════════════╗ ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ @@ -42,6 +43,7 @@ Please review these warnings carefully. Press ENTER to continue or CTRL+C to abort. + Continuing automatically in 30 seconds... when: - upgrade_warnings is defined - upgrade_warnings | length > 0 diff --git a/upgrade/upgrade_omnia.yml b/upgrade/upgrade_omnia.yml index 61050ec244..ade6b1f173 100644 --- a/upgrade/upgrade_omnia.yml +++ b/upgrade/upgrade_omnia.yml @@ -18,3 +18,13 @@ - name: Upgrade cluster tasks ansible.builtin.import_playbook: upgrade_cluster.yml + +- name: Clear upgrade guard lock + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Remove upgrade guard lock + ansible.builtin.file: + path: /opt/omnia/.data/upgrade_in_progress.lock + state: absent diff --git a/utils/credential_utility/get_config_credentials.yml b/utils/credential_utility/get_config_credentials.yml index 0e4c323b94..b77ba14b9b 100644 --- a/utils/credential_utility/get_config_credentials.yml +++ b/utils/credential_utility/get_config_credentials.yml @@ -13,6 +13,10 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../include_input_dir.yml diff --git a/utils/oim_cleanup.yml b/utils/oim_cleanup.yml index edb9cfb207..4d959d5ea4 100644 --- a/utils/oim_cleanup.yml +++ b/utils/oim_cleanup.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: include_input_dir.yml diff --git a/utils/upgrade_checkup.yml b/utils/upgrade_checkup.yml new file mode 100644 index 0000000000..5fb8582000 --- /dev/null +++ b/utils/upgrade_checkup.yml @@ -0,0 +1,33 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: "Guard: block if upgrade is in progress" + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Check upgrade lock file + ansible.builtin.stat: + path: /opt/omnia/.data/upgrade_in_progress.lock + register: upgrade_lock + + - name: Block playbook while upgrade is in progress + ansible.builtin.fail: + msg: >- + Upgrade is not completed fully. 
+ Please run upgrade_omnia.yml to complete upgrade before running any other playbook using the below command: + "ansible-playbook /omnia/upgrade/upgrade_omnia.yml" + If you don't require input files to be migrated, reconfigure the default input files, remove the lock file using the following command + "rm /opt/omnia/.data/upgrade_in_progress.lock" and then proceed. + when: upgrade_lock.stat.exists From d11fde8e868837f3c5403bd3b55f36b72ee60ae5 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Fri, 20 Feb 2026 18:14:34 +0530 Subject: [PATCH 172/172] Slurm delete node - drain node before delete - skip_merge new option (#3986) * Node drain logic for deletion * Shell instead of command for piping * lint fixes * Updated permission for slurmdbd Added new force_conf option for allowing confs pass through validation * removede new file * Renamed force_conf to skip_merge --- .../input_validation/schema/omnia_config.json | 4 + .../validation_flows/common_validation.py | 9 +- .../slurm_config/tasks/build_slurm_conf.yml | 5 + .../slurm_config/tasks/check_ctld_running.yml | 12 +- discovery/roles/slurm_config/tasks/confs.yml | 14 ++- .../slurm_config/tasks/create_slurm_dir.yml | 1 + .../tasks/drain_and_remove_node.yml | 109 ++++++++++++++++++ .../roles/slurm_config/tasks/remove_node.yml | 2 +- discovery/roles/slurm_config/vars/main.yml | 6 +- input/omnia_config.yml | 10 ++ 10 files changed, 161 insertions(+), 11 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/drain_and_remove_node.yml diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index f53485770f..ca7266124c 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -19,6 +19,10 @@ "minLength": 1, "description": "Name of the nfs storage in 
storage_config.yml" }, + "skip_merge": { + "type": "boolean", + "description": "Variable indicates whether a specific configuration file path under config_sources should be used as-is without merging" + }, "config_sources": { "type": "object", "description": "Config can be a file path or inline mapping", diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index f577a4e9b8..36f55130d4 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1074,9 +1074,12 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") - for cfg_path_dict in cnfg_src: + cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + skip_merge_list = [clst.get('skip_merge', False) for clst in data.get('slurm_cluster')] + for idx, cfg_path_dict in enumerate(cnfg_src): + skip_merge = skip_merge_list[idx] for k,v in cfg_path_dict.items(): conf_dict = None if isinstance(v, str): @@ -1086,7 +1089,7 @@ def validate_omnia_config( f"provided conf path for {k} - {v} does not exist")) continue else: # path exists - if not skip_conf_validation: + if not skip_merge and not skip_conf_validation: conf_dict, duplicate_keys = parse_slurm_conf(v, k, False) if duplicate_keys: errors.append( diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index 9d5d0f0944..40b6137172 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ 
-12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Read NodeName parameters from iDRAC + ansible.builtin.include_tasks: read_node_idrac.yml + when: cmpt_list + loop: "{{ cmpt_list }}" + - name: Append node_params list into NodeName list ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 7d908169ab..ce27d3c362 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -22,6 +22,16 @@ register: ssh_check ignore_errors: true +- name: Drain and remove nodes if any + ansible.builtin.include_tasks: drain_and_remove_node.yml + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - ssh_check is success + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + - name: Enter slurm controller when pingable when: - ssh_check is success @@ -37,7 +47,7 @@ register: service_facts ignore_unreachable: true - - name: Fail if slurmctld is unreachable + - name: Check slurmctld is reachable ansible.builtin.fail: msg: "Failed to connect to {{ ctld }}." 
when: service_facts is unreachable diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index c5f7953b0d..1e5a4e507e 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -17,13 +17,16 @@ apply_config: "{{ __default_config }}" no_log: true -- name: Read NodeName parameters - ansible.builtin.include_tasks: read_node_idrac.yml - when: cmpt_list - loop: "{{ cmpt_list }}" +- name: Remove keys from conf_files if they have string values in configs_input (when skip_merge is true) + ansible.builtin.set_fact: + conf_files: "{{ conf_files | difference(configs_input | dict2items | selectattr('value', 'string') | map(attribute='key') | list) }}" + when: + - skip_merge | default(false) + - configs_input is defined - name: Build slurm.conf ansible.builtin.include_tasks: build_slurm_conf.yml + when: "'slurm' in conf_files" - name: Slurm dbd opts ansible.builtin.set_fact: @@ -167,12 +170,13 @@ - name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS /etc/default/slurmd ansible.builtin.set_fact: conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}" + when: slurm_conf_dict is defined - name: Write merged .conf ansible.builtin.copy: content: "{{ item.ini_lines | join('\n') }}\n" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - mode: "0640" + mode: "{{ slurm_dbd_mode if item.item.key == 'slurmdbd' else slurm_mode }}" owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index e4ac760d77..b68bcbbded 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -60,6 +60,7 @@ 
ansible.builtin.set_fact: cluster_name: "{{ slurm_cluster[0].cluster_name }}" configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}" + skip_merge: "{{ slurm_cluster[0].skip_merge | default(false) }}" # false when the first slurm_cluster entry does not set skip_merge slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" controller_trackfile_path: "{{ share_path }}/ctld_track" diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml new file mode 100644 index 0000000000..da1c41d3fe --- /dev/null +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -0,0 +1,109 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- +- name: Check if node exists in Slurm cluster + ansible.builtin.command: scontrol show node {{ node_to_remove }} + register: node_exists_check + failed_when: false + ignore_unreachable: true + changed_when: false + delegate_to: "{{ ctld }}" + +- name: Skip if node does not exist + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} not found in cluster, skipping removal" + when: + - node_exists_check is reachable + - node_exists_check.rc != 0 + +- name: Process node removal + when: + - node_exists_check is reachable + - node_exists_check.rc == 0 + ignore_unreachable: true + block: + - name: Get current job count on node + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l + register: current_jobs + changed_when: false + delegate_to: "{{ ctld }}" + + - name: Display job information + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)" + + - name: Drain the node to prevent new job assignments + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DRAIN + Reason="Scheduled removal - waiting for jobs to complete" + changed_when: true + delegate_to: "{{ ctld }}" + + - name: Wait for all jobs to complete on the node + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l + register: job_count_check + until: job_count_check.stdout | int == 0 + retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" # number of polls = node_drain_timeout / node_drain_delay + delay: "{{ node_drain_delay }}" + changed_when: false + delegate_to: "{{ ctld }}" + when: current_jobs.stdout | int > 0 + + - name: Confirm jobs completed + ansible.builtin.debug: + msg: "All jobs on {{ node_to_remove }} have completed" + when: current_jobs.stdout | int > 0 + + - name: Log node removal + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} has been drained and its jobs have completed; it will now be set to DOWN state" + + rescue: + - name: Log node removal failure + ansible.builtin.debug:
+ msg: "Failed to drain node {{ node_to_remove }}" + + - name: Remove slurm node with running job after timeout + ansible.builtin.pause: + prompt: | + Node {{ node_to_remove }} has been DRAINED to prevent new job assignments. + Jobs are still running on {{ node_to_remove }} after wait of {{ node_drain_timeout }} seconds. + Options: + 1. Press Ctrl+C then 'A' to abort + 2. Press Enter to force removal (jobs will be killed) + when: not (force_scancel_node | bool) # coerce first: a value passed as -e key=value arrives as a string, and any non-empty string is truthy + + - name: Force cancel jobs if timeout reached + ansible.builtin.command: scancel -f -w {{ node_to_remove }} + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + + always: + - name: Set node to DOWN state + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DOWN + Reason="Node removed from cluster" + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + when: node_exists_check.rc == 0 diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index 4dc0217559..ba93bb086a 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -30,7 +30,7 @@ - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': cmpt_list | join(',')}) if item.PartitionName == slurm_partition_name else item] }}" + + [item | combine({'Nodes': (cmpt_list | join(',')) if cmpt_list | length > 0 else 'ALL'}) if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: - "'slurm' in conf_merge_dict" diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 1593f791cb..d708eb0777 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -68,6 +68,7 @@ gpu_slurm_conf: SlurmdParameters:
l3cache_as_socket innodb_buffer_pool_size: 4G innodb_lock_wait_timeout: 900 +conf_server: "--conf-server {{ ctld_list | join(',') }}" # TODO tmp nodes_yaml: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" bmc_username: "{{ hostvars['localhost']['bmc_username'] }}" @@ -117,12 +118,15 @@ munge_dir_mode: "0700" common_mode: "0755" slurm_dbd_mode: "0600" slurm_db_cnf_mode: "0600" +node_drain_timeout: 900 # seconds to wait for jobs on a drained node before the forced-removal path runs +node_drain_delay: 30 # delay between squeue polls while waiting for those jobs +force_scancel_node: false # true skips the confirmation prompt shown before jobs are force-cancelled dbd_slurm_conf: AccountingStoragePort: "{{ slurm_dbd_port }}" AccountingStorageType: accounting_storage/slurmdbd partition_params: PartitionName: "{{ slurm_partition_name }}" - Nodes: "{{ cmpt_list | join(',') }}" + Nodes: "{{ cmpt_list | join(',') if cmpt_list else 'ALL' }}" MaxTime: "INFINITE" State: "UP" Default: "YES" diff --git a/input/omnia_config.yml b/input/omnia_config.yml index bb5a4f06fa..943d70e530 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -27,6 +27,15 @@ # Storage name corresponding to the NFS share to be used by slurm cluster # This should match with exactly with a entry in storage_config.yml +# skip_merge +# Boolean set per cluster entry. Indicates whether configuration files +# given as a path under config_sources are used as-is, without merging. +# If skip_merge is set to true, each configuration source given as a path +# is applied directly, +# without merging with defaults or existing configurations. +# Accepted values: true, false +# Default value: false + # config_sources # defines how the Slurm configuration files are provided to the cluster. # : @@ -50,6 +59,7 @@ slurm_cluster: - cluster_name: slurm_cluster nfs_storage_name: nfs_slurm + # skip_merge: true # config_sources: # slurm: # SlurmctldTimeout: 60